# Mašinsko učenje - projekat

Dataset: https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
 
Klasifikacija - Predviđanje ponovne hospitalizacije pacijenta sa dijabetesom na osnovu podataka sa inicijalne hospitalizacije

1.   Priprema podataka

In [None]:
import pandas as pd
import sklearn as scikit
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, LabelEncoder
import copy
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import VarianceThreshold
from math import ceil
from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_classif


In [None]:
path = "../dataset/diabetic_data.csv"
# keep_default_na=False: pandas' built-in NA list contains the string "None"
# in recent versions, but this notebook treats 'None' as a real category of
# max_glu_serum / A1Cresult (see the scale mappers further down). Only the
# markers listed here (plus empty fields) are read as missing.
dataframe = pd.read_csv(
    path,
    low_memory=False,
    keep_default_na=False,
    na_values=["", "?", "Unknown/Invalid"],
)
pd.set_option("display.max_columns", None)
dataframe.head()


In [None]:
# Dimensions (rows, columns) of the loaded frame.
dataframe.shape

In [None]:
# Number of rows per value of the target column "readmitted".
dataframe["readmitted"].value_counts()

In [None]:
# Column dtypes and non-null counts.
dataframe.info()

In [None]:
def get_null_col(df, treshold):
    """Return the positions of columns whose missing-value ratio exceeds a limit.

    Args:
        df: DataFrame to inspect.
        treshold: Fraction in [0, 1]; a column is reported when
            (missing cells / total rows) is strictly greater than this value.

    Returns:
        List of integer column positions (usable with ``df.columns[pos]``),
        in column order. Empty when ``df`` has no rows.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # No rows means no ratio can be computed; report nothing.
        return []
    # to_numpy() makes the iteration purely positional, so the column labels
    # are never used as integer keys.
    missing_ratio = (df.isna().sum() / total_rows).to_numpy()
    return [pos for pos, ratio in enumerate(missing_ratio) if ratio > treshold]

# Print the names of the columns in which more than 10% of the values are missing.
for i in get_null_col(dataframe, 0.1):
    print(dataframe.columns[i])

# Disabled: would drop those columns. Note that `to_drop` is not defined at this point.
#dataframe = dataframe.drop(dataframe.columns[to_drop], axis=1)


In [None]:
# Missing specialty / payer values become their own "Unknown" category.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form updates a temporary object and
# leaves `dataframe` untouched when pandas Copy-on-Write is active.
dataframe["medical_specialty"] = dataframe["medical_specialty"].fillna("Unknown")
dataframe["payer_code"] = dataframe["payer_code"].fillna("Unknown")
dataframe["weight"].value_counts()

In [None]:
# Ordinal code for each weight bucket, in ascending order.
weigth_scale_mapper = {
    "[0-25)": 0,
    "[25-50)": 1,
    "[50-75)": 2,
    "[75-100)": 3,
    "[100-125)": 4,
    "[125-150)": 5,
    "[150-175)": 6,
    "[175-200)": 7,
    ">200": 8,
}
# Assign back rather than using replace/fillna(inplace=True) on the selected
# column; the chained in-place form does not reach `dataframe` under pandas
# Copy-on-Write. The float cast gives a numeric column that can hold NaN
# until the gaps are filled on the next line.
dataframe["weight"] = dataframe["weight"].replace(weigth_scale_mapper).astype("float64")
# Missing weights take the rounded mean bucket code.
dataframe["weight"] = dataframe["weight"].fillna(round(dataframe["weight"].mean()))


In [None]:
print(dataframe.shape)
# Remove every row that still has a missing value in any column.
dataframe.dropna(inplace=True)
# encounter_id is dropped by name; `columns=` takes the label list directly
# instead of relying on iterating a sub-DataFrame to obtain the labels.
to_drop = ["encounter_id"]
dataframe.drop(columns=to_drop, inplace=True)
dataframe.shape

In [None]:
def get_one_value_cols(df):
    """Print and return the names of columns that hold exactly one distinct value.

    Args:
        df: DataFrame to inspect.

    Returns:
        List of column names, in column order, whose ``unique()`` result
        has length 1.
    """
    constant_cols = [name for name in df.columns if len(df[name].unique()) == 1]
    for name in constant_cols:
        print(name)
    return constant_cols

# Drop, in place, the columns reported as holding a single distinct value.
dataframe.drop(get_one_value_cols(dataframe),axis=1,inplace=True)

In [None]:
# Compare the row count with and without exact duplicate rows.
# drop_duplicates() returns a new frame here, so `dataframe` is not modified.
print("Number Of Rows In The Original DataFrame:", len(dataframe))
print("Number Of Rows After Deduping:", len(dataframe.drop_duplicates()))

In [None]:
# Column dtypes and non-null counts after the cleaning steps above.
dataframe.info()

Diagnosis mapping (ICD-9 code ranges): http://icd9cm.chrisendres.com/index.php?action=contents

In [None]:
# Inclusive upper bound of each numeric code range, in ascending order,
# paired with the diagnosis group that the range maps to.
_DIAG_CLASS_UPPER_BOUNDS = (
    (139, 'Infectious and Parasitic'),
    (239, 'Neoplasms'),
    (279, 'Metabolic'),
    (289, 'Blood'),
    (319, 'Mental'),
    (389, 'Nervous System'),
    (459, 'Circulatory System'),
    (519, 'Respiratory System'),
    (579, 'Digestive System'),
    (629, 'Genitourinary System'),
    (679, 'Pregnancy'),
    (709, 'Skin'),
    (739, 'Musculoskeletal'),
    (759, 'Congenital'),
    (779, 'Perinatal Period'),
    (799, 'Ill Defined'),
)


def diag_mapper(value: str) -> str:
    """Map a diagnosis code to the name of its diagnosis group.

    Numeric codes (e.g. ``"250.83"``) are truncated to their integer part and
    matched against the range table; anything above the last bound is
    ``'Injury'``. Codes that cannot be parsed as a number are treated as
    supplementary codes: a leading ``'V'`` gives ``'Status'``, every other
    prefix gives ``'Cause'``.

    Args:
        value: Diagnosis code. Missing values are not handled here and must
            be removed by the caller beforehand.

    Returns:
        The diagnosis group name.
    """
    try:
        int_value = int(float(value))
    except ValueError:
        # Non-numeric code: only the first character decides the group.
        if value[0] == 'V':
            return 'Status'
        return 'Cause'  # E codes (and any other non-numeric prefix)

    for upper_bound, label in _DIAG_CLASS_UPPER_BOUNDS:
        if int_value <= upper_bound:
            return label
    return 'Injury'


# Derive a coarse diagnosis-group column next to each raw diagnosis code column.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    dataframe[diag_col + '_class'] = dataframe[diag_col].apply(diag_mapper)


In [None]:
# Ordinal codes for the max_glu_serum categories.
glu_scale_mapper = {
    'None': 0,
    'Norm': 1,
    '>200': 2,
    '>300': 3
}
# Ordinal codes for the A1Cresult categories.
a1_scale_mapper = {
    'None': 0,
    'Norm': 1,
    '>7': 2,
    '>8': 3,
}

# Shared encoder instances; every fit_transform call below refits them.
one_hot = LabelBinarizer()
lab_enc = LabelEncoder()

# The id columns are codes, not quantities, so mark them as categorical.
dataframe['admission_type_id'] = dataframe['admission_type_id'].astype('category')
dataframe['discharge_disposition_id'] = dataframe['discharge_disposition_id'].astype('category')
dataframe['admission_source_id'] = dataframe['admission_source_id'].astype('category')
dataframe['weight'] = dataframe['weight'].astype('int32')

dataframe['age'] = lab_enc.fit_transform(dataframe['age'])
dataframe['gender'] = one_hot.fit_transform(dataframe['gender'])
# Assign back instead of replace(inplace=True) on the selected column: the
# chained in-place form does not update `dataframe` under pandas
# Copy-on-Write. The int cast fails loudly if a category is missing from the
# mapper instead of leaving strings in the column.
dataframe['max_glu_serum'] = dataframe['max_glu_serum'].replace(glu_scale_mapper).astype('int64')
dataframe['A1Cresult'] = dataframe['A1Cresult'].replace(a1_scale_mapper).astype('int64')
dataframe['medical_specialty'] = lab_enc.fit_transform(dataframe['medical_specialty'])

# Columns that are integer-coded with LabelEncoder.
lab_coded = ['metformin','repaglinide','nateglinide','chlorpropamide','glimepiride','acetohexamide',
    'glipizide','glyburide','tolbutamide','pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone',
    'tolazamide','insulin','glyburide-metformin','glipizide-metformin','glimepiride-pioglitazone','metformin-pioglitazone'
    ,'diag_1','diag_2','diag_3','payer_code']

for col in lab_coded:
    dataframe[col] = lab_enc.fit_transform(dataframe[col])

dataframe['change'] = one_hot.fit_transform(dataframe['change'])
dataframe['diabetesMed'] = one_hot.fit_transform(dataframe['diabetesMed'])
dataframe['readmitted'] = lab_enc.fit_transform(dataframe['readmitted'])

dataframe.head()

In [None]:
# Three independent working copies, one per encoding strategy. The label
# variant is copied too (not aliased), so encoding it in place later cannot
# alter `dataframe`, and re-running this cell always starts from the same data.
dataframe_label = dataframe.copy(deep=True)
dataframe_onehot = dataframe.copy(deep=True)
dataframe_onehot_plus = dataframe.copy(deep=True)
dataframe.info()

In [None]:

def df_one_hot(df, columns):
    """Append one 0/1 indicator column per distinct value of the given columns.

    New columns are named ``"<column>(<value>)"``. The source columns are kept,
    so the caller decides whether to drop them. Every distinct value gets its
    own indicator, including for columns with only two values, so each
    indicator always matches the value in its name.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Names of the columns to expand. May be empty.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the original columns
        followed by the indicator columns, in the order of ``columns``.
    """
    base = df.reset_index(drop=True)
    encoded = []
    for col in columns:
        dummies = pd.get_dummies(base[col]).astype('int64')
        dummies.columns = [col + '(' + str(value) + ')' for value in dummies.columns]
        encoded.append(dummies)
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat([base, *encoded], axis=1)

# Label variant: integer-code the diagnosis groups and race in place.
for col in ['diag_1_class', 'diag_2_class', 'diag_3_class','race']:
    dataframe_label[col] = lab_enc.fit_transform(dataframe_label[col])

# "One-hot plus" variant: additionally expands the three id columns, then
# removes the source columns that were expanded.
dataframe_onehot_plus = df_one_hot(dataframe_onehot_plus, ['diag_1_class', 'diag_2_class', 'diag_3_class',
                              'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'race'])
dataframe_onehot_plus.drop(['diag_1_class', 'diag_2_class', 'diag_3_class', 'admission_type_id',
                      'discharge_disposition_id', 'admission_source_id', 'race'], axis=1, inplace=True)

# One-hot variant: expands only the diagnosis groups and race.
dataframe_onehot = df_one_hot(dataframe_onehot, ['diag_1_class', 'diag_2_class', 'diag_3_class','race'])
dataframe_onehot.drop(['diag_1_class', 'diag_2_class', 'diag_3_class','race'], axis=1, inplace=True)


# Compare the resulting shapes of the three variants.
print(dataframe_label.shape)
print(dataframe_onehot_plus.shape)
print(dataframe_onehot.shape)

In [None]:
def remove_low_variance_columns(df, target, treshold):
    """List the columns that pass a variance threshold, plus the target.

    Fits a ``VarianceThreshold`` selector on every column except ``target``
    and prints the shape of the reduced feature matrix.

    Args:
        df: DataFrame containing the features and the target column.
        target: Name of the target column; always appended to the result.
        treshold: Value passed as ``threshold`` to ``VarianceThreshold``.

    Returns:
        Names of the retained feature columns followed by ``target``.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    selector = VarianceThreshold(threshold=treshold)
    reduced = selector.fit_transform(features, labels)

    kept_positions = selector.get_support(indices=True)
    kept = features.columns[kept_positions].tolist()
    print(reduced.shape)
    return kept + [target]


# selected_columns = remove_low_variance_columns(
#     dataframe_onehot, "readmitted", 0.005)

# to_drop = []
# print("deleted columns:")

# for col in dataframe_onehot.columns:
#     if col not in selected_columns:
#         print(col)
#         to_drop.append(col)

# # dataframe_onehot.drop(dataframe_onehot[to_drop], inplace=True, axis=1)
# print(dataframe_onehot.shape)


In [None]:
def select_best_columns(df, target, best_ratio):
    """List the top-scoring feature columns by mutual information, plus the target.

    The raw ``diag_1``/``diag_2``/``diag_3`` columns are excluded from the
    candidates. The shape of the reduced feature matrix is printed.

    Args:
        df: DataFrame containing the features and the target column.
        target: Name of the target column; always appended to the result.
        best_ratio: Fraction of candidate columns to keep; the count is
            rounded up.

    Returns:
        Names of the selected feature columns followed by ``target``.
    """
    labels = df[target]
    features = df.drop(columns=[target, 'diag_1', 'diag_2', 'diag_3'])

    n_keep = ceil(features.shape[1] * best_ratio)
    selector = SelectKBest(mutual_info_classif, k=n_keep)
    reduced = selector.fit_transform(features, labels)

    kept = features.columns[selector.get_support(indices=True)].tolist()

    print(reduced.shape)

    return kept + [target]


# selected = select_best_columns(dataframe_onehot, "readmitted", 0.8)

# to_drop = []
# print("deleted columns:")

# for col in dataframe_onehot.columns:
#     if col not in selected:
#         print(col)
#         to_drop.append(col)

# #dataframe_onehot.drop(dataframe_onehot[to_drop], inplace=True, axis=1)
# print(dataframe_onehot.shape)


2.   Deskriptivna analiza

In [None]:
# Take bar positions and bar heights from the same value_counts() result.
# unique() lists classes in order of first appearance while value_counts()
# sorts by frequency, so combining the two pairs counts with the wrong class.
y = dataframe_onehot['readmitted'].value_counts()
x = y.index.to_numpy()

plt.bar(x, y)
plt.show()

In [None]:
def _plot_importances(values, labels, title):
    """Draw one horizontal bar chart of importances, smallest first."""
    order = np.argsort(values)
    positions = range(len(order))
    plt.figure(figsize=(20, 30))
    plt.barh(positions, values[order], align='center')
    plt.yticks(positions, np.array(labels)[order])
    plt.title(title)


def feature_importance(df):
    """Plot impurity-based and permutation feature importances for 'readmitted'.

    Splits ``df`` 80/20, leave-one-out encodes the features, fits a gradient
    boosting regressor on the training part and draws two bar charts: the
    model's own ``feature_importances_`` and the permutation importances
    measured on the test part. The test RMSE is printed.

    Args:
        df: DataFrame containing the 'readmitted' target and the
            'diag_1', 'diag_2', 'diag_3' columns, which are excluded from
            the features.
    """
    target = 'readmitted'
    y = df[target]
    X = df.drop(columns=[target, 'diag_1', 'diag_2', 'diag_3'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state=1066)

    # Fit the encoder on the training part only, then apply it to the test part.
    encoder = ce.LeaveOneOutEncoder(return_df=True)
    X_train_loo = encoder.fit_transform(X_train, y_train)
    X_test_loo = encoder.transform(X_test)

    model = GradientBoostingRegressor(
        learning_rate=0.05, max_depth=5, n_estimators=500, min_samples_split=5, n_iter_no_change=10)
    model.fit(X_train_loo, y_train)

    # Report the held-out error instead of computing and discarding it.
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test_loo)))
    print(f"RMSE on held-out split: {rmse:.4f}")

    _plot_importances(model.feature_importances_, X_test.columns,
                      'Feature Importance')

    perm_importance = permutation_importance(
        model, X_test_loo, y_test, n_repeats=10, random_state=1066)
    _plot_importances(perm_importance.importances_mean, X_test.columns,
                      'Permutation Importance')
    

# feature_importance(dataframe_onehot)


3. Algoritmi za klasifikaciju

k-Nearest Neighbors.
Decision Trees.
Naive Bayes.
Random Forest.
Gradient Boosting.

In [None]:
# Summary statistics of the one-hot encoded variant.
dataframe_onehot.describe()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


def gridSearch(df, target):
    """Grid-search logistic regression and random forest on an 80/20 split.

    Runs a 5-fold cross-validated search on the training part, refits the
    configuration with the best accuracy, and prints the best estimator, its
    parameters, its cross-validated accuracy and its accuracy on the
    held-out part.

    Args:
        df: DataFrame with numeric features and the target column.
        target: Name of the target column.
    """
    np.random.seed(0)

    # The "classifier" step is a placeholder; each grid below swaps in its
    # own estimator.
    pipe = Pipeline([("classifier", RandomForestClassifier())])

    search_space = [
        {"classifier": [LogisticRegression()],
         "classifier__penalty": ['l2'],
         "classifier__C": np.logspace(0, 4, 10)},
        {"classifier": [RandomForestClassifier()],
         "classifier__n_estimators": [10, 100],
         "classifier__max_features": [1, 2, 3]},
    ]

    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=[target]).values,
        df[target].values,
        test_size=0.2,
        random_state=42,
    )

    # 'precision' and 'average_precision' are only defined for binary
    # targets; with more classes they fail on every fit, so use a
    # macro-averaged precision instead.
    if np.unique(y_train).size == 2:
        scoring = ['accuracy', 'precision', 'average_precision']
    else:
        scoring = ['accuracy', 'precision_macro']

    gridsearch = GridSearchCV(estimator=pipe,
                              param_grid=search_space,
                              scoring=scoring,
                              refit='accuracy',
                              cv=5,
                              verbose=True,
                              )

    gridsearch.fit(X_train, y_train)

    print(gridsearch.best_estimator_)

    print(gridsearch.best_params_)

    print(gridsearch.best_score_)

    # Use the held-out split: score() applies the refit metric (accuracy).
    print("held-out accuracy:", gridsearch.score(X_test, y_test))


# Run the model search on the label-encoded variant.
gridSearch(dataframe_label, 'readmitted')


     

