In [4]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from liver_functions import import_data
from liver_functions import one_hot_encode
from liver_functions import split_data
from joblib import dump, load
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pandas as pd
import numpy as np

def build_pipeline(classifier, isScaled, selectFeatures):
    """Assemble a scikit-learn ``Pipeline`` around ``classifier``.

    The steps are always added in this order:

    * ``"imputer"`` -- a default ``SimpleImputer`` (always present),
    * ``"scale"``   -- a ``StandardScaler``, only when ``isScaled`` is truthy,
    * ``"rfe"``     -- ``RFE`` wrapped around ``classifier``, only when
      ``selectFeatures`` is truthy,
    * ``"clf"``     -- ``classifier`` itself (always present).

    Parameters
    ----------
    classifier : estimator
        Used both as the final ``"clf"`` step and, when feature selection is
        requested, as the estimator handed to ``RFE``.
    isScaled : bool
        Whether to insert the scaling step.
    selectFeatures : bool
        Whether to insert the recursive-feature-elimination step.

    Returns
    -------
    Pipeline
        The assembled, unfitted pipeline.
    """
    stages = [("imputer", SimpleImputer())]

    if isScaled:
        stages.append(("scale", StandardScaler()))

    if selectFeatures:
        # The very same estimator object is passed to RFE and reused as "clf".
        stages.append(("rfe", RFE(estimator=classifier)))

    stages.append(("clf", classifier))

    return Pipeline(steps=stages)
    
def run_gridsearch(pipe, param_grid, num_folds, metric, X_train, y_train,
                   model_path="test_model.pkl"):
    """Run a cross-validated grid search and persist the winning estimator.

    Parameters
    ----------
    pipe : estimator
        Estimator (typically a ``Pipeline``) handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Parameter grid forwarded unchanged to ``GridSearchCV``.
    num_folds : int or CV splitter
        Passed as ``cv`` to ``GridSearchCV``.
    metric : str or callable
        Passed as ``scoring`` to ``GridSearchCV``.
    X_train, y_train
        Training features and labels the search is fitted on.
    model_path : str or None, optional
        Where the best estimator is written with ``joblib.dump``. Defaults to
        ``"test_model.pkl"`` (the previously hard-coded location). Pass
        ``None`` to skip writing a file.

    Returns
    -------
    tuple
        ``(best_params_, best_score_, best_estimator_)`` of the fitted search.
    """
    search = GridSearchCV(pipe, param_grid, cv=num_folds, scoring=metric)
    search.fit(X_train, y_train)

    # Persisting is optional so that repeated searches do not have to
    # overwrite one shared file.
    if model_path is not None:
        dump(search.best_estimator_, model_path)

    return search.best_params_, search.best_score_, search.best_estimator_

"""
# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_.reshape(digits.images[0].shape)

"""


# Example
df = import_data("Indian Liver Patient Dataset (ILPD).csv")
df = one_hot_encode(df, "gender")

# Split BEFORE filling missing values: filling with whole-dataset column means
# first would let the held-out rows influence the values the model trains on.
# The pipeline's own SimpleImputer step handles missing values for modelling.
X_train, y_train, X_test, y_test = split_data(df, 0.2, "is_patient")

# Later cells plot straight from `df`, so keep a mean-filled frame under that
# name for exploration only.
df = df.apply(lambda x: x.fillna(x.mean()),axis=0)

# Alternative: linear SVC without feature selection.
#pipe = build_pipeline(SVC(kernel = "linear"), True, False)
pipe = build_pipeline(DecisionTreeClassifier(), True, True)
#param_grid = dict(clf__C=list(range(1,10)), clf__gamma=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5])
param_grid = dict(clf__max_depth=[None, 3, 6, 9], clf__max_features=["sqrt", None, "log2"],
                  rfe__n_features_to_select=list(range(1,12)))

params, score, model = run_gridsearch(pipe, param_grid, 10, "f1", X_train, y_train)

print(params)
print(score)
print(model)
# Print the feature columns only, so the names can be read against the RFE
# mask below; the target is not one of the features.
# NOTE(review): assumes split_data keeps df's column order -- confirm.
print(df.columns.drop("is_patient"))
print(model.named_steps["rfe"].n_features_)
print(model.named_steps["rfe"].support_)

{'clf__max_depth': 3, 'clf__max_features': 'log2', 'rfe__n_features_to_select': 5}
0.8254170161162067
Pipeline(memory=None,
         steps=[('imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('rfe',
                 RFE(estimator=DecisionTreeClassifier(class_weight=None,
                                                      criterion='gini',
                                                      max_depth=None,
                                                      max_features=None,
                                                      max_leaf_nodes=None,
                                                      min_imp...
                                                      splitter='best'),
                     n_feature



In [None]:
# List every scoring name accepted by `scoring=`.
# Newer scikit-learn releases expose these through get_scorer_names() and no
# longer ship the SCORERS dict; older releases only have the dict.
_list_scorer_names = getattr(sk.metrics, "get_scorer_names", None)
sorted(_list_scorer_names() if _list_scorer_names is not None else sk.metrics.SCORERS.keys())

In [None]:
import unittest
import numpy as np
import pandas as pd
import string
import random
import os

from liver_functions import import_data
from liver_functions import one_hot_encode
from liver_functions import split_data
from liver_functions import make_correlation_heatmap

class TestFunctions(unittest.TestCase):
    
    def make_char_file(self):
        n = 5
        data = np.random.choice(list(string.ascii_lowercase),
                                     size=(n,n), replace = False)
        np.savetxt("test_data.csv", data, fmt= "%c", delimiter=",")
        return data
    
    def make_num_df(self):
        n = 5
        data = np.random.choice(100, size=(n,n), replace = False)
        return pd.DataFrame(data)
    
    def make_gender_df(self):
        n = 50
        gender = [random.choice(["Male", "Female"]) for i in range(n)]
        df = pd.DataFrame(gender, columns=["gender"])
        return df
    
    def test_heatmap(self):
        df = self.make_num_df()
        make_correlation_heatmap(df, "Test", "test.png")
        self.assertEqual(True, os.path.exists("test.png"))
         
    def test_one_hot(self):
        df = self.make_gender_df()
        df_enc = df.copy()
        df_enc["Male"] = df_enc["gender"].apply(lambda x: 1 if x == "Male" else 0)
        df_enc["Female"] = df_enc["gender"].apply(lambda x: 1 if x == "Female" else 0)
        df_enc.drop(columns = ["gender"],  inplace = True)
        pd.testing.assert_frame_equal(df_enc, one_hot_encode(df, "gender"),
                                      check_like = True, 
                                      check_dtype = False)

    def test_data_import(self):
        data = self.make_char_file()
        df = pd.DataFrame(data, columns= [x for x in data[0,:]])
        df = df.reindex(df.index.drop(0)).reset_index(drop=True)
        df_import = import_data("test_data.csv")
        pd.testing.assert_frame_equal(df, df_import, check_less_precise = 0) 

    
    def test_split(self):
        frac = 0.2
        data = self.make_char_file()
        n = len(data)
        df = pd.DataFrame(data, columns = [str(x) for x in range(n)])
        X_train, y_train, X_test, y_test = split_data(df, frac, str(n-1))
        
        self.assertEqual(np.shape(X_train), (round((1-frac)*n), n-1))
        self.assertEqual(np.shape(X_test), (round((frac)*n), n-1))
        self.assertEqual(np.shape(y_train), (round((1-frac)*n), ))
        self.assertEqual(np.shape(y_test), (round((frac)*n), ))
                                   
# Run the test suite in-process when this cell is executed.
# The placeholder argv keeps unittest from parsing the kernel's own
# command-line arguments, and exit=False stops it from calling sys.exit().
if __name__ == '__main__':        
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

In [None]:
#df.loc[:, df["is_patient"] == 1]

var = "tot_proteins"
one = df.loc[df["is_patient"] == 1, [var]]
two = df.loc[df["is_patient"] == 2, [var]]

# One figure per feature, overlaying a shaded density curve for each class.
# The target column is skipped: inside a single-label subset it is constant,
# so there is no density to estimate. A separate loop variable is used so
# that `var` above is not overwritten.
for column in df.columns.drop("is_patient"):
    for label in df["is_patient"].unique():
        subset = df[df["is_patient"] == label]

        # Draw the density plot
        sns.distplot(subset[column], hist = False, kde = True,
                     kde_kws = {'shade': True,'linewidth': 3},
                     label = label)
    plt.legend(title="is_patient")
    plt.show()

In [None]:
#plt.scatter(df.index, df["tot_bilirubin"], c = df["is_patient"])
# Select the ages per class directly from `df`: the `one` / `two` frames built
# in the density-plot cell contain only the "tot_proteins" column, so
# indexing them with "age" raises a KeyError.
age_by_label = [df.loc[df["is_patient"] == label, "age"] for label in (1, 2)]
plt.boxplot(age_by_label)

In [None]:
import matplotlib.pyplot as plt

# sns.pairplot creates its own figure, so no plt.figure(...) call is made
# first -- it would only add an empty figure to the cell output without
# affecting the size of the grid.
sns.pairplot(df, kind="reg", hue="is_patient")
plt.show()