In [56]:
import os
import sys
import numpy as np
import pandas as pd

# Get the images

### Modify the path_dir below

In [57]:
path_dir = os.path.dirname(os.path.abspath(''))

In [58]:

sys.path.append(os.path.join(path_dir, 'scripts\\common'))
import tool


path_images    = os.path.join(path_dir, 'data\\images')

dataset = pd.read_csv(os.path.join(path_dir, 'data\\sheet\\data_final_lite.csv'))
dataset = dataset.loc[:, ['ID', 'CHA2DS2-VASc', 'Age', 'Sex']]

In [59]:
SIZE = 224

image_size  = (SIZE, SIZE, 3)

## Extract images

In [60]:
import cv2 as cv
from tensorflow.keras.preprocessing.image import load_img, img_to_array

error_images       = []
patient_id_list    = []
dir_dirs = []
def find_images(name) :
    data        = []
    

    for dir_client in os.listdir(path_images) :
        path_client = os.path.join(path_images, dir_client)
        for dirs in os.listdir(path_client) :
            if dirs.lower().find(name) != -1 :
                image = cv.imread(os.path.join(path_client, dirs))
                image = cv.resize(image, (SIZE, SIZE), interpolation = cv.INTER_LINEAR)
                                
                if(image[image.astype(bool)].size != 0) :
                    try :
                        age     = round(dataset['Age'][dataset['ID'] == dir_client].values[0])
                        sexe    = int(dataset['Sex'][dataset['ID'] == dir_client].values[0])
                        target  = dataset['CHA2DS2-VASc'][dataset['ID'] == dir_client].values[0]
                        
                        patient_id_list.append(dir_client)
                        data.append([image, age, sexe, target])
                        if dir_dirs not in error_images :
                            dir_dirs.append(dirs)
                    except :
                        if dir_client not in error_images :
                            error_images.append(dir_client)
                        continue
    return data

## Prepare data

In [61]:
columns = ['image', 'age', 'sex', 'target']

cc_images     = pd.DataFrame(find_images('cc'), columns=columns)
sup_images    = pd.DataFrame(find_images('sup'), columns=columns)
deep_images   = pd.DataFrame(find_images('deep'), columns=columns)


cc_y    = cc_images.iloc[:, -1]
cc_X    = cc_images.iloc[:, :-1]

sup_y   = sup_images.iloc[:, -1]
sup_X   = sup_images.iloc[:, :-1]

deep_y  = deep_images.iloc[:, -1]
deep_X  = deep_images.iloc[:, :-1]

In [62]:
age         = cc_images.iloc[:, -3]
sex         = cc_images.iloc[:, -2]

In [63]:
def form_col(data, add_col=True) :
    columns = []
    for col in data.columns.to_list() :
        columns.append(str(col))
       
    # if add_col : 
    #     new_columns = columns[:-2]
    #     new_columns.append('age')
    #     new_columns.append('sex')
    #     columns = new_columns

    data.columns = columns
    return data

# Feature Extraction + Classification per depth

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.applications import EfficientNetB0


In [65]:
def find_num_col(X, binary_columns=[], categorical_columns=[]) :
    
    numeric_columns = []
    for elem in X.columns.to_list():
        if elem not in binary_columns and elem not in categorical_columns :
            numeric_columns.append(elem)
    return numeric_columns

In [66]:
def make_preprocessor(data, binary_columns=[]):
    
    numeric_columns     = find_num_col(data, binary_columns=binary_columns)
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        # ('scaler', StandardScaler()),
        ('pca', PCA(n_components=tool.find_nb_pca(data, numeric_columns))),    
    ])

    preprocessor        = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_columns),
        ])
    return preprocessor

In [67]:
from skimage import color
from skimage.exposure import adjust_gamma
from skimage.util import random_noise

  
def form_image(image) :
    # image = image / 255.
    image = np.expand_dims(image, axis=0)
    return image

def predict_image(image, model) :    
    form_data = []
    for elem in model.predict(image)[0] :
        form_data.append(elem)
    return form_data
  
def image_augmentation(images, model) :

    data = []
    for i in range(len(images)) :
        
        data.append(predict_image(form_image(images[i]), model))

        # data.append(predict_image(form_image(adjust_gamma(images[i], gamma=0.5,gain=1)), model), age, sex)
        # data.append(predict_image(form_image(adjust_gamma(images[i], gamma=2,gain=1)), model), age, sex)
        
        data.append(predict_image(form_image(np.fliplr(images[i])), model))
        # data.append(predict_image(form_image(np.flipud(images[i])), model))
        
        # data.append(predict_image(form_image(random_noise(images[i])), model), age, sex)

    return data


from tensorflow.keras.models import Model
from tensorflow.keras.applications import EfficientNetB0



def preprocess_image(X, y, image_augment=False, extraction_model=None) :
    
    if extraction_model == None :
        efficient_model     = EfficientNetB0(weights=None)
        extraction_model    = Model(inputs=efficient_model.inputs, outputs=efficient_model.layers[-2].output)
    
    if image_augment :      
        ages                = X['age'].to_list()
        gender              = X['sex'].to_list()
        
        len_before          = X.shape[0]
        X                   = pd.DataFrame(image_augmentation(X['image'], extraction_model))        
        len_after           = X.shape[0]    
        
        image_augment       = int(len_after / len_before)
        
        X['age']            = [ages[i] for i in range(len_before) for j in range(image_augment)]
        X['sex']            = [gender[i] for i in range(len_before) for j in range(image_augment)]
        y                   = [y[i] for i in range(len_before) for j in range(image_augment)]
    
    else :
        tmp_X = []
        for i in range(X.shape[0]) :
            tmp_X.append(predict_image(form_image(X['image'][i]), extraction_model))
            
        tmp_X               = pd.DataFrame(tmp_X)
        tmp_X['age']        = X['age']
        tmp_X['sex']        = X['sex']
        X                   = tmp_X
    
    X     = form_col(X, False)
    
    preprocessor = make_preprocessor(X)
    
    X = preprocessor.fit_transform(X)
    return X, y, image_augment, preprocessor

In [68]:
def find_num_col(X, binary_columns=[], categorical_columns=[]) :
    
    numeric_columns = []
    for elem in X.columns.to_list():
        if elem not in binary_columns and elem not in categorical_columns :
            numeric_columns.append(elem)
    return numeric_columns

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


forest_param = {
        'bootstrap'         : [False, True],
        'ccp_alpha'         : [.001, 0.1, 1],
        'n_estimators'      : [25, 50, 100],
        'criterion'         : ['gini', 'entropy', 'log_loss']
        }

LR_param = {
    'C'         : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty'   : ['l1', 'l2'],
#     'max_iter'  : list(range(100,800,100)),
    'solver'    : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

svc_param = {
    'C'         :[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'gamma'     :[1,0.1,0.001,0.0001],
    'kernel'    :['linear', 'poly', 'rbf', 'sigmoid']
}

In [70]:
cc_X = pd.DataFrame()

cc_X['age'] = age
cc_X['sex'] = sex

In [71]:
index_train, index_test                 = tool.dataset_split_index(cc_X, cc_y, fold=5)

cc_X, cc_y, cc_image_augment, cc_preprocessor               = preprocess_image(cc_X, cc_y)
sup_X, sup_y, sup_image_augment, sup_preprocessor           = preprocess_image(sup_X, sup_y)
deep_X, deep_y, deep_image_augment, deep_preprocessor       = preprocess_image(deep_X, deep_y)

KeyError: 'image'

In [None]:
# import pickle
# pickle.dump(deep_preprocessor, open('C:\\Users\\nperc\\Documents\\Datathon\\clean_deep_model\\data\\models\\deep_preprocess.h5', 'wb'))

In [None]:
model = RandomForestClassifier(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cc_X, cc_y, test_size=0.2, random_state=42)

grid = GridSearchCV(model, forest_param)

model = grid.fit(X_train, cc_y)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_pred = grid.predict(X_test)

print('Accuracy : ' + str(accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.7724137931034483
[[83  5  0]
 [ 9 17  8]
 [ 0 11 12]]


In [None]:
cc_report, cc_index_fold         = tool.find_best_grid(
        model, cc_X, cc_y, index_train, index_test,
        param=forest_param)

sup_report, sup_index_fold       = tool.find_best_grid(
        model, sup_X, sup_y, index_train, index_test,
        param=forest_param)

deep_report, deep_index_fold     = tool.find_best_grid(
        model, deep_X, deep_y, index_train, index_test,
        param=forest_param)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

def train_pred(model, report, X, y, index_fold) :
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model       = model(**report['best_grid'], random_state=42)
    z
    model.fit(X_train, y_train)
    y_pred      = model.predict(X_test)
    
    print('Mean accuracy : ' + str(report['best_mean_score']))
    print('Standard deviation : ' + str(report['best_st_score']))
    print('Accuracy : ' + str(accuracy_score(y_test, y_pred)))
    print(confusion_matrix(y_test, y_pred))
    
    return y_pred, model

In [None]:
# cc_pred, cc_model = train_pred(RandomForestClassifier, cc_report, cc_X, cc_y, cc_index_fold)

In [None]:
sup_pred, sup_model = train_pred(RandomForestClassifier, sup_report, sup_X, sup_y, sup_index_fold)

TypeError: sklearn.ensemble._forest.ExtraTreesClassifier() argument after ** must be a mapping, not str

In [None]:
deep_pred, deep_model = train_pred(ExtraTreesClassifier, deep_report, deep_X, deep_y, deep_index_fold)

Mean accuracy : 0.7759961685823755
Standard deviation : 0.03792559657602957
Accuracy : 0.7931034482758621
[[82  6  0]
 [ 8 25  1]
 [ 0 15  8]]


# Classification with all depth + New Values

In [None]:
all_X = []

for i in range(len(cc_y_pred)) :
    all_X.append([cc_y_pred[i], sup_y_pred[i], deep_y_pred[i]])

all_X = pd.DataFrame(all_X, columns=['cc_pred','sup_pred', 'deep_pred'])
all_y = cc_y

NameError: name 'cc_y_pred' is not defined

In [None]:
from sklearn.preprocessing import OneHotEncoder

def all_make_preprocessor(numeric_columns, categorical_columns):

    # numeric_transformer     = Pipeline(steps=[
    #     ('imputer', SimpleImputer(strategy='median')),
    #     ('scaler', StandardScaler()),
    #     # ('pca', PCA(n_components=find_nb_pca(data, numeric_columns))),    
    # ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('encoder', OneHotEncoder())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            # ('numeric', numeric_transformer, numeric_columns),
            ('categorical', categorical_transformer, categorical_columns),            
        ])
    return preprocessor

In [None]:
numeric_columns     = ['Age']
binary_columns      = ['Sex']
categorical_columns = [col for col in all_X.columns if col not in numeric_columns + binary_columns]

In [None]:
# from sklearn.model_selection import train_test_split

# data_train, data_test, labels_train, labels_test, index_train, index_test = train_test_split(all_X, all_y, all_X.index, test_size=0.2, random_state=42)

In [None]:
all_preprocessor    = all_make_preprocessor(numeric_columns, categorical_columns)

all_X = all_preprocessor.fit_transform(all_X)

In [None]:
index_train, index_test                 = tool.dataset_split_index(cc_X, cc_y)

cc_X_train = cc_X.iloc[index_train[0], :]
cc_y_train = cc_y[index_train[0]]

cc_X_test = cc_X.iloc[index_test[0], :]
cc_y_test = cc_y[index_test[0]]

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size = 0.2, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def make_forest_model(X_train, y_train) :
    forest_model = RandomForestClassifier(random_state=42)

    forest_param = {
        'bootstrap'         : [True],
        'max_depth'         : [5, 8, 12, 15, 20],
        'ccp_alpha'         : [.001],
        'n_estimators'      : [25, 50, 100],
        'criterion'         : ['gini', 'entropy']
        }

    forest_grid = GridSearchCV(forest_model, forest_param)
    
    forest_grid.fit(X_train, y_train)
    print(forest_grid.best_params_)
    
    return forest_grid.best_estimator_

In [None]:
all_model   = make_forest_model(cc_X_train, cc_y_train)

{'bootstrap': True, 'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 8, 'n_estimators': 25}


In [None]:
# all_estimator, all_report         = tool.train_model(LogisticRegression(), all_X, all_y, index_train, index_test, param=LR_param)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report


all_pred = all_model.predict(cc_X_test)

print(classification_report(cc_y_test, all_pred))
print(confusion_matrix(cc_y_test, all_pred))

              precision    recall  f1-score   support

           0       0.61      1.00      0.76        88
           1       0.00      0.00      0.00        34
           2       0.00      0.00      0.00        23

    accuracy                           0.61       145
   macro avg       0.20      0.33      0.25       145
weighted avg       0.37      0.61      0.46       145

[[88  0  0]
 [34  0  0]
 [23  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
23/148

0.1554054054054054

In [None]:
# prep = cc_preprocess.transform(pd.DataFrame([[90, 1]], columns=['age', 'Sexe']))
# prep