# Souleiman

In [46]:
import pandas as pd
import numpy as np
from scipy.stats import randint

# Normalize or scale the data
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

In [7]:
ROOT_PATH = 'data/'


def _load_split(filename):
    """Load one workbook under ROOT_PATH as a (features, target) pair.

    Both sheets are read in a single ``read_excel`` call so the workbook is
    opened and parsed only once (sheet 0 -> features, sheet 1 -> target).
    Trailing whitespace is stripped from the feature column names.
    """
    sheets = pd.read_excel(ROOT_PATH + filename, sheet_name=[0, 1])
    features, target = sheets[0], sheets[1]
    features.columns = features.columns.str.rstrip()
    return features, target


X_train, y_train = _load_split('train_data.xlsx')
X_test, y_test = _load_split('test_data_corrected.xlsx')

In [8]:
def add_metrics(col_name_regex, col_name, col_start, col_end, df):
    """Summarise a family of related columns into per-row statistics.

    Parameters
    ----------
    col_name_regex : str
        Regular expression handed to ``DataFrame.filter`` to select the
        columns that belong to the family.
    col_name : str
        Prefix used to name the generated columns.
    col_start, col_end : str
        Names of two of the selected columns; the "final diff" feature is
        ``col_end - col_start``.
    df : pandas.DataFrame
        Frame holding the source columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A frame indexed like ``df`` with the columns ``'<col_name> max'``,
        ``'<col_name> min'``, ``'<col_name> mean'``, ``'<col_name> std'`` and
        ``'<col_name> final diff'``, in that order.

    Raises
    ------
    KeyError
        If ``col_start`` or ``col_end`` is not among the selected columns.
    """
    selected = df.filter(regex=col_name_regex)

    # Use the vectorised reductions rather than applying Python's builtin
    # max/min row by row: the builtins give an order-dependent answer when a
    # row contains NaN, whereas these skip NaN just like mean/std below.
    return pd.DataFrame({
        f'{col_name} max': selected.max(axis=1),
        f'{col_name} min': selected.min(axis=1),
        f'{col_name} mean': selected.mean(axis=1),
        f'{col_name} std': selected.std(axis=1),
        f'{col_name} final diff': selected[col_end] - selected[col_start],
    })

In [9]:
# Each entry describes one family of per-day measurements:
#   (regex selecting the family's columns, prefix for the generated features,
#    first column, last column) -- the last two feed the "final diff" feature.
# The patterns are raw strings so that `\d` reaches the regex engine without
# relying on Python tolerating an invalid string escape.
columns_data = [
    (r'^dd[\d] Cell Density$', 'Cell Density', 'dd0 Cell Density', 'dd7 Cell Density'),
    (r'^dd[\d]-dd[\d] Cell Density Gradient$', 'Cell Density Gradient', 'dd0-dd1 Cell Density Gradient', 'dd5-dd7 Cell Density Gradient'),
    (r'^dd[\d] Aggregate Size$', 'Aggregate Size', 'dd0 Aggregate Size', 'dd7 Aggregate Size'),
    (r'^dd[\d]-dd[\d] Aggregate Size Gradient$', 'Aggregate Size Gradient', 'dd0-dd1 Aggregate Size Gradient', 'dd5-dd7 Aggregate Size Gradient'),
    (r'^Average DO concentration dd[\d]$', 'Average DO concentration', 'Average DO concentration dd0', 'Average DO concentration dd7'),
    (r'^Average DO concentration gradient dd[\d]$', 'Average DO concentration gradient', 'Average DO concentration gradient dd0', 'Average DO concentration gradient dd7'),
    (r'^DO concentration/cell count dd[\d]$', 'DO concentration/cell count', 'DO concentration/cell count dd0', 'DO concentration/cell count dd7'),
    (r'^DO gradient/cell count dd[\d]$', 'DO gradient/cell count', 'DO gradient/cell count dd0', 'DO gradient/cell count dd7'),
    (r'^dd[\d] Average of 2nd derivative DO$', 'Average of 2nd derivative DO', 'dd0 Average of 2nd derivative DO', 'dd7 Average of 2nd derivative DO'),
    (r'^dd[\d] DO 2nd derivative/cell count$', 'DO 2nd derivative/cell count', 'dd0 DO 2nd derivative/cell count', 'dd7 DO 2nd derivative/cell count'),
    (r'^dd[\d] Average pH$', 'Average pH', 'dd0 Average pH', 'dd7 Average pH'),
    (r'^dd[\d] Average pH Gradient$', 'Average pH Gradient', 'dd0 Average pH Gradient', 'dd7 Average pH Gradient'),
    (r'^dd[\d] Lactate Concentration$', 'Lactate Concentration', 'dd0 Lactate Concentration', 'dd7 Lactate Concentration'),
    (r'^dd[\d] Glucose Concentration$', 'Glucose Concentration', 'dd0 Glucose Concentration', 'dd7 Glucose Concentration'),
]

# Collect the per-family feature frames and concatenate once at the end,
# instead of re-copying a growing frame on every loop iteration.
train_parts = []
test_parts = []
for col_name_regex, col_name, col_start, col_end in columns_data:
    new_cols = add_metrics(col_name_regex, col_name, col_start, col_end, X_train)
    new_cols_test = add_metrics(col_name_regex, col_name, col_start, col_end, X_test)
    train_parts.append(new_cols)
    test_parts.append(new_cols_test)

enhanced_df = pd.concat(train_parts, axis=1)
enhanced_df_test = pd.concat(test_parts, axis=1)

# NOTE: X_train / X_test are rebound to the engineered features here, so the
# raw per-day columns are no longer reachable through these names; reload the
# data before re-running this cell.
X_train = enhanced_df.copy()
X_test = enhanced_df_test.copy()

In [10]:
# Learn the robust-scaling statistics from the training features only, then
# apply that same fitted transformation to both splits.
scaler = RobustScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [11]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.base import TransformerMixin
# from sklearn.decomposition import PCA
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.gaussian_process.kernels import RBF, WhiteKernel
# from sklearn.feature_selection import SelectKBest, f_regression


# y_train_temp = y_train.to_numpy().ravel() / 100
# y_test_temp = y_test.to_numpy().ravel() / 100

# class TreeTransformer(TransformerMixin):
#     def __init__(self, estimator):
#         self.estimator = estimator

#     def fit(self, X, y):
#         self.estimator.fit(X, y)
#         return self

#     def transform(self, X):
#         return X


# pca = PCA(n_components=0.95)
# X_train_pca = pca.fit_transform(X_train)
# X_test_pca = pca.transform(X_test)

# # Fit a MARS model and find the feature importances
# model = make_pipeline(TreeTransformer(DecisionTreeRegressor(max_depth=3)), LinearRegression())
# model.fit(X_train_pca, y_train_temp)

# importance_mars_pca = model.named_steps['treetransformer'].estimator.feature_importances_
# importance_mars = np.matmul(importance_mars_pca, np.abs(pca.components_)) # (NOT SURE)

# # fit a RandomForestRegressor and find the features importances
# model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# model_rf.fit(X_train, y_train_temp)
# importance_rf = model_rf.feature_importances_

# choosen_quantile = 0.6
# # SelectKBest feature selection
# select = SelectKBest(f_regression, k=int((1-choosen_quantile)*X_train.shape[1]))
# select.fit(X_train, y_train_temp)
# indices = select.get_support(indices=True)
# important_features_selectK = np.zeros(X_train.shape[1])
# np.put(important_features_selectK, indices, 1)
# important_features_selectK = important_features_selectK.astype(int)

# # decide on which features to use
# select_mars = np.where(importance_mars >= np.quantile(importance_mars, choosen_quantile), 1, 0)
# select_rf = np.where(importance_rf >= np.quantile(importance_rf, choosen_quantile), 1, 0)

# selected_features = select_mars | select_rf
# selected_features = selected_features | important_features_selectK
# selected_features = np.where(selected_features, True, False)

# X_train = X_train[:, selected_features]
# X_test = X_test[:, selected_features]


In [12]:
# Binarise the targets (True where the value is at least 90) and flatten
# each one to a 1-D array.
y_train = np.ravel(y_train.ge(90).to_numpy())
y_test = np.ravel(y_test.ge(90).to_numpy())


In [13]:
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

# SMOTETomek resampler (imported from imblearn.combine) with a fixed seed.
sme = SMOTETomek(random_state=42)

# Resample the training split only; the test split is left untouched.
# X_train / y_train are rebound to the resampled data, so re-running this
# cell resamples the already-resampled arrays.
X_train, y_train = sme.fit_resample(X_train, y_train)



In [21]:
import pickle as pkl
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb

# Baseline classifiers with hand-picked hyper-parameters.
RF_Classifier = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=6, n_estimators = 200)
XGB_Classifier = XGBClassifier(random_state=42,n_jobs=-1,max_depth=7,n_estimators = 200)
LGR_Classifier = LogisticRegression(n_jobs=-1, random_state=42)
KNN_Classifier =  KNeighborsClassifier()

baseline_models = {
    "RF_Classifier": RF_Classifier,
    "XGB_Classifier": XGB_Classifier,
    "LGR_Classifier": LGR_Classifier,
    "KNN_Classifier": KNN_Classifier,
}

# Fit every baseline exactly once and report its accuracy on the test split
# (`score` on a classifier is mean accuracy).
for model_name, model in baseline_models.items():
    print(model_name)
    print(model.fit(X_train, y_train).score(X_test, y_test))

# Test-set predictions of the fitted random forest.
y_pred = RF_Classifier.predict(X_test)


RF_Classifier
0.8333333333333334
XGB_Classifier
0.7222222222222222
LGR_Classifier
0.4444444444444444
KNN_Classifier
0.2222222222222222


In [72]:
import pickle as pkl
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import StackingClassifier

# Base learners of the stacked ensemble; they are left unfitted here and are
# only fitted as part of the search below.
LGR_Classifier = LogisticRegression(random_state=42, n_jobs=-1)
XGB_Classifier = XGBClassifier(n_estimators=200, max_depth=7, random_state=42, n_jobs=-1)
RF_Classifier = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42, n_jobs=-1)
KNN_Classifier = KNeighborsClassifier()

# The step names double as prefixes for the hyper-parameter keys below.
estimators = list(zip(
    ('logistic', 'xgb', 'rf', 'knn'),
    (LGR_Classifier, XGB_Classifier, RF_Classifier, KNN_Classifier),
))

# Stack the base learners under a LightGBM meta-classifier.
clff = StackingClassifier(estimators=estimators, final_estimator=lgb.LGBMClassifier())

# Integer distributions to sample from, addressed as '<step name>__<parameter>'.
param_dist = dict(
    rf__max_depth=randint(1, 20),
    rf__n_estimators=randint(50, 500),
    xgb__max_depth=randint(1, 20),
    xgb__n_estimators=randint(50, 500),
    knn__n_neighbors=randint(1, 20),
)

# 1000 sampled configurations, each evaluated with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=clff,
    param_distributions=param_dist,
    n_iter=1000,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)




In [80]:
# Display the best hyper-parameter combination found by the randomized search.
random_search.best_params_

{'knn__n_neighbors': 9,
 'rf__max_depth': 2,
 'rf__n_estimators': 102,
 'xgb__max_depth': 12,
 'xgb__n_estimators': 404}

In [81]:
# Tuned hyper-parameters, hard-coded from the displayed result of the
# randomized search so the final model can be rebuilt without repeating it.
best_params = dict(
    knn__n_neighbors=9,
    rf__max_depth=2,
    rf__n_estimators=102,
    xgb__max_depth=12,
    xgb__n_estimators=404,
)

## Final model: stacking classifier with the tuned hyperparameters

In [81]:
# Rebuild the base learners, this time with the tuned hyper-parameters
# stored in `best_params`.
LGR_Classifier = LogisticRegression(random_state=42, n_jobs=-1)
XGB_Classifier = XGBClassifier(
    n_estimators=best_params['xgb__n_estimators'],
    max_depth=best_params['xgb__max_depth'],
    random_state=42,
    n_jobs=-1,
)
RF_Classifier = RandomForestClassifier(
    n_estimators=best_params['rf__n_estimators'],
    max_depth=best_params['rf__max_depth'],
    random_state=42,
    n_jobs=-1,
)
KNN_Classifier = KNeighborsClassifier(n_neighbors=best_params['knn__n_neighbors'])

estimators = list(zip(
    ('logistic', 'xgb', 'rf', 'knn'),
    (LGR_Classifier, XGB_Classifier, RF_Classifier, KNN_Classifier),
))

# Stack the tuned learners under a LightGBM meta-classifier, fit on the
# training split and predict the test split.
clff = StackingClassifier(estimators=estimators, final_estimator=lgb.LGBMClassifier())
clff.fit(X_train, y_train)
y_pred = clff.predict(X_test)


In [82]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef


# Encode the labels as 0/1 integers for the metric functions.
# y_test was binarised earlier (value >= 90) and y_pred comes from the
# classifier's `predict`, so both are expected to hold class labels already.
# An explicit cast replaces the former `>= 0.9` threshold, which yields the
# same 0/1 values on boolean labels but reads like a probability cut-off.
y_pred_class = np.asarray(y_pred).astype(int)
y_true_class = np.asarray(y_test).astype(int)

# results
# Accuracy
print('Accuracy : ', accuracy_score(y_true_class, y_pred_class))

# Precision
print('Precision : ', precision_score(y_true_class, y_pred_class))

# Recall
print('Recall : ', recall_score(y_true_class, y_pred_class))

# MCC
print('MCC : ', matthews_corrcoef(y_true_class, y_pred_class))


Accuracy :  0.8333333333333334
Precision :  0.75
Recall :  0.6
MCC :  0.5635445125120265
