# MLP for S2 S3

# Load Packages<a class="anchor" id="Packages"></a>

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.metrics import RocCurveDisplay, auc, roc_auc_score, make_scorer, f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, accuracy_score, confusion_matrix, classification_report, roc_curve
from sklearn.metrics import silhouette_score
from keras.optimizers import Adam
import random
import csv
import statistics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.utils.class_weight import compute_class_weight
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.metrics import average_precision_score

In [15]:
from joblib import load

default_mlp_S2 = load('best_model/default_mlp_S2.joblib')
default_mlp_S3 = load('best_model/default_mlp_S3.joblib')

# Load Dataset<a class="anchor" id="Dataset"></a>

In [3]:
feature_extraction_51x51 = pd.read_csv('data/feature_extraction_51x51.csv')
riskforecasting = pd.read_csv('data/pollution_risk_forecasting.csv')

In [4]:
df_merged_51x51 = pd.merge(feature_extraction_51x51, riskforecasting[['site', 'time', 'riskLevelLabel']], on=['site', 'time'])
df_merged_neg_51x51 = df_merged_51x51.fillna(-10)
# scaler = MinMaxScaler()
# df_merged_zero_51x51.iloc[:, 2:-1] = scaler.fit_transform(df_merged_zero_51x51.iloc[:, 2:-1])
df_merged_neg_51x51

Unnamed: 0,time,site,TUR_1x1_median,SPM_1x1_median,CHL_1x1_median,TUR_1x1_mean,SPM_1x1_mean,CHL_1x1_mean,TUR_1x1_q1,SPM_1x1_q1,...,TUR_51x51_mean,SPM_51x51_mean,CHL_51x51_mean,TUR_51x51_q1,SPM_51x51_q1,CHL_51x51_q1,TUR_51x51_q3,SPM_51x51_q3,CHL_51x51_q3,riskLevelLabel
0,2020-05-14,ukc1101-06200,7.433017,4.459626,7.110167,7.959242,4.807047,7.302562,6.817358,4.047618,...,6.212902,3.942381,6.912345,2.592772,1.515497,5.901697,5.036339,2.958766,7.507099,0
1,2020-05-14,ukc1202-06300,22.309444,15.199169,7.722776,22.309444,15.199169,7.722776,19.907279,13.256525,...,7.022960,4.777973,5.869293,2.176280,1.269443,5.268040,3.551266,2.082532,6.076066,0
2,2020-05-14,ukc1202-06400,8.402541,5.029291,6.511792,8.402541,5.029291,6.511792,6.691505,3.974811,...,6.199667,4.191560,5.637279,2.131472,1.243585,5.077878,3.201907,1.870371,5.858303,0
3,2020-05-14,ukc1202-06650,24.125945,15.510758,7.050237,24.125945,15.510758,7.050237,19.264122,12.080823,...,7.952834,5.725047,5.262985,1.882222,1.099571,4.324785,3.389963,1.979074,5.887981,0
4,2020-05-14,ukc1202-06700,49.590233,33.248340,9.137156,49.590233,33.248340,9.137156,49.590233,33.248340,...,6.807269,4.931638,5.341158,1.895784,1.104605,4.586321,2.983651,1.748468,5.583160,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52220,2022-09-08,ukk4306-23000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,1.046009,0.609080,0.645500,1.045279,0.608172,0.601767,1.046739,0.609988,0.689232,0
52221,2022-09-18,ukk4200-23800,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,1.880243,1.103013,1.773031,1.880243,1.103013,1.773031,1.880243,1.103013,1.773031,0
52222,2022-09-23,ukk4200-23100,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,1.688591,0.988166,1.340343,1.688591,0.988166,1.340343,1.688591,0.988166,1.340343,0
52223,2022-09-28,ukj4208-13300,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,59.388277,50.467205,6.610489,56.783900,49.832815,5.619397,61.992653,51.101595,7.601581,0


In [5]:
df_merged_neg_15x15 = pd.read_csv('data/df_merged_neg_15x15_final.csv')

In [6]:
df_merged_neg_5x5 = df_merged_neg_15x15.iloc[:, :75]
df_merged_neg_5x5

Unnamed: 0,site,time,riskLevelLabel,BBP_1x1_median,CDM_1x1_median,SPM_1x1_median,KD490_1x1_median,ZSD_1x1_median,CHL_1x1_median,BBP_1x1_mean,...,SPM_5x5_q1,KD490_5x5_q1,ZSD_5x5_q1,CHL_5x5_q1,BBP_5x5_q3,CDM_5x5_q3,SPM_5x5_q3,KD490_5x5_q3,ZSD_5x5_q3,CHL_5x5_q3
0,ukc1101-06000,2020-05-14,0,0.014399,0.648437,9.016005,0.720255,1.359252,26.936116,0.014399,...,5.846387,0.720255,1.359252,26.936116,0.014399,0.648437,8.424028,0.720255,1.359252,26.936116
1,ukc1101-06000,2020-05-15,0,-10.000000,-10.000000,8.102808,0.393628,1.778462,10.588528,-10.000000,...,8.102808,0.393628,1.778462,10.588528,-10.000000,-10.000000,8.102808,0.393628,1.778462,10.588528
2,ukc1101-06000,2020-05-16,0,-10.000000,-10.000000,1.135922,0.263305,2.588783,5.632908,-10.000000,...,1.135922,0.263305,2.588783,5.632908,-10.000000,-10.000000,1.135922,0.263305,2.588783,5.632908
3,ukc1101-06000,2020-05-17,0,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000
4,ukc1101-06000,2020-05-18,0,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187106,ukk4307-33900,2022-09-26,0,-10.000000,-10.000000,7.356489,0.183889,3.869243,3.158081,-10.000000,...,7.356489,0.183889,3.869243,3.158081,-10.000000,-10.000000,7.356489,0.183889,3.869243,3.158081
187107,ukk4307-33900,2022-09-27,0,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,...,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000,-10.000000
187108,ukk4307-33900,2022-09-28,0,-10.000000,-10.000000,4.906127,0.142344,5.219365,2.063839,-10.000000,...,4.906127,0.142344,5.219365,2.063839,-10.000000,-10.000000,4.906127,0.142344,5.219365,2.063839
187109,ukk4307-33900,2022-09-29,0,-10.000000,-10.000000,9.181858,0.151466,4.857575,2.291730,-10.000000,...,9.181858,0.151466,4.857575,2.291730,-10.000000,-10.000000,9.181858,0.151466,4.857575,2.291730


## Train Test Validation Split<a class="anchor" id="Train_Test_Validation_Split"></a>

In [7]:
time_site_pairs_S2S3_train = pd.read_csv('data/time_site_pairs_S2S3_train.csv')[['time', 'site']]
time_site_pairs_S2S3_train

Unnamed: 0,time,site
0,2020-07-20,ukj4210-12300
1,2022-08-29,ukk2206-20300
2,2020-07-12,ukh3311-11350
3,2020-05-14,ukk3101-26800
4,2021-09-08,ukh1407-10800
...,...,...
90736,2022-10-16,ukk3106-32330
90737,2022-01-21,ukj3400-17500
90738,2022-07-03,uke1200-08800
90739,2022-03-17,ukk2101-19160


In [8]:
time_site_pairs_S2S3_test = pd.read_csv('data/time_site_pairs_S2S3_test.csv')[['time', 'site']]
time_site_pairs_S2S3_test

Unnamed: 0,time,site
0,2022-10-01,ukk4200-23400
1,2020-05-27,ukk3105-31400
2,2022-10-10,ukh1305-10000
3,2020-01-18,ukk3105-31350
4,2020-10-16,ukc1101-06000
...,...,...
22680,2021-05-19,ukk3106-28000
22681,2020-04-16,ukk4306-22400
22682,2021-03-15,ukc2101-04000
22683,2021-07-13,ukk3101-27000


# Functions<a class="anchor" id="Functions"></a>

In [9]:
def save_result(model,X_test,y_test):
    y_pred = model.predict(X_test)
    ACC = accuracy_score(y_test, y_pred)
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    AUC_ROC = auc(fpr, tpr)
    PRE = precision_score(y_test, y_pred)
    REC = recall_score(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    AP = average_precision_score(y_test, y_pred)
    #AUC_PRC = auc(recall, precision)
    F1 = f1_score(y_test, y_pred)
    return ACC, AUC_ROC, PRE, REC, AP, F1

# Default MLP

In [93]:
def default_mlp(X_train, y_train):
    ros = RandomOverSampler()
    X_train_oversampled, y_train_oversampled = ros.fit_resample(X_train, y_train)
    mlp = MLPClassifier(random_state=100)
    mlp.fit(X_train_oversampled, y_train_oversampled)
    return mlp

# S2

In [10]:
# Train test split
df_train = df_merged_neg_51x51.merge(time_site_pairs_S2S3_train, on=['time', 'site'], how='inner')
df_test = df_merged_neg_51x51.merge(time_site_pairs_S2S3_test, on=['time', 'site'], how='inner')

df_train.drop(['time', 'site'], axis=1, inplace=True)
df_test.drop(['time', 'site'], axis=1, inplace=True)

y_train = df_train.pop('riskLevelLabel')
y_test_S2 = df_test.pop('riskLevelLabel')

X_train = df_train
X_test_S2 = df_test

print('Training X Shape:', X_train.shape)
print('Training y Shape:', y_train.shape)
print('Testing X Shape:', X_test_S2.shape)
print('Testing y Shape:', y_test_S2.shape)

Training X Shape: (41668, 312)
Training y Shape: (41668,)
Testing X Shape: (10557, 312)
Testing y Shape: (10557,)


In [11]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test_S2 = scaler.transform(X_test_S2)

In [97]:
default_mlp_S2 = default_mlp(X_train, y_train)



In [98]:
from joblib import dump

dump(default_mlp_S2, 'best_model/default_mlp_S2.joblib')

['best_model/default_mlp_S2.joblib']

# S3

In [12]:
# Train test split
df_train = df_merged_neg_5x5.merge(time_site_pairs_S2S3_train, on=['time', 'site'], how='inner')
df_test = df_merged_neg_5x5.merge(time_site_pairs_S2S3_test, on=['time', 'site'], how='inner')

df_train.drop(['time', 'site'], axis=1, inplace=True)
df_test.drop(['time', 'site'], axis=1, inplace=True)

y_train = df_train.pop('riskLevelLabel')
y_test_S3 = df_test.pop('riskLevelLabel')

X_train = df_train
X_test_S3 = df_test

print('Training X Shape:', X_train.shape)
print('Training y Shape:', y_train.shape)
print('Testing X Shape:', X_test_S3.shape)
print('Testing y Shape:', y_test_S3.shape)

Training X Shape: (41668, 72)
Training y Shape: (41668,)
Testing X Shape: (10557, 72)
Testing y Shape: (10557,)


In [13]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test_S3 = scaler.transform(X_test_S3)

In [101]:
default_mlp_S3 = default_mlp(X_train, y_train)

In [102]:
from joblib import dump

dump(default_mlp_S3, 'best_model/default_mlp_S3.joblib')

['best_model/default_mlp_S3.joblib']

In [16]:
results_df_mlp = pd.DataFrame([['MLP', 'S2', *save_result(default_mlp_S2,X_test_S2,y_test_S2)], 
                                ['MLP', 'S3', *save_result(default_mlp_S3,X_test_S3,y_test_S3)]],
                             columns=['Model', 'Sub-Model', 'Accuracy', 'AUC_ROC','Precision','Recall', 'AP', 'f1 score'])
results_df_mlp

Unnamed: 0,Model,Sub-Model,Accuracy,AUC_ROC,Precision,Recall,AP,f1 score
0,MLP,S2,0.764706,0.687671,0.071821,0.60596,0.054793,0.128421
1,MLP,S3,0.609548,0.654297,0.048809,0.701695,0.042585,0.09127


In [17]:
results_df_mlp.to_csv('results/MLP_S2S3.csv')