Task 3 - Condensed Code

Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,mutual_info_classif , f_classif
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Dataset Preparation

In [3]:
data = pd.read_csv('./data.csv', index_col=0)

# Category indexing: positional column ranges of each measurement group in `data`.
i_voltage = np.arange(0,9)
i_angles= np.arange(9,18)
i_pgen = np.arange(18,27)
i_qgen = np.arange(27,36)
i_pdem = np.arange(36,45)
i_qdem = np.arange(45,54)
i_pflow = np.arange(54, 135)
i_qflow = np.arange(135, 216)
i_risk = 216

# Upper edges of risk classes 0, 1 and 2; a risk >= the last edge is class 3.
risk_bin_edges = [0.1, 0.35, 0.7]


def risk_to_class(risk):
    """Bin continuous risk values into the four ordinal classes 0..3.

    Parameters
    ----------
    risk : array-like of float
        Raw risk values.

    Returns
    -------
    numpy.ndarray of int
        0 for risk < 0.1, 1 for 0.1 <= risk < 0.35, 2 for 0.35 <= risk < 0.7
        and 3 otherwise. The input is not modified.
    """
    return np.digitize(np.asarray(risk, dtype=float), risk_bin_edges)


# Features first, risk target last. Integer-array indexing (rather than a
# slice) keeps the IndexError if the CSV has fewer columns than expected.
selected_columns = np.concatenate([i_voltage, i_angles, i_pgen, i_qgen,
                                   i_pdem, i_qdem, i_pflow, i_qflow, [i_risk]])
data_selected = data.iloc[:, selected_columns]

# Drop columns that contain no non-zero value at all; rows are kept as they are.
data_raw = data_selected.loc[:, data_selected.any()]

#dataset1: only rows without any missing value
data_complete = data_raw.dropna(axis=0)
X1 = data_complete.iloc[:, :-1]
y1 = data_complete.iloc[:, -1]
le = preprocessing.LabelEncoder()
y_1 = le.fit_transform(risk_to_class(y1))
x_1 = X1.to_numpy()

#dataset2: all rows with a known target; missing feature values are imputed.
# Rows whose risk is NaN are removed first: a NaN fails every threshold
# comparison and would otherwise be labelled as class 3.
has_target = data_raw.iloc[:, -1].notna().to_numpy()
data_labelled = data_raw.loc[has_target]
X2raw = data_labelled.iloc[:, :-1]
y2 = data_labelled.iloc[:, -1]
y_2 = le.fit_transform(risk_to_class(y2))

#Train and Test Split
X_1train, X_1test, y_1train, y_1test = train_test_split(x_1, y_1, test_size=0.2, random_state=4720)

# Split the raw features BEFORE imputing and fit the imputer on the training
# rows only, so no information from the test rows leaks into the training data.
X_2train_raw, X_2test_raw, y_2train, y_2test = train_test_split(X2raw, y_2, test_size=0.2, random_state=4720)
imputer = IterativeImputer(random_state=4720)
X_2train = imputer.fit_transform(X_2train_raw)
X_2test = imputer.transform(X_2test_raw)

# Fully imputed feature matrix (using the train-fitted imputer), kept under the
# original names for any cell that still refers to them.
X2 = imputer.transform(X2raw)
x_2 = X2

Pipeline 1 - Decision Tree Classifier

In [None]:
# Pipeline 1: univariate feature selection followed by a single decision tree.
dtpipe = Pipeline([
    ('Kbest', SelectKBest(score_func=f_classif, k=25)),
    ('DT', DecisionTreeClassifier(max_depth=None,random_state=4720))
])

# Hyper-parameter space sampled by the randomized search (2 * 4 * 4 = 32 combinations).
dtparam={
    'Kbest__score_func':[mutual_info_classif,f_classif],
    'Kbest__k':[10,15,20,25],
    'DT__max_depth': [12,15,17,20]
}

# random_state fixes which candidates are drawn, so a re-run evaluates the
# same parameter combinations.
dtsearch = RandomizedSearchCV(dtpipe, dtparam, cv=5,scoring='f1_weighted',
                              random_state=4720)

# Tune on each dataset's training split and score the refitted best model on
# the matching held-out split. The search object is reused, so after the loop
# it (and bestmodel / y_predict) holds the dataset 2 results.
for ds_label, X_fit, y_fit, X_eval, y_eval in (
    ('1', X_1train, y_1train, X_1test, y_1test),
    ('2', X_2train, y_2train, X_2test, y_2test),
):
    dtsearch.fit(X_fit, y_fit)
    bestmodel = dtsearch.best_estimator_
    y_predict = bestmodel.predict(X_eval)
    print('Weighted F1 score for dataset %s is: %.5f'
          % (ds_label, f1_score(y_eval, y_predict, average='weighted')))
    # Printed explicitly: only the last expression of a cell is displayed.
    print('Best parameters for dataset %s: %s' % (ds_label, dtsearch.best_params_))

dtsearch.best_estimator_

Pipeline 2 - Extra-trees Classifier

In [None]:
# Pipeline 2: univariate feature selection followed by an extra-trees ensemble.
etpipe = Pipeline([
    ('Kbest', SelectKBest(score_func=f_classif, k=25)),
    ('ET', ExtraTreesClassifier(max_depth=None,random_state=4720))
])

# Hyper-parameter space sampled by the randomized search (2 * 4 * 4 = 32 combinations).
etparam={
    'Kbest__score_func':[mutual_info_classif,f_classif],
    'Kbest__k':[30,40,50,60],
    'ET__max_depth': [10,15,25,30]
}

# random_state fixes which candidates are drawn, so a re-run evaluates the
# same parameter combinations.
etsearch = RandomizedSearchCV(etpipe, etparam, cv=5,scoring='f1_weighted',
                              random_state=4720)

# Tune on each dataset's training split and score the refitted best model on
# the matching held-out split. The search object is reused, so after the loop
# it (and bestmodel / y_predict) holds the dataset 2 results.
for ds_label, X_fit, y_fit, X_eval, y_eval in (
    ('1', X_1train, y_1train, X_1test, y_1test),
    ('2', X_2train, y_2train, X_2test, y_2test),
):
    etsearch.fit(X_fit, y_fit)
    bestmodel = etsearch.best_estimator_
    y_predict = bestmodel.predict(X_eval)
    print('Weighted F1 score for dataset %s is: %.5f'
          % (ds_label, f1_score(y_eval, y_predict, average='weighted')))
    # Printed explicitly: only the last expression of a cell is displayed.
    print('Best parameters for dataset %s: %s' % (ds_label, etsearch.best_params_))

etsearch.best_estimator_

Pipeline 3 - SVM linearSVC

In [None]:
# Pipeline 3: min-max scaling, PCA projection, then a linear SVM (one-vs-rest).
# `whiten` must be a real boolean: the strings 'True' and 'False' are both
# truthy, so passing them never actually disabled whitening.
spipe=Pipeline([
    ('Scaling', MinMaxScaler()),
    ('Dimensionality',PCA(n_components=30,whiten=True)),
    ('linearSVC',LinearSVC(C=1,multi_class='ovr',random_state=4720))
])

# Hyper-parameter space sampled by the randomized search (2 * 2 * 3 = 12 combinations).
sparam={
    'Dimensionality__n_components':[50,60],
    'Dimensionality__whiten':[True,False],
    'linearSVC__C': [0.1,1,10],
}

# random_state fixes which candidates are drawn, so a re-run evaluates the
# same parameter combinations.
svsearch = RandomizedSearchCV(spipe, sparam, cv=5,scoring='f1_weighted',
                              random_state=4720)

# Tune on each dataset's training split and score the refitted best model on
# the matching held-out split. The search object is reused, so after the loop
# it (and bestmodel / y_predict) holds the dataset 2 results.
for ds_label, X_fit, y_fit, X_eval, y_eval in (
    ('1', X_1train, y_1train, X_1test, y_1test),
    ('2', X_2train, y_2train, X_2test, y_2test),
):
    svsearch.fit(X_fit, y_fit)
    bestmodel = svsearch.best_estimator_
    y_predict = bestmodel.predict(X_eval)
    print('Weighted F1 score for dataset %s is: %.5f'
          % (ds_label, f1_score(y_eval, y_predict, average='weighted')))
    # Printed explicitly: only the last expression of a cell is displayed.
    print('Best parameters for dataset %s: %s' % (ds_label, svsearch.best_params_))

svsearch.best_estimator_

Pipeline 4 - SVM SVC-RBF

In [None]:
# Pipeline 4: standardisation, univariate feature selection, then an RBF-kernel SVM.
spipe2=Pipeline([
    ('Scaling', StandardScaler()),
    ('Kbest', SelectKBest(score_func=mutual_info_classif, k=35)),
    ('RBFSVC',SVC(C=1,kernel='rbf',random_state=4720, gamma='auto'))
])

# Hyper-parameter space sampled by the randomized search (2 * 3 * 4 * 2 = 48 combinations).
sparam2={
    'Kbest__score_func':[mutual_info_classif,f_classif],
    'Kbest__k':[25,35,40],
    'RBFSVC__C': [1,10,100,1000],
    'RBFSVC__gamma': ['auto','scale']
}

# random_state fixes which candidates are drawn, so a re-run evaluates the
# same parameter combinations.
svsearch2 = RandomizedSearchCV(spipe2, sparam2, cv=5,scoring='f1_weighted',
                               random_state=4720)

# Tune on each dataset's training split and score the refitted best model on
# the matching held-out split. The search object is reused, so after the loop
# it (and bestmodel / y_predict) holds the dataset 2 results.
for ds_label, X_fit, y_fit, X_eval, y_eval in (
    ('1', X_1train, y_1train, X_1test, y_1test),
    ('2', X_2train, y_2train, X_2test, y_2test),
):
    svsearch2.fit(X_fit, y_fit)
    bestmodel = svsearch2.best_estimator_
    y_predict = bestmodel.predict(X_eval)
    print('Weighted F1 score for dataset %s is: %.5f'
          % (ds_label, f1_score(y_eval, y_predict, average='weighted')))
    # Printed explicitly: only the last expression of a cell is displayed.
    print('Best parameters for dataset %s: %s' % (ds_label, svsearch2.best_params_))

svsearch2.best_estimator_


Pipeline 5 - K-nearest Neighbors

In [None]:
# Pipeline 5: feature scaling, univariate feature selection, then k-nearest neighbours.
kpipe=Pipeline([
    ('Scaling', StandardScaler()),
    ('Kbest', SelectKBest(score_func=mutual_info_classif, k=25)),
    ('KNN',KNeighborsClassifier(n_neighbors=10, leaf_size=20, weights='distance'))
])

# Hyper-parameter space sampled by the randomized search. The 'Scaling' entry
# swaps the whole scaling step rather than tuning one of its parameters.
# NOTE(review): scikit-learn documents leaf_size as affecting tree build/query
# speed and memory only - confirm it is worth spending search budget on.
kparam={
    'Scaling':[StandardScaler(),MinMaxScaler()],
    'Kbest__score_func':[mutual_info_classif,f_classif],
    'Kbest__k':[10,15,20,25,30,35],
    'KNN__n_neighbors': [5,10,15],
    'KNN__leaf_size':[5,8,10,15],
    'KNN__weights':['distance','uniform']
}

# random_state fixes which candidates are drawn, so a re-run evaluates the
# same parameter combinations.
ksearch = RandomizedSearchCV(kpipe, kparam, cv=5,scoring='f1_weighted',
                             random_state=4720)

# Tune on each dataset's training split and score the refitted best model on
# the matching held-out split. The search object is reused, so after the loop
# it (and bestmodel / y_predict) holds the dataset 2 results.
for ds_label, X_fit, y_fit, X_eval, y_eval in (
    ('1', X_1train, y_1train, X_1test, y_1test),
    ('2', X_2train, y_2train, X_2test, y_2test),
):
    ksearch.fit(X_fit, y_fit)
    bestmodel = ksearch.best_estimator_
    y_predict = bestmodel.predict(X_eval)
    print('Weighted F1 score for dataset %s is: %.5f'
          % (ds_label, f1_score(y_eval, y_predict, average='weighted')))
    # Printed explicitly: only the last expression of a cell is displayed.
    print('Best parameters for dataset %s: %s' % (ds_label, ksearch.best_params_))

ksearch.best_estimator_

Pipeline 6 - Deep Learning Classifier