In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler


In [3]:
data = pd.read_csv('dermatology_csv.csv')

# The first 34 columns are used as features, the last column as the target.
X = data.iloc[:, 0:34]
y = data.iloc[:, -1]

# Split BEFORE imputing: the imputation means must be learned from the
# training rows only, otherwise test-set statistics leak into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Keep `X` as a fully imputed array (as this cell did before) for any later
# cell that still reads it; it now uses the training-set means as well.
X = imputer.transform(X)

In [275]:
# Low-variance (quasi-constant) feature filter with threshold 0.01, fitted on
# the training split only and then applied to both splits.
constant_filter = VarianceThreshold(threshold=0.01).fit(X_train)
X_train_filter, X_test_filter = (
    constant_filter.transform(split) for split in (X_train, X_test)
)

In [249]:
# Shapes of both splits after the variance filter (rows, remaining features).
X_train_filter.shape, X_test_filter.shape

((256, 34), (110, 34))

In [250]:
# Transpose so that every feature becomes a row; the duplicate check further
# down works row-wise.
X_train_T, X_test_T = X_train_filter.transpose(), X_test_filter.transpose()

In [251]:
# Wrap the transposed arrays in DataFrames to get access to `.duplicated()`.
X_train_T, X_test_T = map(pd.DataFrame, (X_train_T, X_test_T))

In [252]:
# Number of rows of the transposed training frame (i.e. features) that are
# exact duplicates of an earlier row.
X_train_T.duplicated().sum()

0

In [253]:
# Boolean mask: True for every feature row that repeats an earlier one
# (the first occurrence is kept unmarked).
duplicated_features = X_train_T.duplicated(keep='first')

In [254]:
# Invert the duplicate mask to get the features to keep, as a plain list of bools.
features_to_keep = (~duplicated_features).tolist()

# Apply the training-derived mask to both splits and transpose back to
# (samples, features).
X_train_unique, X_test_unique = X_train_T[features_to_keep].T, X_test_T[features_to_keep].T

In [217]:
# Standardise the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_unique)
X_train_unique, X_test_unique = scaler.transform(X_train_unique), scaler.transform(X_test_unique)

In [255]:
# Re-wrap both splits as DataFrames so the `.corr()` / `.drop()` calls below work.
X_train_unique, X_test_unique = (
    pd.DataFrame(split) for split in (X_train_unique, X_test_unique)
)

X_train_unique.shape, X_test_unique.shape


((256, 34), (110, 34))

In [256]:
# Pairwise correlation matrix of the training features.
# NOTE(review): no later visible cell reads this variable; `get_correlation`
# computes its own matrix internally — confirm whether it is still needed.
corrmat = X_train_unique.corr()

In [257]:
def get_correlation(data, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    Parameters
    ----------
    data : DataFrame
        Frame whose pairwise column correlations (``data.corr()``) are inspected.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Labels of the flagged columns. The first column can never be flagged
        because it has no predecessor.
    """
    corr = data.corr()
    labels = corr.columns
    # Only the strict lower triangle is scanned: each pair is visited once and
    # the later column of a correlated pair is the one reported.
    return {
        labels[row]
        for row in range(len(labels))
        if any(abs(corr.iloc[row, col]) > threshold for col in range(row))
    }

# Columns whose absolute correlation with an earlier column exceeds 0.70.
corr_features = get_correlation(data=X_train_unique, threshold=0.70)
print('correlated features: ', len(corr_features))

correlated features:  11


In [258]:
# Remove the correlated columns (selected on the training split) from both splits.
X_train_uncorr, X_test_uncorr = (
    split.drop(columns=corr_features) for split in (X_train_unique, X_test_unique)
)

X_train_uncorr.shape, X_test_uncorr.shape

((256, 23), (110, 23))

In [261]:
from sklearn.decomposition import PCA

# Project the uncorrelated features onto 2 principal components; the
# projection is learned from the training split and applied to both splits.
pca = PCA(n_components=2, random_state=42).fit(X_train_uncorr)
X_train_pca, X_test_pca = pca.transform(X_train_uncorr), pca.transform(X_test_uncorr)

X_train_pca.shape, X_test_pca.shape

((256, 2), (110, 2))

In [264]:
def run_randomForest(X_train, X_test, y_train, y_test):
    """Train a random forest on the training split and print its test accuracy.

    The forest uses 100 trees, ``random_state=0`` and all available cores
    (``n_jobs=-1``). Nothing is returned; the accuracy is only printed.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1).fit(X_train, y_train)
    predictions = forest.predict(X_test)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, predictions))

In [265]:
%%time
# Random forest on the PCA-projected features (2 components when this cell is
# run right after the PCA cell that defines X_train_pca / X_test_pca).
run_randomForest(X_train_pca, X_test_pca, y_train, y_test)

Accuracy on test set: 
0.6363636363636364
Wall time: 897 ms


In [266]:
%%time
# Reference run on X_train / X_test as produced by the train/test split cell
# (imputed only — no filtering, scaling or PCA).
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy on test set: 
0.9545454545454546
Wall time: 295 ms


In [267]:
# Number of training rows and of features left after dropping correlated columns.
X_train_uncorr.shape

(256, 23)

In [269]:
# Sweep every possible number of principal components on the uncorrelated
# features and print the random-forest test accuracy for each.
# The upper bound comes from the data instead of a hard-coded 24, so the sweep
# stays complete if the correlation filter keeps a different number of columns.
# NOTE: `pca`, `X_train_pca` and `X_test_pca` are overwritten on every
# iteration; after the loop they hold the last (largest) projection.
for component in range(1, X_train_uncorr.shape[1] + 1):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_train_uncorr)
    X_train_pca = pca.transform(X_train_uncorr)
    X_test_pca = pca.transform(X_test_uncorr)
    print('Selected Components: ', component)
    run_randomForest(X_train_pca, X_test_pca, y_train, y_test)
    print()


Selected Components:  1
Accuracy on test set: 
0.39090909090909093

Selected Components:  2
Accuracy on test set: 
0.6363636363636364

Selected Components:  3
Accuracy on test set: 
0.7181818181818181

Selected Components:  4
Accuracy on test set: 
0.8090909090909091

Selected Components:  5
Accuracy on test set: 
0.8727272727272727

Selected Components:  6
Accuracy on test set: 
0.8909090909090909

Selected Components:  7
Accuracy on test set: 
0.8909090909090909

Selected Components:  8
Accuracy on test set: 
0.8909090909090909

Selected Components:  9
Accuracy on test set: 
0.8909090909090909

Selected Components:  10
Accuracy on test set: 
0.8818181818181818

Selected Components:  11
Accuracy on test set: 
0.8818181818181818

Selected Components:  12
Accuracy on test set: 
0.8909090909090909

Selected Components:  13
Accuracy on test set: 
0.8727272727272727

Selected Components:  14
Accuracy on test set: 
0.9090909090909091

Selected Components:  15
Accuracy on test set: 
0.9

Sel

In [271]:
# Sweep every possible number of principal components on the full scaled
# feature set (before the correlation filter) and print the random-forest
# test accuracy for each.
# The previous hard-coded `range(1,34)` stopped at 33 components and never
# evaluated the full-rank case; the bound now comes from the column count.
# NOTE: `pca`, `X_train_pca` and `X_test_pca` are overwritten on every
# iteration; after the loop they hold the last (largest) projection.
for component in range(1, X_train_unique.shape[1] + 1):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_train_unique)
    X_train_pca = pca.transform(X_train_unique)
    X_test_pca = pca.transform(X_test_unique)
    print('Selected Components: ', component)
    run_randomForest(X_train_pca, X_test_pca, y_train, y_test)
    print()


Selected Components:  1
Accuracy on test set: 
0.5

Selected Components:  2
Accuracy on test set: 
0.7545454545454545

Selected Components:  3
Accuracy on test set: 
0.8

Selected Components:  4
Accuracy on test set: 
0.8363636363636363

Selected Components:  5
Accuracy on test set: 
0.8909090909090909

Selected Components:  6
Accuracy on test set: 
0.8818181818181818

Selected Components:  7
Accuracy on test set: 
0.9454545454545454

Selected Components:  8
Accuracy on test set: 
0.9636363636363636

Selected Components:  9
Accuracy on test set: 
0.9545454545454546

Selected Components:  10
Accuracy on test set: 
0.9636363636363636

Selected Components:  11
Accuracy on test set: 
0.9636363636363636

Selected Components:  12
Accuracy on test set: 
0.9454545454545454

Selected Components:  13
Accuracy on test set: 
0.9545454545454546

Selected Components:  14
Accuracy on test set: 
0.9545454545454546

Selected Components:  15
Accuracy on test set: 
0.9454545454545454

Selected Components