In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler


In [7]:
data = pd.read_csv('dermatology_csv.csv')
data.head()

X = data.iloc[:,0:34]

y = data.iloc[:,-1]

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy ='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

In [8]:
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)
X_train_filter = constant_filter.transform(X_train)
X_test_filter = constant_filter.transform(X_test)

In [9]:
X_train_filter.shape, X_test_filter.shape

((256, 34), (110, 34))

In [10]:
X_train_T = X_train_filter.T
X_test_T = X_test_filter.T

In [11]:
X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)

In [12]:
X_train_T.duplicated().sum()

0

In [13]:
duplicated_features = X_train_T.duplicated()

In [14]:
features_to_keep = [not index for index in duplicated_features]

X_train_unique = X_train_T[features_to_keep].T
X_test_unique = X_test_T[features_to_keep].T

In [15]:
scaler = StandardScaler().fit(X_train_unique)
X_train_unique = scaler.transform(X_train_unique)
X_test_unique = scaler.transform(X_test_unique)

In [16]:
X_train_unique = pd.DataFrame(X_train_unique)
X_test_unique = pd.DataFrame(X_test_unique)

X_train_unique.shape, X_test_unique.shape


((256, 34), (110, 34))

In [17]:
corrmat = X_train_unique.corr()

In [18]:
def get_correlation(data, threshold):
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold:
                colname = corrmat.columns[i]
                corr_col.add(colname)
    return corr_col

corr_features = get_correlation(X_train_unique, 0.70)
print('correlated features: ', len(set(corr_features)) )

correlated features:  11


In [19]:
X_train_uncorr = X_train_unique.drop(labels=corr_features, axis = 1)
X_test_uncorr = X_test_unique.drop(labels = corr_features, axis = 1)

X_train_uncorr.shape, X_test_uncorr.shape

((256, 23), (110, 23))

In [20]:
from sklearn.decomposition import PCA


pca = PCA(n_components=2, random_state=42)
pca.fit(X_train_uncorr)

X_train_pca = pca.transform(X_train_uncorr)
X_test_pca = pca.transform(X_test_uncorr)
X_train_pca.shape, X_test_pca.shape

((256, 2), (110, 2))

In [21]:
def run_randomForest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, y_pred))

In [22]:
%%time
run_randomForest(X_train_pca, X_test_pca, y_train, y_test)

Accuracy on test set: 
0.7272727272727273
CPU times: user 811 ms, sys: 212 ms, total: 1.02 s
Wall time: 857 ms


In [23]:
%%time
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy on test set: 
0.9545454545454546
CPU times: user 192 ms, sys: 23 ms, total: 215 ms
Wall time: 248 ms


In [24]:
X_train_uncorr.shape

(256, 23)

In [25]:
for component in range(1,34):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_train_unique)
    X_train_pca = pca.transform(X_train_unique)
    X_test_pca = pca.transform(X_test_unique)
    print(pca.explained_variance_ratio_)
    print('Selected Components: ', component)
    run_randomForest(X_train_pca, X_test_pca, y_train, y_test)
    print()


[0.27352422]
Selected Components:  1
Accuracy on test set: 
0.7181818181818181

[0.27352422 0.16227539]
Selected Components:  2
Accuracy on test set: 
0.7909090909090909

[0.27352422 0.16227539 0.09410222]
Selected Components:  3
Accuracy on test set: 
0.8727272727272727

[0.27352422 0.16227539 0.09410222 0.06773499]
Selected Components:  4
Accuracy on test set: 
0.8909090909090909

[0.27352422 0.16227539 0.09410222 0.06773499 0.04014264]
Selected Components:  5
Accuracy on test set: 
0.9090909090909091

[0.27352422 0.16227539 0.09410222 0.06773499 0.04014264 0.03652459]
Selected Components:  6
Accuracy on test set: 
0.9454545454545454

[0.27352422 0.16227539 0.09410222 0.06773499 0.04014264 0.03652459
 0.03079273]
Selected Components:  7
Accuracy on test set: 
0.9363636363636364

[0.27352422 0.16227539 0.09410222 0.06773499 0.04014264 0.03652459
 0.03079273 0.02773674]
Selected Components:  8
Accuracy on test set: 
0.9363636363636364

[0.27352422 0.16227539 0.09410222 0.06773499 0.040

Accuracy on test set: 
0.9454545454545454

