In [1]:
#https://www.kaggle.com/c/data-science-london-scikit-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")


In [2]:
train = pd.read_csv("...input/train.csv",header=None)
trainLabels = pd.read_csv("...input/trainLabels.csv",header=None)
test = pd.read_csv("...input/test.csv",header=None)

In [3]:
train.shape,trainLabels.shape,test.shape

((1000, 40), (1000, 1), (9000, 40))

In [4]:
print(train.isnull().any().any())
print(test.isnull().any().any())
print(trainLabels.isnull().any().any())

False
False
False


In [5]:
#preprocessing

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train,trainLabels,test_size=0.2,random_state=123)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((800, 40), (200, 40), (800, 1), (200, 1))

# Classification Models

In [6]:
#1.Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train,y_train.values.ravel())
predict_gnb = gnb.predict(x_test)
print("Naive Bayes:",accuracy_score(y_test,predict_gnb))
print(metrics.classification_report(y_test,predict_gnb))
''''''
print(metrics.confusion_matrix(y_test,predict_gnb))

Naive Bayes: 0.81
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        94
           1       0.81      0.83      0.82       106

    accuracy                           0.81       200
   macro avg       0.81      0.81      0.81       200
weighted avg       0.81      0.81      0.81       200

[[74 20]
 [18 88]]


In [7]:
#2.Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train.values.ravel())
predict_lr = lr.predict(x_test)
print("Logistic Regression:",accuracy_score(y_test,predict_lr))
print(metrics.classification_report(y_test,predict_lr))
''''''
print(metrics.confusion_matrix(y_test,predict_lr))

Logistic Regression: 0.835
              precision    recall  f1-score   support

           0       0.84      0.80      0.82        94
           1       0.83      0.87      0.85       106

    accuracy                           0.83       200
   macro avg       0.84      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200

[[75 19]
 [14 92]]


In [8]:
#3.K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto',
                           leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
knn.fit(x_train,y_train.values.ravel())
predict_knn = knn.predict(x_test)
print("Nearest Neighbors:",accuracy_score(y_test,predict_knn))
print(metrics.classification_report(y_test,predict_knn))
print(metrics.confusion_matrix(y_test,predict_knn))

Nearest Neighbors: 0.91
              precision    recall  f1-score   support

           0       0.87      0.96      0.91        94
           1       0.96      0.87      0.91       106

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200

[[90  4]
 [14 92]]


In [9]:
#3.Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0, max_depth=5)
tree.fit(x_train,y_train.values.ravel())
predict_tree = tree.predict(x_test)
print("Decision Tree:",accuracy_score(y_test,predict_tree))
print(metrics.classification_report(y_test,predict_tree))
print(metrics.confusion_matrix(y_test,predict_tree))

Decision Tree: 0.795
              precision    recall  f1-score   support

           0       0.74      0.86      0.80        94
           1       0.86      0.74      0.79       106

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.80      0.80      0.79       200

[[81 13]
 [28 78]]


In [10]:
#4.Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0, max_depth=5)
rf.fit(x_train,y_train.values.ravel())
predict_rf = rf.predict(x_test)
print("Random Forest:",accuracy_score(y_test,predict_rf))
print(metrics.classification_report(y_test,predict_rf))
print(metrics.confusion_matrix(y_test,predict_rf))

Random Forest: 0.84
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        94
           1       0.84      0.86      0.85       106

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.84      0.84      0.84       200

[[77 17]
 [15 91]]


In [11]:
#5.SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train,y_train.values.ravel())
predict_svm = svm.predict(x_test)
print("SVM:",accuracy_score(y_test,predict_svm))
print(metrics.classification_report(y_test,predict_svm))
print(metrics.confusion_matrix(y_test,predict_svm))

SVM: 0.905
              precision    recall  f1-score   support

           0       0.90      0.89      0.90        94
           1       0.91      0.92      0.91       106

    accuracy                           0.91       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.91      0.90       200

[[84 10]
 [ 9 97]]


In [12]:
#All in One place
print("Naive Bayes:",accuracy_score(y_test,predict_gnb))
print("Logistic Regression:",accuracy_score(y_test,predict_lr))
print("Nearest Neighbors:",accuracy_score(y_test,predict_knn))
print("Decision Tree:",accuracy_score(y_test,predict_tree))
print("Random Forest:",accuracy_score(y_test,predict_rf))
print("SVM:",accuracy_score(y_test,predict_svm))


Naive Bayes: 0.81
Logistic Regression: 0.835
Nearest Neighbors: 0.91
Decision Tree: 0.795
Random Forest: 0.84
SVM: 0.905


# Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler,Normalizer
norm = Normalizer()
x_norm_train = norm.fit_transform(x_train)

x_norm_test = norm.fit_transform(x_test)

In [14]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_norm_train,y_train)
predict_gnb = gnb.predict(x_norm_test)
print("Naive Bayes:",accuracy_score(y_test,predict_gnb))


#2.Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_norm_train,y_train)
predict_lr = lr.predict(x_norm_test)
print("Logistic Regression:",accuracy_score(y_test,predict_lr))


#3.K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto',
                           leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
knn.fit(x_norm_train,y_train)
predict_knn = knn.predict(x_norm_test)
print("Nearest Neighbors:",accuracy_score(y_test,predict_knn))


#3.Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0, max_depth=5)
tree.fit(x_norm_train,y_train)
predict_tree = tree.predict(x_norm_test)
print("Decision Tree:",accuracy_score(y_test,predict_tree))

#4.Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0, max_depth=5)
rf.fit(x_norm_train,y_train)
predict_rf = rf.predict(x_norm_test)
print("Random Forest:",accuracy_score(y_test,predict_rf))

#4.SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_norm_train,y_train)
predict_svm = svm.predict(x_test)
print("SVM:",accuracy_score(y_test,predict_svm))


Naive Bayes: 0.805
Logistic Regression: 0.83
Nearest Neighbors: 0.89
Decision Tree: 0.81
Random Forest: 0.85
SVM: 0.78


# Principal Component Analysis

In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
x_pca_train = pca.fit_transform(x_train)

x_pca_test = pca.fit_transform(x_test)

In [16]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_pca_train,y_train)
predict_gnb = gnb.predict(x_pca_test)
print("Naive Bayes:",accuracy_score(y_test,predict_gnb))


#2.Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_pca_train,y_train)
predict_lr = lr.predict(x_pca_test)
print("Logistic Regression:",accuracy_score(y_test,predict_lr))


#3.K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto',
                           leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None)
knn.fit(x_pca_train,y_train)
predict_knn = knn.predict(x_pca_test)
print("Nearest Neighbors:",accuracy_score(y_test,predict_knn))


#3.Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0, max_depth=5)
tree.fit(x_pca_train,y_train)
predict_tree = tree.predict(x_pca_test)
print("Decision Tree:",accuracy_score(y_test,predict_tree))

#4.Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0, max_depth=5)
rf.fit(x_pca_train,y_train)
predict_rf = rf.predict(x_pca_test)
print("Random Forest:",accuracy_score(y_test,predict_rf))

#4.SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_pca_train,y_train)
predict_svm = svm.predict(x_pca_test)
print("SVM:",accuracy_score(y_test,predict_svm))


Naive Bayes: 0.745
Logistic Regression: 0.785
Nearest Neighbors: 0.67
Decision Tree: 0.69
Random Forest: 0.75
SVM: 0.71


In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC

x_all = np.r_[train,test]
print('x_all shape :',x_all.shape)

# USING THE GAUSSIAN MIXTURE MODEL 
lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components,covariance_type=cv_type)
        gmm.fit(x_all)
        bic.append(gmm.aic(x_all))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
            
best_gmm.fit(x_all)
gmm_train = best_gmm.predict_proba(train)
gmm_test = best_gmm.predict_proba(test)


#Random Forest Classifier
rfc = RandomForestClassifier(random_state=99)

#USING GRID SEARCH
n_estimators = [10, 50, 100, 200,400]
max_depth = [3, 10, 20, 40]
param_grid = dict(n_estimators=n_estimators,max_depth=max_depth)

grid_search_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv = 10,scoring='accuracy',n_jobs=-1).fit(gmm_train, trainLabels.values.ravel())
rfc_best = grid_search_rfc.best_estimator_
print('Random Forest Best Score',grid_search_rfc.best_score_)
print('Random Forest Best Parmas',grid_search_rfc.best_params_)
print('Random Forest Accuracy',cross_val_score(rfc_best,gmm_train, trainLabels.values.ravel(), cv=10).mean())

#KNN 
knn = KNeighborsClassifier()

#USING GRID SEARCH
n_neighbors=[3,5,6,7,8,9,10]
param_grid = dict(n_neighbors=n_neighbors)

grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv = 10, n_jobs=-1,scoring='accuracy').fit(gmm_train,trainLabels.values.ravel())
knn_best = grid_search_knn.best_estimator_
print('KNN Best Score', grid_search_knn.best_score_)
print('KNN Best Params',grid_search_knn.best_params_)
print('KNN Accuracy',cross_val_score(knn_best,gmm_train, trainLabels.values.ravel(), cv=10).mean())

#SVM
svc = SVC()

#USING GRID SEARCH
parameters = [{'kernel':['linear'],'C':[1,10,100]},
              {'kernel':['rbf'],'C':[1,10,100],'gamma':[0.05,0.0001,0.01,0.001]}]
grid_search_svm = GridSearchCV(estimator=svc, param_grid=parameters, cv = 10, n_jobs=-1,scoring='accuracy').fit(gmm_train, trainLabels.values.ravel())
svm_best = grid_search_svm.best_estimator_
print('SVM Best Score',grid_search_svm.best_score_)
print('SVM Best Params',grid_search_svm.best_params_)
print('SVM Accuracy',cross_val_score(svm_best,gmm_train, trainLabels.values.ravel(), cv=10).mean())

x_all shape : (10000, 40)
Random Forest Best Score 0.996
Random Forest Best Parmas {'max_depth': 3, 'n_estimators': 10}
Random Forest Accuracy 0.9960000000000001
KNN Best Score 0.996
KNN Best Params {'n_neighbors': 3}
KNN Accuracy 0.9960000000000001
SVM Best Score 0.996
SVM Best Params {'C': 1, 'kernel': 'linear'}
SVM Accuracy 0.9960000000000001
