In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
train = pd.read_csv('Data/train.csv',index_col = 0)

In [3]:
test = pd.read_csv('Data/test.csv',index_col = 0)

In [4]:
train.drop(columns = ['Stoich. Formula', 'Point Group','split','rad_mean','rad_var','SG_vec','SG_freq'],inplace=True)

In [5]:
test.drop(columns = ['Stoich. Formula', 'Point Group','split','rad_mean','rad_var','SG_vec','SG_freq'],inplace=True)

In [12]:
y_0 = []
y_TI = []
y_TSM = []
for i in range(len(train)):
    if train['Top_Class'].iloc[i]==0:
        y_0.append(0)
        y_TI.append(0)
        y_TSM.append(0)
    elif train['Top_Class'].iloc[i]==1 or train['Top_Class'].iloc[i]==2:
        y_0.append(1)
        y_TI.append(1)
        y_TSM.append(0)  
    else:
        y_0.append(1)
        y_TI.append(0)
        y_TSM.append(1)  

In [18]:
y_0 = []
y_TI = []
y_TSM = []
for i in range(len(test)):
    if test['Top_Class'].iloc[i]==0:
        y_0.append(0)
        y_TI.append(0)
        y_TSM.append(0)
    elif test['Top_Class'].iloc[i]==1 or test['Top_Class'].iloc[i]==2:
        y_0.append(1)
        y_TI.append(1)
        y_TSM.append(0)  
    else:
        y_0.append(1)
        y_TI.append(0)
        y_TSM.append(1)  

In [14]:
train['y_triv'] = y_0
train['y_TI'] = y_TI
train['y_TSM'] = y_TSM

In [19]:
test['y_triv'] = y_0
test['y_TI'] = y_TI
test['y_TSM'] = y_TSM

In [23]:
def fit_GBT(X,y,X_test,y_test):

    model = XGBClassifier(use_label_encoder=False, eta=0.3)
    param_grid = {'max_depth':[10,30,100],'min_child_weight':[5,6],'colsample_bytree' : [0.5,0.6,0.7,0.8,0.9,1],'colsample_bynode' : [0.5,0.6,0.7,0.8,0.9,1],'lambda' : [0.01,0.03,0.1,0.3,1,10,30]}

    gs = RandomizedSearchCV(model,param_distributions = param_grid, n_jobs = -1, n_iter = 50)
    results = gs.fit(X,y)
    print('Results of randomized search ...')

    #display(pd.DataFrame(results.cv_results_).sort_values('rank_test_score'))
    print('='*50)
    print('Best Parameters:')
    display(results.best_params_)
    clf = XGBClassifier(**results.best_params_)
    clf.fit(X,y)
    y_pred_train = clf.predict(X)
    y_pred_test = clf.predict(X_test)
    print('Precision, Recall and F1 score for train data')
    display(metrics.precision_recall_fscore_support(y,y_pred_train))
    print('Precision, Recall and F1 score for test data')
    display(metrics.precision_recall_fscore_support(y_test,y_pred_test))
    print('Accuracy on train data')
    display(metrics.accuracy_score(y,y_pred_train))
    print('Accuracy on test data')
    display(metrics.accuracy_score(y_test,y_pred_test))
    #acc_scores.append(metrics.accuracy_score(y_test,y_pred_test))
    print('='*50)
#    print('Plotting feature importances .. ')
#     fig, ax = plt.subplots(figsize=(15, 15))
#     plot_importance(clf, ax=ax)

## Top or not

In [21]:
fit_GBT(train.drop(columns = ['Top_Class','y_TI','y_TSM','y_triv']),train['y_triv'],test.drop(columns = ['Top_Class','y_TI','y_TSM','y_triv']),test['y_triv'])

Results of randomized search ...
Best Parameters:


{'min_child_weight': 5,
 'max_depth': 10,
 'lambda': 0.3,
 'colsample_bytree': 0.8,
 'colsample_bynode': 0.9}





Precision, Recall and F1 score for train data


(array([1.        , 0.99966159]),
 array([0.99956578, 1.        ]),
 array([0.99978284, 0.99983077]),
 array([4606, 5908]))

Precision, Recall and F1 score for test data


(array([0.95059625, 0.97525773]),
 array([0.96875   , 0.96073121]),
 array([0.95958727, 0.96793997]),
 array([1152, 1477]))

Accuracy on train data


0.9998097774396043

Accuracy on test data


0.9642449600608597

NameError: name 'acc_scores' is not defined







## TI vs all

In [22]:
fit_GBT(train.drop(columns = ['Top_Class','y_TI','y_TSM','y_triv']),train['y_TI'],test.drop(columns = ['Top_Class','y_TI','y_TSM','y_triv']),test['y_TI'])

Results of randomized search ...
Best Parameters:


{'min_child_weight': 5,
 'max_depth': 10,
 'lambda': 0.01,
 'colsample_bytree': 0.5,
 'colsample_bynode': 0.5}





Precision, Recall and F1 score for train data


(array([0.99379354, 0.99121802]),
 array([0.99707714, 0.98147448]),
 array([0.99543263, 0.98632219]),
 array([7869, 2645]))

Precision, Recall and F1 score for test data


(array([0.91727851, 0.84129693]),
 array([0.95271988, 0.74471299]),
 array([0.93466334, 0.7900641 ]),
 array([1967,  662]))

Accuracy on train data


0.9931519878257561

Accuracy on test data


0.900342335488779

NameError: name 'acc_scores' is not defined

In [24]:
fit_GBT(train.drop(columns = ['Top_Class','y_TI','y_TSM','y_triv']),train['y_TSM'],test.drop(columns = ['Top_Class','y_TI','y_TSM','y_triv']),test['y_TSM'])

Results of randomized search ...
Best Parameters:


{'min_child_weight': 5,
 'max_depth': 10,
 'lambda': 0.03,
 'colsample_bytree': 0.6,
 'colsample_bynode': 0.9}





Precision, Recall and F1 score for train data


(array([0.99806443, 0.99024688]),
 array([0.99558682, 0.99570947]),
 array([0.99682408, 0.99297066]),
 array([7251, 3263]))

Precision, Recall and F1 score for test data


(array([0.9373297 , 0.88161209]),
 array([0.94818082, 0.85889571]),
 array([0.94272403, 0.87010566]),
 array([1814,  815]))

Accuracy on train data


0.9956248811108998

Accuracy on test data


0.9205020920502092

















## Random Forests

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
def fit_RF(X,y,X_test,y_test):

    model = RandomForestClassifier(class_weight='balanced',random_state=0)
    param_grid = {'n_estimators':[10,25,50,100,200],'criterion':['gini', 'entropy'],'max_depth':[10,30,50,100,200]}

    gs = RandomizedSearchCV(model,param_distributions = param_grid, n_jobs = -1, n_iter = 50)
    results = gs.fit(X,y)
    print('Results of randomized search ...')

    #display(pd.DataFrame(results.cv_results_).sort_values('rank_test_score'))
    print('='*50)
    print('Best Parameters:')
    display(results.best_params_)
    clf = RandomForestClassifier(**results.best_params_)
    clf.fit(X,y)
    y_pred_train = clf.predict(X)
    y_pred_test = clf.predict(X_test)
    print('Precision, Recall and F1 score for train data')
    display(metrics.precision_recall_fscore_support(y,y_pred_train))
    print('Precision, Recall and F1 score for test data')
    display(metrics.precision_recall_fscore_support(y_test,y_pred_test))
    print('Accuracy on train data')
    display(metrics.accuracy_score(y,y_pred_train))
    print('Accuracy on test data')
    display(metrics.accuracy_score(y_test,y_pred_test))
    #acc_scores.append(metrics.accuracy_score(y_test,y_pred_test))
    print('='*50)
#    print('Plotting feature importances .. ')
#     fig, ax = plt.subplots(figsize=(15, 15))
#     plot_importance(clf, ax=ax)

In [10]:
fit_RF(train.drop(columns = ['Top_Class']),train['Top_Class'],test.drop(columns = ['Top_Class']),test['Top_Class'])

Results of randomized search ...
Best Parameters:


{'n_estimators': 200, 'max_depth': 50, 'criterion': 'gini'}

Precision, Recall and F1 score for train data


(array([0.99978289, 0.99032258, 0.99353647, 0.98556999, 0.99788248]),
 array([0.99978289, 0.98208573, 0.99445471, 0.99201162, 0.99946978]),
 array([0.99978289, 0.98618696, 0.99399538, 0.98878031, 0.9986755 ]),
 array([4606, 1563, 1082, 1377, 1886]))

Precision, Recall and F1 score for test data


(array([0.85780169, 0.84971098, 0.69767442, 0.76923077, 0.77125506]),
 array([0.96875   , 0.75191816, 0.55350554, 0.61046512, 0.8089172 ]),
 array([0.90990624, 0.79782904, 0.61728395, 0.68071313, 0.78963731]),
 array([1152,  391,  271,  344,  471]))

Accuracy on train data


0.995529769830702

Accuracy on test data


0.8181818181818182



## KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
def fit_knn(X,y,X_test,y_test):

    model = KNeighborsClassifier()
    param_grid = {'weights':['uniform', 'distance'],'n_neighbors':[10,15,20,30,50,75,100,150]}

    gs = RandomizedSearchCV(model,param_distributions = param_grid, n_jobs = -1, n_iter = 50)
    results = gs.fit(X,y)
    print('Results of randomized search ...')

    #display(pd.DataFrame(results.cv_results_).sort_values('rank_test_score'))
    print('='*50)
    print('Best Parameters:')
    display(results.best_params_)
    clf = KNeighborsClassifier(**results.best_params_)
    clf.fit(X,y)
    y_pred_train = clf.predict(X)
    y_pred_test = clf.predict(X_test)
    print('Precision, Recall and F1 score for train data')
    display(metrics.precision_recall_fscore_support(y,y_pred_train))
    print('Precision, Recall and F1 score for test data')
    display(metrics.precision_recall_fscore_support(y_test,y_pred_test))
    print('Accuracy on train data')
    display(metrics.accuracy_score(y,y_pred_train))
    print('Accuracy on test data')
    display(metrics.accuracy_score(y_test,y_pred_test))
    #acc_scores.append(metrics.accuracy_score(y_test,y_pred_test))
    print('='*50)
#    print('Plotting feature importances .. ')
#     fig, ax = plt.subplots(figsize=(15, 15))
#     plot_importance(clf, ax=ax)

In [24]:
fit_knn(train.drop(columns = ['Top_Class']),train['Top_Class'],test.drop(columns = ['Top_Class']),test['Top_Class'])



Results of randomized search ...
Best Parameters:


{'weights': 'distance', 'n_neighbors': 100}

Precision, Recall and F1 score for train data


(array([0.99978142, 0.94785276, 0.99626168, 0.9890431 , 0.99893048]),
 array([0.99305254, 0.98848369, 0.98521257, 0.98329702, 0.99045599]),
 array([0.99640562, 0.96774194, 0.99070632, 0.98616169, 0.99467519]),
 array([4606, 1563, 1082, 1377, 1886]))

Precision, Recall and F1 score for test data


(array([0.72571429, 0.63360882, 0.585     , 0.73275862, 0.60368664]),
 array([0.88194444, 0.58823529, 0.43173432, 0.49418605, 0.55626327]),
 array([0.79623824, 0.61007958, 0.49681529, 0.59027778, 0.57900552]),
 array([1152,  391,  271,  344,  471]))

Accuracy on train data


0.989823093018832

Accuracy on test data


0.6827691137314569



## SVM

In [25]:
from sklearn.svm import SVC

In [29]:
def fit_svm(X,y,X_test,y_test):

    model = SVC()
    param_grid = {'C':[0.001,0.01,0.1,1,10,100],'kernel':['linear', 'poly', 'rbf', 'sigmoid'],'degree':[2,3,5,7],'gamma':['scale','auto']}

    gs = RandomizedSearchCV(model,param_distributions = param_grid, n_jobs = -1, n_iter = 50)
    results = gs.fit(X,y)
    print('Results of randomized search ...')

    #display(pd.DataFrame(results.cv_results_).sort_values('rank_test_score'))
    print('='*50)
    print('Best Parameters:')
    display(results.best_params_)
    clf = SVC(**results.best_params_)
    clf.fit(X,y)
    y_pred_train = clf.predict(X)
    y_pred_test = clf.predict(X_test)
    print('Precision, Recall and F1 score for train data')
    display(metrics.precision_recall_fscore_support(y,y_pred_train))
    print('Precision, Recall and F1 score for test data')
    display(metrics.precision_recall_fscore_support(y_test,y_pred_test))
    print('Accuracy on train data')
    display(metrics.accuracy_score(y,y_pred_train))
    print('Accuracy on test data')
    display(metrics.accuracy_score(y_test,y_pred_test))
    #acc_scores.append(metrics.accuracy_score(y_test,y_pred_test))
    print('='*50)
#    print('Plotting feature importances .. ')
#     fig, ax = plt.subplots(figsize=(15, 15))
#     plot_importance(clf, ax=ax)

In [30]:
fit_svm(train.drop(columns = ['Top_Class']),train['Top_Class'],test.drop(columns = ['Top_Class']),test['Top_Class'])

KeyboardInterrupt: 