In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [130]:
def model_fit_predict(X_train, y_train):
    kf = StratifiedKFold(n_splits = 10, random_state=42, shuffle=True)
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    cnt = 0
    for train_index, test_index in kf.split(X_train, y_train):
        cnt = cnt+1
        print("-----------------------------------------------")
        print("Running for fold ", str(cnt))
        x_train = X_train.iloc[train_index]
        Y_train = y_train.iloc[train_index]
        x_test = X_train.iloc[test_index]
        Y_test = y_train.iloc[test_index]
        model = xgb.XGBClassifier(max_depth=3,learning_rate=0.1).fit(x_train, Y_train)
        predictions = model.predict(x_test)
        accuracies.append(accuracy_score(predictions, Y_test))
        precisions.append(np.mean(precision_score(predictions, Y_test, average = None)))
        recalls.append(np.mean(recall_score(predictions, Y_test, average = None)))
        f1s.append(np.mean(f1_score(predictions, Y_test, average = None)))
        print("Accuracy : ", accuracy_score(predictions, Y_test))
        print("Precision : ", precision_score(predictions, Y_test, average = None))
        print("Recall : ", recall_score(predictions, Y_test, average = None))
        print("F1 : ", f1_score(predictions, Y_test, average = None))
        print("-----------------------------------------------")
    print('Final Mean Scores ---------------------------------')
    print('Mean Accuracy Score : ', sum(accuracies)/len(accuracies))
    precision = sum(precisions)/len(precisions)
    recall = sum(recalls)/len(recalls)
    f1 = sum(f1s)/len(f1s)
    print('Mean Precision Score : ', np.mean(precision))
    print('Mean Recall Score : ', np.mean(recall))
    print('Mean F1 Score : ', np.mean(f1))

In [131]:
def find_categorical_features(df):
    df[df.select_dtypes(['object']).columns] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))
    cols = df.columns

    num_cols = df._get_numeric_data().columns
    print(list(set(cols) - set(num_cols)))
    return (list(set(cols) - set(num_cols)))

In [132]:
def encode_categorical_features(df):
    ONE_HOT_COLS = find_categorical_features(df)
    print("Starting DF shape: %d, %d" % df.shape)
    
    for col in ONE_HOT_COLS:
        s = df[col].unique()

        # Create a One Hot Dataframe with 1 row for each unique value
        one_hot_df = pd.get_dummies(s, prefix='%s_' % col)
        one_hot_df[col] = s

        print("Adding One Hot values for %s (the column has %d unique values)" % (col, len(s)))
        pre_len = len(df)

        # Merge the one hot columns
        df = df.merge(one_hot_df, on=[col], how="left")
        assert len(df) == pre_len
        print(df.shape)
    df.drop(ONE_HOT_COLS, axis=1, inplace=True)
    return df

### Iris Data

In [133]:
iris = pd.read_csv('iris.data', header=None)
column_names = ['SepalLength','SepalWidth','PetalLength','PetalWidth','Class']
iris.columns = column_names

In [134]:
iris

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [135]:
target = iris.Class
iris.drop(['Class'], axis=1, inplace=True)

In [136]:
model_fit_predict(iris, target)

-----------------------------------------------
Running for fold  1
Accuracy :  1.0
Precision :  [1. 1. 1.]
Recall :  [1. 1. 1.]
F1 :  [1. 1. 1.]




-----------------------------------------------
-----------------------------------------------
Running for fold  2
Accuracy :  0.9333333333333333
Precision :  [1.  1.  0.8]
Recall :  [1.         0.83333333 1.        ]
F1 :  [1.         0.90909091 0.88888889]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  1.0
Precision :  [1. 1. 1.]
Recall :  [1. 1. 1.]
F1 :  [1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  0.9333333333333333
Precision :  [1.  1.  0.8]
Recall :  [1.         0.83333333 1.        ]
F1 :  [1.         0.90909091 0.88888889]
-----------------------------------------------
-----------------------------------------------
Running for fold  5
Accuracy :  0.8666666666666667
Precision :  [1.  0.8 0.8]
Recall :  [1.  0.8 0.8]
F1 :  [1.  0.8 0.8]
-----------------------------------------------
-------------------

### Wine Data

In [137]:
wine = pd.read_csv('wine.data', header=None)
column_names = ['Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins' ,'Color intensity','Hue' ,'OD280/OD315 of diluted wines','Proline','target']
wine.columns = column_names

In [138]:
wine

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,target
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [139]:
target = wine.target
wine.drop(['target'], axis=1, inplace=True)

In [140]:
model_fit_predict(wine, target)

-----------------------------------------------
Running for fold  1
Accuracy : 



 1.0
Precision :  [1. 1. 1.]
Recall :  [1. 1. 1.]
F1 :  [1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  2
Accuracy :  0.9444444444444444
Precision :  [1.         0.85714286 1.        ]
Recall :  [1.         1.         0.83333333]
F1 :  [1.         0.92307692 0.90909091]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  0.8888888888888888
Precision :  [1.         0.85714286 0.8       ]
Recall :  [0.85714286 0.85714286 1.        ]
F1 :  [0.92307692 0.85714286 0.88888889]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  1.0
Precision :  [1. 1. 1.]
Recall :  [1. 1. 1.]
F1 :  [1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  5
Accuracy :  0.9444444444444444
Precision :  [1.         0.857

### Zoo

In [141]:
zoo = pd.read_csv('zoo.data', header=None)
column_names = ['hair','feathers','eggs','milk','airborne','aquatic','predator','toothed','backbone','breathes','enomous','fins','legs','tail','domestic','catsize','label']
zoo.columns = column_names

In [142]:
target = zoo.label
zoo.drop(['label'], axis=1, inplace=True)

In [143]:
target

0      1
1      1
2      4
3      1
4      1
      ..
96     1
97     6
98     1
99     7
100    2
Name: label, Length: 101, dtype: int64

In [144]:
model_fit_predict(zoo, target)



-----------------------------------------------
Running for fold  1




Accuracy :  1.0
Precision :  [1. 1. 1. 1. 1. 1.]
Recall :  [1. 1. 1. 1. 1. 1.]
F1 :  [1. 1. 1. 1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  2
Accuracy :  1.0
Precision :  [1. 1. 1. 1. 1.]
Recall :  [1. 1. 1. 1. 1.]
F1 :  [1. 1. 1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy :  0.9
Precision :  [1. 1. 0. 1. 0. 1.]
Recall :  [1. 1. 0. 1. 0. 1.]
F1 :  [1. 1. 0. 1. 0. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  1.0
Precision :  [1. 1. 1. 1. 1.]
Recall :  [1. 1. 1. 1. 1.]
F1 :  [1. 1. 1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  5
Accuracy :  1.0
Precision :  [1. 1. 1. 1. 1. 1.]
Recall :  [1. 1. 1. 1. 1. 1.]
F1 :  [1. 1. 1. 1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  6
Accuracy :  1.0
Precision :  [1. 1. 1. 1. 1. 1.]
Recall :  [1. 1. 1. 1. 1. 1.]
F1 :  [1. 1. 1. 1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  7
Accuracy :  1.0
Precision :  [1. 1. 1. 1. 1. 1.]
Recall :  [1. 1. 1. 1. 1. 1.]
F1 :  [1. 1. 1. 1. 1. 1.]
--------------------

### Bank

In [145]:
banknote = pd.read_csv('banknote_authentication.data', header=None)
column_names = ['variance of Wavelet Transformed image','skewness of Wavelet Transformed image','curtosis of Wavelet Transformed image','entropy of image','label']
banknote.columns = column_names

In [146]:
banknote

Unnamed: 0,variance of Wavelet Transformed image,skewness of Wavelet Transformed image,curtosis of Wavelet Transformed image,entropy of image,label
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [147]:
target = banknote.label
banknote.drop(['label'], axis=1, inplace=True)

In [148]:
model_fit_predict(banknote, target)

-----------------------------------------------
Running for fold  1
Accuracy :  1.0
Precision :  [1. 1.]
Recall :  [1. 1.]
F1 :  [1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  2




Accuracy :  0.9927536231884058
Precision :  [1.         0.98360656]
Recall :  [0.98717949 1.        ]
F1 :  [0.99354839 0.99173554]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  1.0
Precision :  [1. 1.]
Recall :  [1. 1.]
F1 :  [1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  1.0
Precision :  [1. 1.]
Recall :  [1. 1.]
F1 :  [1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  5
Accuracy :  1.0
Precision :  [1. 1.]
Recall :  [1. 1.]
F1 :  [1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  6
Accuracy :  1.0
Precision :  [1. 1.]
Recall :  [1. 1.]
F1 :  [1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  7
Accuracy :  

### User

In [149]:
user_knowledge = pd.read_csv('user-knowledge-modeling.data.txt', header=None)
column_names = ['STG','SCG','STR','LPR','PEG','label']
user_knowledge.columns = column_names

In [150]:
user_knowledge

Unnamed: 0,STG,SCG,STR,LPR,PEG,label
0,0.00,0.00,0.00,0.00,0.00,very_low
1,0.08,0.08,0.10,0.24,0.90,High
2,0.06,0.06,0.05,0.25,0.33,Low
3,0.10,0.10,0.15,0.65,0.30,Middle
4,0.08,0.08,0.08,0.98,0.24,Low
...,...,...,...,...,...,...
253,0.61,0.78,0.69,0.92,0.58,High
254,0.78,0.61,0.71,0.19,0.60,Middle
255,0.54,0.82,0.71,0.29,0.77,High
256,0.50,0.75,0.81,0.61,0.26,Middle


In [151]:
target = user_knowledge.label
user_knowledge.drop(['label'], axis=1, inplace=True)

In [152]:
model_fit_predict(user_knowledge, target)

-----------------------------------------------
Running for fold  1




Accuracy :  0.8076923076923077
Precision :  [1.         1.         0.66666667 0.33333333]
Recall :  [1.         0.61538462 1.         1.        ]
F1 :  [1.         0.76190476 0.8        0.5       ]
-----------------------------------------------
-----------------------------------------------
Running for fold  2
Accuracy :  0.8846153846153846
Precision :  [0.83333333 1.         0.88888889 0.66666667]
Recall :  [1.         0.8        0.88888889 1.        ]
F1 :  [0.90909091 0.88888889 0.88888889 0.8       ]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  0.9615384615384616
Precision :  [0.83333333 1.         1.         1.        ]
Recall :  [1.  1.  0.9 1. ]
F1 :  [0.90909091 1.         0.94736842 1.        ]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  0.9230769230769231
Precision :  [0.83333333 1.         0.88888889 1.      

### Pima Indians Diabetes

In [153]:
pima = pd.read_csv('pima-indians-diabetes.data', header=None)
column_names = ['Number of times pregnant','Plasma glucose concentration a 2 hours in an oral glucose tolerance test','Diastolic blood pressure','Triceps skin fold thickness','2-Hour serum insulin','Body mass index','Diabetes pedigree function','Age','label']
pima.columns = column_names

In [154]:
pima

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [155]:
target = pima.label
pima.drop(['label'], axis=1, inplace=True)

In [156]:
model_fit_predict(pima, target)

-----------------------------------------------
Running for fold  1
Accuracy :  0.7922077922077922
Precision :  [0.78       0.81481481]
Recall :  [0.88636364 0.66666667]
F1 :  [0.82978723 0.73333333]
-----------------------------------------------
-----------------------------------------------
Running for fold  2




Accuracy :  0.7792207792207793
Precision :  [0.92       0.51851852]
Recall :  [0.77966102 0.77777778]
F1 :  [0.8440367  0.62222222]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  0.7142857142857143
Precision :  [0.84       0.48148148]
Recall :  [0.75       0.61904762]
F1 :  [0.79245283 0.54166667]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  0.8961038961038961
Precision :  [0.96       0.77777778]
Recall :  [0.88888889 0.91304348]
F1 :  [0.92307692 0.84      ]
-----------------------------------------------
-----------------------------------------------
Running for fold  5
Accuracy :  0.8311688311688312
Precision :  [0.9       0.7037037]
Recall :  [0.8490566  0.79166667]
F1 :  [0.87378641 0.74509804]
-----------------------------------------------
-----------------------------------------------
Running for fold  6
Accuracy :

### Glass

In [157]:
glass = pd.read_csv('glass.data', header=None)
column_names = ['ID','RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Class']
glass.columns = column_names

In [158]:
glass = glass.drop(columns=['ID'])

In [159]:
glass

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Class
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [160]:
target = glass.Class
glass.drop(['Class'], axis=1, inplace=True)

In [161]:
model_fit_predict(glass, target)



-----------------------------------------------
Running for fold  1


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy :  0.7727272727272727
Precision :  [0.85714286 0.875      0.         1.         0.         1.        ]
Recall :  [0.66666667 0.875      0.         0.5        0.         1.        ]
F1 :  [0.75       0.875      0.         0.66666667 0.         1.        ]
-----------------------------------------------
-----------------------------------------------
Running for fold  2
Accuracy :  0.7272727272727273
Precision :  [1.         0.625      0.         1.         1.         0.66666667]
Recall :  [0.7        0.83333333 0.         0.5        1.         0.66666667]
F1 :  [0.82352941 0.71428571 0.         0.66666667 1.         0.66666667]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  0.8636363636363636
Precision :  [0.85714286 0.875      1.         1.         1.         0.66666667]
Recall :  [1.         0.875      1.         0.33333333 1.         1.        ]
F1 :  [0.92307692 0.875      1.         0.5       

### Seeds

In [162]:
seeds = pd.read_csv('seeds_dataset.txt', sep='\t', header=None)
column_names = ['1', '2', '3', '4', '5', '6', '7', 'label']
seeds.columns = column_names

In [163]:
seeds

Unnamed: 0,1,2,3,4,5,6,7,label
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


In [164]:
target = seeds.label
seeds.drop(['label'], axis=1, inplace=True)

In [165]:
seeds

Unnamed: 0,1,2,3,4,5,6,7
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175
...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044


In [166]:
model_fit_predict(seeds, target)

-----------------------------------------------
Running for fold  1




Accuracy :  0.9047619047619048
Precision :  [0.85714286 1.         0.85714286]
Recall :  [0.85714286 1.         0.85714286]
F1 :  [0.85714286 1.         0.85714286]
-----------------------------------------------
-----------------------------------------------
Running for fold  2
Accuracy :  1.0
Precision :  [1. 1. 1.]
Recall :  [1. 1. 1.]
F1 :  [1. 1. 1.]
-----------------------------------------------
-----------------------------------------------
Running for fold  3
Accuracy :  0.9047619047619048
Precision :  [0.85714286 1.         0.85714286]
Recall :  [0.85714286 0.875      1.        ]
F1 :  [0.85714286 0.93333333 0.92307692]
-----------------------------------------------
-----------------------------------------------
Running for fold  4
Accuracy :  0.9047619047619048
Precision :  [0.85714286 1.         0.85714286]
Recall :  [0.85714286 1.         0.85714286]
F1 :  [0.85714286 1.         0.85714286]
-----------------------------------------------
-------------------------------