In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

### DATASET 1 - bank_marketing_dataset

In [2]:
le = LabelEncoder() # encoder
#pd.set_option("display.max_columns", 999)
df = pd.read_csv('bank_marketing_dataset.csv')
df = df.sample(frac=1)
y = df.loc[:,'y']
df = df.loc[:,df.columns != 'y']
y = le.fit_transform(y)
#print(df.describe(include = object)) #statistics of dataset (omit @param:include to include numerical variables)
df.drop(columns = ['default'],inplace = True) 
df = pd.get_dummies(df) # one hot encoding categorical variables

In [6]:
df.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
2083,42,214,28,21.0,24,-1.0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2234,34,1,19,108.0,1,-1.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2070,80,8304,6,681.0,1,118.0,11,0,0,0,...,0,0,0,0,0,0,0,0,1,0
351,35,32,21,180.0,1,-1.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1224,27,201,4,172.0,2,-1.0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [19]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.33) # split to train and test dataset

In [20]:
'''Logistic Regression model, Suport Vector Machine, Random Forest'''
lg = LogisticRegression(solver='newton-cg',tol=1e-2,max_iter=100)
svm = SVC(kernel='sigmoid')
rf = RandomForestClassifier(n_estimators=40,criterion='entropy',max_depth=7,min_samples_split=3,max_features=30)

In [21]:
'''FIT MODELS'''
#print(cross_val_score(lg,X_train,y_train))
lg.fit(X_train,y_train)
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
''''''



''

In [22]:
'''GET PREDICTIONS'''
predictions_lg = lg.predict(X_test)
predictions_svm = svm.predict(X_test)
predicitons_rf = rf.predict(X_test)

In [23]:
'''PRINT CLASSIFICATION PERFORMANCE REPORT'''
print('Logistic Regression')
print(classification_report(y_test,predictions_lg))
print('-----------')
print('SVM')
print(classification_report(y_test,predictions_svm))
print('-----------')
print('RF')
print(classification_report(y_test,predicitons_rf))

Logistic Regression
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1343
           1       0.59      0.34      0.43       149

    accuracy                           0.91      1492
   macro avg       0.76      0.66      0.69      1492
weighted avg       0.90      0.91      0.90      1492

-----------
SVM
              precision    recall  f1-score   support

           0       0.89      0.86      0.88      1343
           1       0.04      0.05      0.04       149

    accuracy                           0.78      1492
   macro avg       0.46      0.45      0.46      1492
weighted avg       0.81      0.78      0.79      1492

-----------
RF
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1343
           1       0.59      0.37      0.45       149

    accuracy                           0.91      1492
   macro avg       0.76      0.67      0.70      1492
weighted avg       0.90

### DATASET 2 - breast_cancer_dataset

In [30]:
df = pd.read_csv("breast_cancer_dataset.csv")
df = df.sample(frac=1)
df = pd.get_dummies(df)
y = df.loc[:,'class']
df = df.loc[:,df.columns != 'class']

In [31]:
df.head()

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis
89,2,1,1,2,3,1,2,1,1
344,7,6,4,8,10,10,9,5,3
74,10,6,4,1,3,4,3,2,3
196,8,4,4,5,4,7,7,8,2
503,4,1,1,1,2,1,3,1,1


In [33]:
df.describe() # data statistics

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,4.539543,3.184534,3.265378,2.845343,3.29877,-2632.518453,3.490334,2.989455,1.637961
std,2.896501,3.002236,2.955935,2.873626,2.304775,16035.653408,2.324925,3.091315,1.773941
min,1.0,1.0,1.0,1.0,1.0,-100000.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,4.0,1.0,2.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,6.0,5.0,5.0,4.0,4.0,8.0,5.0,4.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [37]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.33) # split to train and test dataset

In [45]:
'''Logistic Regression model, Suport Vector Machine, Random Forest'''
lg = LogisticRegression(solver='newton-cg',tol=1e-2,max_iter=100)
svm = SVC(kernel='rbf')
rf = RandomForestClassifier(n_estimators=10,criterion='entropy',max_depth=3,min_samples_split=3,max_features=5)

In [46]:
'''FIT MODELS'''
#print(cross_val_score(lg,X_train,y_train))
lg.fit(X_train,y_train)
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
''''''



''

In [47]:
'''GET PREDICTIONS'''
predictions_lg = lg.predict(X_test)
predictions_svm = svm.predict(X_test)
predicitons_rf = rf.predict(X_test)

In [48]:
'''PRINT CLASSIFICATION PERFORMANCE REPORT'''
print('Logistic Regression')
print(classification_report(y_test,predictions_lg))
print('-----------')
print('SVM')
print(classification_report(y_test,predictions_svm))
print('-----------')
print('RF')
print(classification_report(y_test,predicitons_rf))

Logistic Regression
              precision    recall  f1-score   support

           2       0.94      0.96      0.95       113
           4       0.93      0.91      0.92        75

    accuracy                           0.94       188
   macro avg       0.94      0.93      0.93       188
weighted avg       0.94      0.94      0.94       188

-----------
SVM
              precision    recall  f1-score   support

           2       0.99      0.91      0.95       113
           4       0.88      0.99      0.93        75

    accuracy                           0.94       188
   macro avg       0.94      0.95      0.94       188
weighted avg       0.95      0.94      0.94       188

-----------
RF
              precision    recall  f1-score   support

           2       0.96      0.96      0.96       113
           4       0.93      0.93      0.93        75

    accuracy                           0.95       188
   macro avg       0.94      0.94      0.94       188
weighted avg       0.95

### DATASET 3 - cars_dataset

In [28]:
df = pd.read_csv("cars_dataset.csv")
df = df.sample(frac=1)
y = df.loc[:,'car']
df = df.loc[:,df.columns != 'car']
le = LabelEncoder() # encoder
y = le.fit_transform(y)

In [29]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
339,high,high,four,two,big,low
200,high,med,three,two,small,high
91,med,med,two,two,small,med
1714,low,med,5more,more,med,med
395,med,low,four,two,big,high


In [30]:
df.describe(include=object) # print statistics

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
count,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3
top,low,low,5more,more,big,low
freq,432,432,432,576,576,576


In [32]:
df = pd.get_dummies(df) # one hot encoding categorical variables

In [33]:
df.head()

Unnamed: 0,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_5more,doors_four,...,doors_two,persons_four,persons_more,persons_two,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
339,1,0,0,0,1,0,0,0,0,1,...,0,0,0,1,1,0,0,0,1,0
200,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,1,0,0
91,0,0,1,0,0,0,1,0,0,0,...,1,0,0,1,0,0,1,0,0,1
1714,0,1,0,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,1
395,0,0,1,0,0,1,0,0,0,1,...,0,0,0,1,1,0,0,1,0,0


In [34]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.33) # split to train and test dataset

In [43]:
'''Logistic Regression model, Suport Vector Machine, Random Forest'''
lg = LogisticRegression(solver='newton-cg',tol=1e-2,max_iter=100)
svm = SVC(kernel='rbf')
rf = RandomForestClassifier(n_estimators=30,criterion='entropy',max_depth=6,min_samples_split=3,max_features=10)

In [44]:
'''FIT MODELS'''
#print(cross_val_score(lg,X_train,y_train))
lg.fit(X_train,y_train)
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
''''''



''

In [45]:
'''GET PREDICTIONS'''
predictions_lg = lg.predict(X_test)
predictions_svm = svm.predict(X_test)
predicitons_rf = rf.predict(X_test)

In [46]:
'''PRINT CLASSIFICATION PERFORMANCE REPORT'''
print('Logistic Regression')
print(classification_report(y_test,predictions_lg))
print('-----------')
print('SVM')
print(classification_report(y_test,predictions_svm))
print('-----------')
print('RF')
print(classification_report(y_test,predicitons_rf))

Logistic Regression
              precision    recall  f1-score   support

           0       0.65      0.78      0.71       116
           1       0.40      0.09      0.14        23
           2       0.94      0.96      0.95       411
           3       1.00      0.33      0.50        21

    accuracy                           0.87       571
   macro avg       0.75      0.54      0.58       571
weighted avg       0.86      0.87      0.85       571

-----------
SVM
              precision    recall  f1-score   support

           0       0.67      0.99      0.80       116
           1       0.00      0.00      0.00        23
           2       1.00      0.95      0.97       411
           3       0.90      0.43      0.58        21

    accuracy                           0.90       571
   macro avg       0.64      0.59      0.59       571
weighted avg       0.89      0.90      0.88       571

-----------
RF
              precision    recall  f1-score   support

           0       0.74 

  'precision', 'predicted', average, warn_for)


### DATASET 4 - bach_choral_set_dataset

In [20]:
le = LabelEncoder()
df = pd.read_csv("bach_choral_set_dataset.csv")
df = df.sample(frac=1)
y = df.loc[:,'chord_label']
df = df.loc[:,df.columns != 'chord_label']
y = le.fit_transform(y)

In [21]:
df.head()

Unnamed: 0,choral_ID,event_number,pitch_1,pitch_2,pitch_3,pitch_4,pitch_5,pitch_6,pitch_7,pitch_8,pitch_9,pitch_10,pitch_11,pitch_12,bass,meter
4766,014505b_,7,NO,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,YES,B,3
4451,014007b_,122,NO,NO,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,B,2
4251,013705ch,57,NO,NO,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,B,4
1561,002406bs,35,NO,NO,YES,NO,NO,YES,NO,NO,NO,NO,YES,NO,Bb,3
4462,014007b_,133,NO,NO,NO,YES,NO,NO,NO,YES,NO,NO,YES,NO,G,5


In [22]:
le = LabelEncoder()
cols = df.columns
df = pd.DataFrame(df.values)
df.columns = cols
df = df.replace({'YES':1,'NO':0}) # encode YES = 1, NO = 0
transf_bass = pd.DataFrame(le.fit_transform(df.loc[:,'bass']),columns = ['bass'])
df.loc[:,'bass'] = transf_bass
df.drop(columns = ['choral_ID','event_number'],inplace=True) # drop insignificant features
df.head()

Unnamed: 0,pitch_1,pitch_2,pitch_3,pitch_4,pitch_5,pitch_6,pitch_7,pitch_8,pitch_9,pitch_10,pitch_11,pitch_12,bass,meter
0,0,0,0,1,0,0,1,0,0,0,0,1,3,3
1,0,0,1,0,0,0,0,1,0,0,0,1,3,2
2,0,0,1,0,0,0,0,1,0,0,0,1,3,4
3,0,0,1,0,0,1,0,0,0,0,1,0,4,3
4,0,0,0,1,0,0,0,1,0,0,1,0,14,5


In [24]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.33) # split to train and test dataset

In [25]:
'''Logistic Regression model, Suport Vector Machine, Random Forest'''
lg = LogisticRegression(solver='newton-cg',tol=1e-2,max_iter=100)
svm = SVC(kernel='rbf')
rf = RandomForestClassifier(n_estimators=40,criterion='entropy',max_depth=8,min_samples_split=3,max_features=10)

In [26]:
'''FIT MODELS'''
#print(cross_val_score(lg,X_train,y_train))
lg.fit(X_train,y_train)
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
''''''



''

In [27]:
'''GET PREDICTIONS'''
predictions_lg = lg.predict(X_test)
predictions_svm = svm.predict(X_test)
predicitons_rf = rf.predict(X_test)

In [28]:
'''PRINT CLASSIFICATION PERFORMANCE REPORT'''
print('Logistic Regression')
print(classification_report(y_test,predictions_lg))
print('-----------')
print('SVM')
print(classification_report(y_test,predictions_svm))
print('-----------')
print('RF')
print(classification_report(y_test,predicitons_rf))

Logistic Regression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       0.67      0.82      0.74       112
           3       0.33      0.20      0.25         5
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00        18
           6       0.00      0.00      0.00         1
           7       0.66      0.79      0.72        87
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00         3
          11       0.91      0.83      0.87        12
          13       0.00      0.00      0.00         1
          14       0.73      0.79      0.76        48
          15       0.00      0.00      0.00         1
          16       0.67      0.21      0.32        19
          17       0.00      0.00      0.00         5
       

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### DATASET 5 - gender_voice_dataset

In [41]:
df = pd.read_csv("gender_voice_dataset.csv")
df = df.sample(frac=1)
y = df.loc[:,'label']
df = df.loc[:,df.columns != 'label']
y.replace({'male':0,'female':1},inplace=True)

In [42]:
df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
3026,0.2178,0.045799,0.224264,0.204146,0.242403,0.038257,2.175078,7.668994,0.868693,0.256959,0.232674,0.2178,0.223417,0.013021,0.277778,0.517782,0.214844,0.825195,0.610352,0.435556
644,0.091436,0.077062,0.070372,0.023457,0.153963,0.130505,2.354569,9.180173,0.956468,0.731009,0.01359,0.091436,0.161466,0.016529,0.246154,0.776278,0.007812,6.125,6.117188,0.121208
1125,0.177917,0.062669,0.160372,0.138266,0.244458,0.106192,1.694293,5.452466,0.913984,0.478171,0.157771,0.177917,0.142627,0.047198,0.27907,1.612926,0.023438,9.0,8.976562,0.140925
2594,0.238831,0.03006,0.243616,0.220942,0.261017,0.040075,1.918328,6.120897,0.814822,0.118336,0.261544,0.238831,0.187539,0.047619,0.277457,1.326923,0.023438,9.773438,9.75,0.137525
1093,0.195164,0.05883,0.212555,0.135401,0.248832,0.113431,1.16896,3.405238,0.900403,0.300151,0.240146,0.195164,0.130253,0.048048,0.27907,1.386541,0.023438,8.25,8.226562,0.150372


In [40]:
df.describe() # print statistics

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
count,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0
mean,0.180907,0.057126,0.185621,0.140456,0.224765,0.084309,3.140168,36.568461,0.895127,0.408216,0.165282,0.180907,0.142807,0.036802,0.258842,0.829211,0.052647,5.047277,4.99463,0.173752
std,0.029918,0.016652,0.03636,0.04868,0.023639,0.042783,4.240529,134.928661,0.04498,0.177521,0.077203,0.029918,0.032304,0.01922,0.030077,0.525205,0.063299,3.521157,3.520039,0.119454
min,0.039363,0.018363,0.010975,0.000229,0.042946,0.014558,0.141735,2.068455,0.738651,0.036876,0.0,0.039363,0.055565,0.009775,0.103093,0.007812,0.004883,0.007812,0.0,0.0
25%,0.163662,0.041954,0.169593,0.111087,0.208747,0.04256,1.649569,5.669547,0.861811,0.258041,0.118016,0.163662,0.116998,0.018223,0.253968,0.419828,0.007812,2.070312,2.044922,0.099766
50%,0.184838,0.059155,0.190032,0.140286,0.225684,0.09428,2.197101,8.318463,0.901767,0.396335,0.186599,0.184838,0.140519,0.04611,0.271186,0.765795,0.023438,4.992188,4.945312,0.139357
75%,0.199146,0.06702,0.210618,0.175939,0.24366,0.114175,2.931694,13.648905,0.928713,0.533676,0.221104,0.199146,0.169581,0.047904,0.277457,1.177166,0.070312,7.007812,6.992188,0.209183
max,0.251124,0.115273,0.261224,0.247347,0.273469,0.252225,34.725453,1309.612887,0.981997,0.842936,0.28,0.251124,0.237636,0.204082,0.279114,2.957682,0.458984,21.867188,21.84375,0.932374


In [43]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.33) # split to train and test dataset

In [48]:
'''Logistic Regression model, Suport Vector Machine, Random Forest'''
lg = LogisticRegression(solver='newton-cg',tol=1e-2,max_iter=100)
svm = SVC(kernel='rbf')
rf = RandomForestClassifier(n_estimators=20,criterion='entropy',max_depth=8,min_samples_split=3,max_features=5)

In [49]:
'''FIT MODELS'''
#print(cross_val_score(lg,X_train,y_train))
lg.fit(X_train,y_train)
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
''''''



''

In [50]:
'''GET PREDICTIONS'''
predictions_lg = lg.predict(X_test)
predictions_svm = svm.predict(X_test)
predicitons_rf = rf.predict(X_test)

In [51]:
'''PRINT CLASSIFICATION PERFORMANCE REPORT'''
print('Logistic Regression')
print(classification_report(y_test,predictions_lg))
print('-----------')
print('SVM')
print(classification_report(y_test,predictions_svm))
print('-----------')
print('RF')
print(classification_report(y_test,predicitons_rf))

Logistic Regression
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       537
           1       0.94      0.85      0.89       509

    accuracy                           0.90      1046
   macro avg       0.90      0.90      0.90      1046
weighted avg       0.90      0.90      0.90      1046

-----------
SVM
              precision    recall  f1-score   support

           0       0.72      0.75      0.74       537
           1       0.73      0.70      0.71       509

    accuracy                           0.73      1046
   macro avg       0.73      0.72      0.73      1046
weighted avg       0.73      0.73      0.73      1046

-----------
RF
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       537
           1       0.97      0.99      0.98       509

    accuracy                           0.98      1046
   macro avg       0.98      0.98      0.98      1046
weighted avg       0.98

### DATASET 6 - mushroom_dataset

In [59]:
le = LabelEncoder()
df = pd.read_csv("mushroom_dataset.csv")
df = df.sample(frac=1)
y = df.loc[:,'mushroom']
df = df.loc[:,df.columns != 'mushroom']
y = le.fit_transform(y)

In [60]:
df.describe(include=object)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8416,8416,8416,8416,8416,8416,8416,8416,8416,8416,...,8416,8416,8416,8416,8416,8416,8416,8416,8416,8416
unique,6,4,10,2,9,2,2,2,12,2,...,4,9,9,1,4,3,5,9,6,7
top,CONVEX,SCALY,BROWN,NO,NONE,FREE,CLOSE,BROAD,BUFF,TAPERING,...,SMOOTH,WHITE,WHITE,PARTIAL,WHITE,ONE,PENDANT,WHITE,SEVERAL,WOODS
freq,3796,3268,2320,5040,3808,8200,6824,5880,1728,4864,...,5076,4744,4640,8416,8216,7768,3968,2424,4064,3160


In [61]:
df = pd.get_dummies(df) # one hot encoding of categorical variables

In [63]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.33) # split to train and test dataset

In [64]:
'''Logistic Regression model, Suport Vector Machine, Random Forest'''
lg = LogisticRegression(solver='newton-cg',tol=1e-2,max_iter=100)
svm = SVC(kernel='rbf')
rf = RandomForestClassifier(n_estimators=40,criterion='entropy',max_depth=7,min_samples_split=3,max_features=80)

In [65]:
'''FIT MODELS'''
#print(cross_val_score(lg,X_train,y_train))
lg.fit(X_train,y_train)
svm.fit(X_train,y_train)
rf.fit(X_train,y_train)
''''''



''

In [66]:
'''GET PREDICTIONS'''
predictions_lg = lg.predict(X_test)
predictions_svm = svm.predict(X_test)
predicitons_rf = rf.predict(X_test)

In [67]:
'''PRINT CLASSIFICATION PERFORMANCE REPORT'''
print('Logistic Regression')
print(classification_report(y_test,predictions_lg))
print('-----------')
print('SVM')
print(classification_report(y_test,predictions_svm))
print('-----------')
print('RF')
print(classification_report(y_test,predicitons_rf))

Logistic Regression
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1460
           1       1.00      1.00      1.00      1318

    accuracy                           1.00      2778
   macro avg       1.00      1.00      1.00      2778
weighted avg       1.00      1.00      1.00      2778

-----------
SVM
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1460
           1       1.00      1.00      1.00      1318

    accuracy                           1.00      2778
   macro avg       1.00      1.00      1.00      2778
weighted avg       1.00      1.00      1.00      2778

-----------
RF
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1460
           1       1.00      1.00      1.00      1318

    accuracy                           1.00      2778
   macro avg       1.00      1.00      1.00      2778
weighted avg       1.00

### DATASET 7 - vehicle_sillouete_dataset

In [70]:
df = pd.read_csv("vehicle_silhouette.csv")
df = df.sample(frac=1)
df.drop(columns = ['match_date'], inplace = True) # insignificant feature

In [71]:
df.head()

Unnamed: 0,home_country,away_country,home_score,away_score,match_type,match_city,match_country,home_team_result
10956,Russia,Finland,2,2,UEFA Euro qualification,Moscow,Soviet Union,Draw
18655,San Marino,Finland,0,2,UEFA Euro qualification,Serravalle,San Marino,Loss
33862,Peru,Venezuela,2,1,FIFA World Cup qualification,Lima,Peru,Win
37163,Kenya,Guinea-Bissau,0,1,African Cup of Nations qualification,Nairobi,Kenya,Loss
11823,Japan,United Arab Emirates,3,2,Merdeka Tournament,Kuala Lumpur,Malaysia,Win
