In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE

In [37]:
# Both feature files are space-separated with no header row; the 'high'
# float parser is requested explicitly for them.
dat_reader_opts = dict(header=None, sep=' ', float_precision='high')

training_data = pd.read_csv('data/train.dat', **dat_reader_opts)
test_data = pd.read_csv('data/test.dat', **dat_reader_opts)

# The label file is read with default separator settings and no header row.
training_labels = pd.read_csv('data/train.labels', header=None)

In [38]:
# Keep only the columns at positions 832..879 of each frame.
# NOTE: this cell overwrites its own inputs, so running it a second time
# without re-loading the data slices an already-narrowed frame and yields
# no columns at all.
feature_positions = slice(832, 880)
training_data = training_data.loc[:, training_data.columns[feature_positions]]
test_data = test_data.loc[:, test_data.columns[feature_positions]]

In [39]:
# Preview the first five rows of the retained feature columns.
training_data.head(n=5)

Unnamed: 0,832,833,834,835,836,837,838,839,840,841,...,870,871,872,873,874,875,876,877,878,879
0,0.09083,0.11261,0.143387,0.18458,0.144097,0.096907,0.054609,0.17298,0.098722,0.108665,...,0.005524,0.004577,0.840041,0.087358,0.033696,0.011916,0.005287,0.002525,0.001105,0.018071
1,0.08,0.066667,0.084103,0.262564,0.227692,0.166154,0.068718,0.044103,0.086154,0.068718,...,0.013333,0.008205,0.933333,0.025641,0.011282,0.005128,0.005128,0.001026,0.0,0.018462
2,0.259933,0.374263,0.170725,0.085619,0.05819,0.028711,0.01333,0.009228,0.245578,0.301205,...,0.003589,0.004358,0.4586,0.200461,0.15355,0.097667,0.018457,0.011023,0.006921,0.05332
3,0.217831,0.074449,0.094669,0.112132,0.153493,0.058824,0.0625,0.226103,0.21875,0.07261,...,0.006434,0.007353,0.855699,0.073529,0.014706,0.017463,0.007353,0.0,0.000919,0.030331
4,0.130441,0.138817,0.121564,0.112543,0.092712,0.061211,0.072666,0.270046,0.128794,0.090922,...,0.002005,0.002219,0.788159,0.078179,0.045676,0.045604,0.011884,0.003866,0.001074,0.025558


In [40]:
def removing_imbalance(training_data, training_classes):
    """Oversample the training set with SMOTE and return the resampled data.

    Parameters
    ----------
    training_data : array-like
        Feature matrix; converted to a NumPy array before resampling.
    training_classes : array-like
        Class labels. A single-column 2-D input (for example a one-column
        DataFrame) is flattened to 1-D before being handed to the sampler.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` exactly as returned by ``SMOTE.fit_resample``.
    """
    # Fixed seed so the synthetic samples are reproducible between runs.
    # NOTE(review): k_neighbors=1 is presumably chosen so that classes with
    # very few samples can still be oversampled -- confirm.
    sampler = SMOTE(random_state=21, k_neighbors=1)

    features = np.asarray(training_data)
    labels = np.asarray(training_classes)
    if labels.ndim == 2 and labels.shape[1] == 1:
        # Samplers expect a 1-D target; a column vector would otherwise
        # rely on an implicit (and warned-about) conversion.
        labels = labels.ravel()

    # `fit_resample` is the supported API; `fit_sample` was a deprecated
    # alias that newer imbalanced-learn releases no longer provide.
    X_res, y_res = sampler.fit_resample(features, labels)

    return X_res, y_res

In [41]:
# Replace the loaded training set with its SMOTE-resampled version.
# NOTE(review): resampling happens before the train/hold-out split below, so
# synthetic rows interpolated from a sample can land on the other side of the
# split; hold-out scores are likely optimistic. Consider splitting first and
# resampling only the training portion.
training_data, training_labels = removing_imbalance(training_data, training_labels)

In [42]:
# Hold out 20% of the (resampled) rows for evaluation; shuffling is seeded
# so the partition is the same on every run.
split_options = {"test_size": 0.2, "shuffle": True, "random_state": 21}
X_train, X_test, y_train, y_test = train_test_split(
    training_data, training_labels, **split_options
)

In [43]:
# Stage 1: keep the 46 best-scoring features according to the chi2 score.
feature_selector = SelectKBest(chi2, k=46)

# Stage 2: extra-trees ensemble with a fixed seed and balanced class weights.
tree_ensemble = ExtraTreesClassifier(
    n_estimators=500,
    min_samples_split=4,
    random_state=21,
    class_weight="balanced",
)

pipeline = Pipeline(steps=[("DR", feature_selector), ('classify', tree_ensemble)])

In [44]:
# Train the selector + classifier on the 80% training split.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('DR',
                 SelectKBest(k=46,
                             score_func=<function chi2 at 0x0000023629127168>)),
                ('classify',
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight='balanced', criterion='gini',
                                      max_depth=None, max_features='auto',
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=4,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=500, n_jobs=None,
                                      oob_score=False, random_state=21,
                                      verbose=0, warm_start=False))],
         verbose=False)

In [45]:
# Score the fitted pipeline on the hold-out rows and print per-class metrics.
y_pred = pipeline.predict(X_test)
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

              precision    recall  f1-score   support

           1       0.82      0.84      0.83      1767
           2       0.83      0.81      0.82      1784
           3       0.98      0.98      0.98      1753
           4       1.00      1.00      1.00      1808
           5       1.00      1.00      1.00      1760
           6       1.00      1.00      1.00      1760
           7       1.00      1.00      1.00      1772
           8       0.99      0.99      0.99      1733
          10       1.00      1.00      1.00      1795
          11       1.00      1.00      1.00      1750

    accuracy                           0.96     17682
   macro avg       0.96      0.96      0.96     17682
weighted avg       0.96      0.96      0.96     17682



In [46]:
# Refit on every resampled row (training split + hold-out) before predicting
# the unlabeled test set.
# The combined arrays get their own names instead of overwriting
# X_train / y_train: re-running this cell therefore no longer stacks the
# hold-out rows onto the training data a second time.
X_full = np.vstack((X_train, X_test))
y_full = np.append(y_train, y_test)

# Fresh, unfitted pipeline with the same configuration as the evaluated one.
pipeline = Pipeline(steps=[
    ("DR", SelectKBest(chi2, k=46)),
    ('classify', ExtraTreesClassifier(
        n_estimators=500,
        min_samples_split=4,
        random_state=21,
        class_weight="balanced",
    )),
])

pipeline.fit(X_full, y_full)
y_pred = pipeline.predict(test_data)

In [47]:
# One row per prediction, labelled with a 1-based 'ImageID' index and a
# single 'Class' column.
image_ids = pd.RangeIndex(start=1, stop=len(y_pred) + 1, name='ImageID')
prediction_df = pd.DataFrame({'Class': y_pred}, index=image_ids)

In [48]:
# Write the predictions as CSV, including the ImageID index column.
prediction_df.to_csv(path_or_buf='prediction.dat', index=True)