# Imports

In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.utils import check_X_y
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_curve, average_precision_score, classification_report, make_scorer, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale, MinMaxScaler, Normalizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc



# Loading Data

In [3]:
data = pd.read_csv("C:\\Users\\user\\Documents\\remote_sensing.csv", sep=";")
data.head()

X,y = data.drop("class", axis=1), data["class"]

# Modelling Configuration 

In [4]:
cv = StratifiedKFold(random_state=0)

In [5]:
def f1_multi(y_true, y_pred): 
    return f1_score(y_true, y_pred, average='macro')

f1_multi_scorer = make_scorer(f1_multi)

# Linear Model - Logistic Regression 

In [6]:
lr = LogisticRegression(multi_class='ovr')
lr.fit(X,y)

lr_score = cross_val_score(lr,X,y,cv=cv)
lr_score.mean()

## f1

lr_score_f1 = cross_val_score(lr,X,y,cv=cv,scoring=f1_multi_scorer)

print(lr_score.mean())
print(lr_score_f1.mean())

0.562694461904
0.285750043949


# Decision Tree

In [7]:
dc = DecisionTreeClassifier()
dc_score = cross_val_score(dc,X,y,cv=cv)
dc_score.mean()

## f1

dc_score_f1 = cross_val_score(dc,X,y,cv=cv,scoring=f1_multi_scorer)
dc_score_f1.mean()

print(dc_score.mean())
print(dc_score_f1.mean())

0.476961160164
0.258925448937


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Dummy classifier - Baseline (most_frequent)

A machine learning algorithm tries to learn a function that models the relationship between the input data and the target variable. In order to measure performance a baseline model is used to compare any other machine learning algorithm against. In this specific classification case, the dummy classifier strategy used is “most_frequent”: always predicts the most frequent label in the training set.

In [8]:
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X,y)
dummy_score = cross_val_score(dummy,X,y,cv=cv).mean()
dummy_score 

#f1

dummy_score_f1 = cross_val_score(dummy,X,y,cv=cv,scoring=f1_multi_scorer)
dummy_score_f1.mean()


print(dummy_score.mean())
print(dummy_score_f1.mean())

0.449240337624
0.0774956295029


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Before performing any type of oversampling, both the logistic regression and the decision tree classifiers perform slightly better than the baseline. 

# Random Forest Classifier 

In [10]:
forest = RandomForestClassifier(random_state=0)
forest.fit(X, y)
forest_score = cross_val_score(forest,X,y,cv=cv).mean()

### f1 score

In [None]:
forest_score_f1 = cross_val_score(forest,X,y,cv=cv,scoring=f1_multi_scorer).mean()

print(forest_score.mean())
print(forest_score_f1.mean())

Similarly, the random forest classifier also performs slightly better than the baseline classifier. 

# Oversampling Methods

Since this is clearly an imbalanced problem, 

## Random Oversampling 

In [10]:
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(X, y)

0.935299373398
0.649470137873
0.796004377911
0.929807796168
0.646707577224
0.794689407376


### Accuracy 

In [14]:
dc_score_random = cross_val_score(dc,X_resampled,y_resampled,cv=cv)

lr.fit(X_resampled,y_resampled)
lr_score_random = cross_val_score(lr,X_resampled,y_resampled,cv=cv)

forest.fit(X_resampled, y_resampled)
forest_score_random = cross_val_score(forest,X_resampled,y_resampled,cv=cv).mean()

print(dc_score_random.mean())
print(lr_score_random.mean())
print(forest_score_random.mean())


0.933662843775
0.649470137873
0.796004377911


### f1 score

In [15]:
dc_score_random_f1 = cross_val_score(dc,X_resampled,y_resampled,cv=cv,scoring=f1_multi_scorer)

lr_score_random_f1 = cross_val_score(lr,X_resampled,y_resampled,cv=cv,scoring=f1_multi_scorer)

forest_score_random_f1 = cross_val_score(forest,X_resampled,y_resampled,cv=cv,scoring=f1_multi_scorer).mean()


print(dc_score_random_f1.mean())
print(lr_score_random_f1.mean())
print(forest_score_random_f1.mean())


0.931072435564
0.646707577224
0.794689407376


After performing random oversampling on the data set, the results improved.

## SMOTE

In [18]:
sm = SMOTE(ratio = 'auto',k_neighbors=1)
X_res, y_res = sm.fit_sample(X, y)

#accuracy

dc_score_smote = cross_val_score(dc,X_res,y_res,cv=cv)

lr.fit(X_res,y_res)
lr_score_smote = cross_val_score(lr,X_res,y_res,cv=cv)

forest.fit(X_res, y_res)
forest_score_smote = cross_val_score(forest,X_res,y_res,cv=cv).mean()

## f1

dc_score_smote_f1 = cross_val_score(dc,X_res,y_res,cv=cv,scoring=f1_multi_scorer)

lr.fit(X_res,y_res)
lr_score_smote_f1 = cross_val_score(lr,X_res,y_res,cv=cv,scoring=f1_multi_scorer)

forest.fit(X_res, y_res)
forest_score_smote_f1 = cross_val_score(forest,X_res,y_res,cv=cv,scoring=f1_multi_scorer).mean()

param_grid = {
    "max_depth": [3,5,7,10,15],
    "max_features": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}
gscv = GridSearchCV(estimator=forest, param_grid=param_grid, cv=cv, scoring='accuracy')
gscv.fit(X_res, y_res)
gscv.best_score_

print(dc_score_smote.mean())
print(lr_score_smote.mean())
print(forest_score_smote.mean())

print(dc_score_smote_f1.mean())
print(lr_score_smote_f1.mean())
print(forest_score_smote_f1.mean())




0.838251999627
0.704360664156
0.89752497588
0.836034184929
0.701353870956
0.895492006521


To work with the SMOTE algorithm, it's important to first set the k_neighbors to 1 because of the class that has only 4 observations. Otherwise, the algorithm would not work. 

# Classification Report

## Normalization

In [16]:
pca = PCA()
normalizer = Normalizer()
scaler = MinMaxScaler()

steps = [('scaler', scaler),
         ('pca', pca),
         ('dt', dc),]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33, random_state=42)

pipeline.fit(X_train, y_train)
y_prediction = pipeline.predict(X_test)
report = classification_report(y_test, y_prediction)
print(report)

             precision    recall  f1-score   support

          A       0.82      0.90      0.86       251
          B       0.73      0.73      0.73       251
          C       0.62      0.55      0.58       252
          D       0.71      0.68      0.70       256
          E       0.80      0.76      0.78       264
          F       0.92      0.97      0.94       248
          G       0.96      0.99      0.98       257
          H       0.98      1.00      0.99       231

avg / total       0.82      0.82      0.82      2010



## No Normalization

In [47]:
pca = PCA()
steps = [('pca', pca),
        ('dt', dc)]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.33, random_state=42)

pipeline.fit(X_train, y_train)
y_prediction = pipeline.predict(X_test)
report1 = classification_report(y_test, y_prediction)
print(report1)

             precision    recall  f1-score   support

          A       0.80      0.84      0.82       251
          B       0.72      0.71      0.71       251
          C       0.58      0.53      0.55       252
          D       0.69      0.67      0.68       256
          E       0.75      0.73      0.74       264
          F       0.89      0.97      0.93       248
          G       0.96      1.00      0.98       257
          H       1.00      1.00      1.00       231

avg / total       0.80      0.80      0.80      2010



## One vs Rest Classifier 

This strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only n_classes classifiers are needed).

**Advantage: Interpretability.** *Since each class is represented by one and one classifier only, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy for multiclass classification and is a fair default choice.*

In [26]:
y_ovr = OneVsRestClassifier(dc).fit(X_train, y_train).predict(X_test)
report2 = classification_report(y_test, y_ovr)
print(report2)

             precision    recall  f1-score   support

          A       0.94      0.78      0.86       251
          B       0.88      0.58      0.70       251
          C       0.71      0.49      0.58       252
          D       0.72      0.66      0.69       256
          E       0.78      0.73      0.75       264
          F       0.94      0.94      0.94       248
          G       0.97      0.99      0.98       257
          H       0.49      1.00      0.66       231

avg / total       0.81      0.77      0.77      2010



ROC curves are typically used in binary classification to study the output of a classifier. In order to extend ROC curve and ROC area to multi-class or multi-label classification, it is necessary to binarize the output. One ROC curve can be drawn per label, but one can also draw a ROC curve by considering each element of the label indicator matrix as a binary prediction (micro-averaging).

Another evaluation measure for multi-class classification is macro-averaging, which gives equal weight to the classification of each label.

Binarize labels in a one-vs-all fashion

Several regression and binary classification algorithms are available in the scikit. 
A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme.

0.91475032851511173