# Working with unbalanced datasets

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import classification_report



In [4]:
def calculate_percentage(subset, total):
    return round(100.0 * subset / total, 2)

In [5]:
changesets = pd.read_csv('../data/unbalanced-datasets/changesets.csv')
changesets = changesets.drop_duplicates('ID')
changesets.head(2)

Unnamed: 0,ID,user,editor,Powerfull Editor,comment,source,imagery used,date,reasons,reasons__name,create,modify,delete,bbox,is suspect,harmful,checked,check_user__username,check date
0,47491144,RichRico,JOSM/1.5 (11639 en),True,Adding junction nodes or bridges to overlappin...,Bing,Not reported,2017-04-05T22:46:26+00:00,,,0.0,1.0,0.0,"SRID=4326;POLYGON ((-46.8202964 -23.693203, -4...",False,False,True,calfarome,2017-04-05T23:30:53.776282+00:00
1,47490912,Birgitta_fi,rosemary v0.4.4,False,Modified via wheelmap.org,Not reported,Not reported,2017-04-05T22:27:45+00:00,23.0,Major name modification,5.0,41.0,0.0,"SRID=4326;POLYGON ((24.9182827 60.1779368, 24....",False,True,True,yurasi,2017-04-05T23:33:08.813791+00:00


In [6]:
good_changesets = changesets[changesets['harmful'] == False]
print('Number of good changesets: {}'.format(good_changesets.shape[0]))

harmful_changesets = changesets[changesets['harmful'] == True]
print('Number of harmful changesets: {}'.format(harmful_changesets.shape[0]))

Number of good changesets: 53556
Number of harmful changesets: 5691


In [7]:
def get_features(changesets):
    """Extract required features for training the model."""
    columns = ['create', 'modify', 'delete', 'harmful']
    features = changesets[columns]
    return features.dropna()

In [8]:
# Prepare all features and actual labels for testing all models.
all_features = get_features(changesets)
all_X = all_features.drop('harmful', axis=1)
all_y = all_features['harmful']

In [9]:
def get_scaler(features):    
    """Scale features using RobustScaler which is suitable for data with outliers."""
    X = features.drop('harmful', axis=1)
    return preprocessing.RobustScaler().fit(X)

In [10]:
def get_model(X, y):
    """Return a trained model."""
    model = SVC(kernel='rbf')
    model.fit(X, y)
    return model

In [11]:
def print_performance(y_true, y_pred):
    print(classification_report(y_true, y_pred, labels=[True, False], target_names=['problematic', 'not problematic']))

## Experiment 1: Using all changesets

In [12]:
features = get_features(changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

model = get_model(scaler.transform(X), y)
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.98      0.03      0.06      5684
not problematic       0.91      1.00      0.95     53455

    avg / total       0.91      0.91      0.87     59139



## Experiment 2: Undersample good changesets

In [13]:
undersampled_good_changesets = good_changesets.sample(harmful_changesets.shape[0])
undersampled_changesets = pd.concat([undersampled_good_changesets, harmful_changesets])
undersampled_changesets.shape

(11382, 19)

In [14]:
features = get_features(undersampled_changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

model = get_model(scaler.transform(X), y)
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.18      0.83      0.30      5684
not problematic       0.97      0.60      0.74     53455

    avg / total       0.90      0.62      0.70     59139



## Experiment 3: Undersample good changesets by 4x

In [15]:
undersampled_good_changesets = good_changesets.sample(harmful_changesets.shape[0] * 4)
undersampled_changesets = pd.concat([undersampled_good_changesets, harmful_changesets])
undersampled_changesets.shape

(28455, 19)

In [16]:
features = get_features(undersampled_changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

model = get_model(scaler.transform(X), y)
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.86      0.05      0.10      5684
not problematic       0.91      1.00      0.95     53455

    avg / total       0.90      0.91      0.87     59139



## Experiment 4: Oversample harmful changesets by 4x

In [17]:
# Oversampling harmful changesets by 4 times
overampled_harmful_changesets = pd.concat([harmful_changesets] * 4)
overampled_changesets = pd.concat([good_changesets, overampled_harmful_changesets])
overampled_changesets.shape

(76320, 19)

In [18]:
features = get_features(overampled_changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

model = get_model(scaler.transform(X), y)
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.74      0.09      0.15      5684
not problematic       0.91      1.00      0.95     53455

    avg / total       0.89      0.91      0.88     59139



## Experiment 5: Let's try Decision Trees
> Decision trees often perform well on imbalanced datasets.

In [19]:
features = get_features(changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

# Training a decision tree.
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(X, y)

# Get predictions from decision tree model.
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.06      0.01      0.01      5684
not problematic       0.90      0.99      0.94     53455

    avg / total       0.82      0.89      0.85     59139



## Experiment 6: Set class_weight='balanced'

> In SVC, if data for classification are unbalanced (e.g. many positive and few negative), set class_weight='balanced' and/or try different penalty parameters C.

In [20]:
features = get_features(changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

model = SVC(kernel='rbf', class_weight='balanced')
model.fit(scaler.transform(X), y)
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.18      0.81      0.29      5684
not problematic       0.97      0.61      0.75     53455

    avg / total       0.89      0.63      0.70     59139



## Experiment 7: Try different penalty parameters C

In [21]:
features = get_features(changesets)
scaler = get_scaler(features)

X = features.drop('harmful', axis=1)
y = features['harmful']

# The default penalty is `1`, so trying a penalty of `2`.
model = SVC(kernel='rbf', class_weight='balanced', C=2)
model.fit(scaler.transform(X), y)
y_model = model.predict(scaler.transform(all_X))

print_performance(all_y, y_model)

                 precision    recall  f1-score   support

    problematic       0.18      0.81      0.29      5684
not problematic       0.97      0.60      0.74     53455

    avg / total       0.89      0.62      0.70     59139

