In [2]:
import pandas as pd
import numpy as np

# Import data

In [3]:
df = pd.read_csv('./datasets/balance-scale.data.csv', names=['balance', 'var1', 'var2', 'var3', 'var4'])
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [4]:
df['balance'].value_counts()

R    288
L    288
B     49
Name: balance, dtype: int64

# Turn into a binary classification problem

In [309]:
df['balance'] = [1 if b=='B' else 0 for b in df.balance] # about 8% were balanced
df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

### Simulate how the model signals the data by removing majority data

In [295]:
df_majority = df[df['balance']==0]
df_minority = df[df['balance']==1]

In [296]:
df_400 = df_majority[0:400]

In [297]:
df_removed = pd.concat([df_400, df_minority])

In [298]:
df_removed['balance'].value_counts()

0    400
1     49
Name: balance, dtype: int64

# Try to model

In [299]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [310]:
y = df['balance'] # using df removed to simulate how the model signals the data
X = df.drop('balance', axis=1)

In [311]:
classifier = LogisticRegression()
classifier.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [312]:
prediction = classifier.predict(X)

In [313]:
predict_proba = classifier.predict_proba(X)
predict_proba[0:5]

array([[0.8596411 , 0.1403589 ],
       [0.86903045, 0.13096955],
       [0.87788092, 0.12211908],
       [0.88621162, 0.11378838],
       [0.89404261, 0.10595739]])

In [314]:
print("Overall accuracy: %.2f%%" % (accuracy_score(prediction, y)*100))

Overall accuracy: 92.16%


In [315]:
print(np.unique(prediction))

[0]


# Solve it!

In [41]:
from sklearn.utils import resample

In [57]:
df_majority = df[df['balance']==0]
df_minority = df[df['balance']==1]

### 1. Up-sample minority class

In [44]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=576,    # to match majority class
                                 random_state=123) # reproducible results

In [48]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled['balance'].value_counts()

1    576
0    576
Name: balance, dtype: int64

In [49]:
y = df_upsampled['balance']
X = df_upsampled.drop('balance', axis=1)

classifier_upsampled = LogisticRegression()
classifier_upsampled.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
prediction_upsampled = classifier_upsampled.predict(X)

In [53]:
print("Overall accuracy: %.2f%%" % (accuracy_score(prediction_upsampled, y)*100))

Overall accuracy: 51.39%


In [56]:
# the model is no longer predicting just one class, it's now meaningful as a performance metric
print(np.unique(prediction_upsampled))

[0 1]


### 2. Down-sample Majority class

In [58]:
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=49,     # to match minority class
                                   random_state=123) # reproducible results

In [60]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled['balance'].value_counts()

1    49
0    49
Name: balance, dtype: int64

In [61]:
y = df_downsampled['balance']
X = df_downsampled.drop('balance', axis=1)

classifier_downsampled = LogisticRegression()
classifier_downsampled.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [62]:
prediction_downsampled = classifier_downsampled.predict(X)

In [63]:
print("Overall accuracy: %.2f%%" % (accuracy_score(prediction_downsampled, y)*100))

Overall accuracy: 58.16%


In [64]:
# the model is no longer predicting just one class, it's now meaningful as a performance metric
print(np.unique(prediction_upsampled))

[0 1]


### 3. Change performance metric (ROC)

In [91]:
from sklearn.metrics import roc_auc_score

In [335]:
probability_roc = classifier.predict_proba(X)
probability_roc[:5]

array([[0.8596411 , 0.1403589 ],
       [0.86903045, 0.13096955],
       [0.87788092, 0.12211908],
       [0.88621162, 0.11378838],
       [0.89404261, 0.10595739]])

In [336]:
probability_roc_idx0 = [p[0] for p in probability_roc]
probability_roc_idx0[:5]

[0.85964109636236,
 0.8690304467248932,
 0.8778809209614689,
 0.8862116219550588,
 0.8940426127138545]

In [337]:
print("ROC AUC score: %.2f%%" % (roc_auc_score(y, probability_roc_idx0)*100))

ROC AUC score: 46.93%


In [338]:
probability_roc_idx1 = [p[1] for p in probability_clf]
probability_roc_idx1[:5]

[0.14035890363764,
 0.13096955327510687,
 0.12211907903853116,
 0.11378837804494128,
 0.10595738728614548]

In [339]:
print("ROC AUC score: %.2f%%" % (roc_auc_score(y, probability_roc_idx1)*100))

ROC AUC score: 53.07%


### 4. Using penalized algorithms

In [83]:
from sklearn.svm import SVC

In [84]:
y = df['balance']
X = df.drop('balance', axis=1)

In [85]:
classifier_penalized = SVC(kernel='linear',
                           class_weight='balanced',
                           probability=True)

In [86]:
classifier_penalized.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [87]:
prediction_penalized = classifier_penalized.predict(X)

In [88]:
print("Overall accuracy: %.2f%%" % (accuracy_score(prediction_penalized, y)*100))

Overall accuracy: 68.80%


In [97]:
# what about ROC score?
probability_penalized = classifier_penalized.predict_proba(X)
probability_penalized = [p[1] for p in probability_penalized]
print("ROC score: %.2f%%" % (roc_auc_score(y, probability_penalized)*100))

ROC score: 46.95%


In [89]:
# the model is no longer predicting just one class, it's now meaningful as a performance metric
print(np.unique(prediction_penalized))

[0 1]


<hr/>