In [26]:
import pandas as pd
import numpy as np
 
# Read dataset
df = pd.read_csv('balance-scale.data', 
                 names=['balance', 'var1', 'var2', 'var3', 'var4'])
 
# Display example observations
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [27]:
df['balance'].value_counts()

L    288
R    288
B     49
Name: balance, dtype: int64

In [28]:
# Transform into binary classification
df['balance'] = [1 if b=='B' else 0 for b in df.balance]
 
df['balance'].value_counts()
# 0    576
# 1     49
# Name: balance, dtype: int64
# About 8% were balanced

0    576
1     49
Name: balance, dtype: int64

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [29]:
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
 
# Train model
clf_0 = LogisticRegression(solver = 'lbfgs').fit(X, y)
 
# Predict on training set
pred_y_0 = clf_0.predict(X)

In [30]:
print( accuracy_score(pred_y_0, y) )

0.9216


In [31]:
print( np.unique( pred_y_0 ) )

[0]


In [17]:
from sklearn.utils import resample

In [33]:
n0 = (df.balance == 0).sum()
n1 = (df.balance == 1).sum()
n0, n1

(576, 49)

In [35]:
df_maj = df[df.balance==0]
df_min = df[df.balance==1]
len(df_maj) > len(df_min)

True

In [93]:
df_min_upsamp = resample(df_min, replace=True, n_samples = int(0.2*n0), random_state=123)
df_maj_downsamp = resample(df_maj, replace=False, n_samples = int(0.2*n0), random_state=123)

In [94]:
df_rebal_1 = pd.concat([df_maj_downsamp, df_min_upsamp])

In [95]:
df_rebal_1.balance.value_counts()

1    115
0    115
Name: balance, dtype: int64

In [96]:
# Separate input features (X) and target variable (y)
y_1 = df_rebal_1.balance
X_1 = df_rebal_1.drop('balance', axis=1)
 
# Train model
clf_1 = LogisticRegression(solver = 'lbfgs').fit(X_1, y_1)
 
# Predict on training set
pred_y_1 = clf_1.predict(X_1)

In [97]:
np.unique( pred_y_1 )

array([0, 1], dtype=int64)

In [98]:
accuracy_score(y_1, pred_y_1)

0.5478260869565217

In [63]:
df_min_upsamp = resample(df_min, replace=True, n_samples = n0, random_state=123)

In [64]:
from sklearn.metrics import roc_auc_score

In [65]:
roc_auc_score(y_1, [p[1] for p in clf_1.predict_proba(X_1)])

0.5439

In [66]:
roc_auc_score(y, [p[1] for p in clf_0.predict_proba(X)])

0.5306122448979591

In [99]:
from sklearn.svm import SVC

In [104]:
clf_2 = SVC(kernel='rbf', 
            class_weight='balanced', # penalize
            probability=True,
            gamma='scale')

In [111]:
clf_2.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [112]:
pred_y_2 = clf_2.predict(X)
np.unique(pred_y_2)

array([0, 1], dtype=int64)

In [113]:
accuracy_score(y,pred_y_2)

0.7344

In [116]:
roc_auc_score(y, [p[1] for p in clf_2.predict_proba(X)])

0.9795918367346939

In [117]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
clf_3 = RandomForestClassifier(n_estimators=10)
clf_3.fit(X, y)
pred_y_3 = clf_3.predict(X)
np.unique(pred_y_3)

array([0, 1], dtype=int64)

In [126]:
accuracy_score(y, pred_y_3)

0.9792

In [127]:
clf_3.predict_proba(X)

array([[0.3, 0.7],
       [0.7, 0.3],
       [0.9, 0.1],
       ...,
       [1. , 0. ],
       [0.8, 0.2],
       [0.6, 0.4]])

In [128]:
roc_auc_score(y, [p[1] for p in clf_3.predict_proba(X)])

0.9986359126984127

---

In [1]:
from collections import Counter

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=categories)

X_train = newsgroups_train.data
X_test = newsgroups_test.data

y_train = newsgroups_train.target
y_test = newsgroups_test.target

print('Training class distributions summary: {}'.format(Counter(y_train)))
print('Test class distributions summary: {}'.format(Counter(y_test)))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Training class distributions summary: Counter({2: 593, 1: 584, 0: 480, 3: 377})
Test class distributions summary: Counter({2: 394, 1: 389, 0: 319, 3: 251})


In [5]:
y_train

array([1, 3, 2, ..., 1, 0, 1], dtype=int64)

In [6]:
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.67      0.94      0.86      0.79      0.90      0.82       319
          1       0.96      0.92      0.99      0.94      0.95      0.90       389
          2       0.87      0.98      0.94      0.92      0.96      0.92       394
          3       0.97      0.36      1.00      0.52      0.60      0.33       251

avg / total       0.87      0.84      0.94      0.82      0.88      0.78      1353



In [7]:
pipe = make_pipeline_imb(TfidfVectorizer(),
                         RandomUnderSampler(),
                         MultinomialNB())

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.70      0.90      0.88      0.79      0.89      0.80       319
          1       0.98      0.84      0.99      0.90      0.91      0.82       389
          2       0.94      0.90      0.98      0.92      0.94      0.88       394
          3       0.80      0.73      0.96      0.76      0.84      0.68       251

avg / total       0.87      0.85      0.96      0.86      0.90      0.81      1353

