In [176]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [177]:
DATA_PATH = "../data/"
DATASET_FILE_NAME = "dataset.csv"

In [178]:
df = pd.read_csv(DATA_PATH + DATASET_FILE_NAME)

In [179]:
df["cancelled"].value_counts()

0    99550
1    60692
Name: cancelled, dtype: int64

In [180]:
df.head()

Unnamed: 0,default,cancelled,pid_CC0001,pid_CC0002,pid_IS0001,pid_IS0002
0,0,0,0,0,0,1
1,0,1,0,0,1,0
2,0,1,0,1,0,0
3,1,1,0,0,1,0
4,1,1,0,0,1,0


In [181]:
X = df.drop(columns="cancelled").to_numpy()
y = df.cancelled.to_numpy()

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [183]:
params = {
    'C':[0.00001, 0.0001, 0.001, 0.01],
    'class_weight': ["balanced", None],
    'n_jobs': [-1],
}
clf = GridSearchCV(LogisticRegression(), params)
clf.fit(X_train, y_train)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01],
                         'class_weight': ['balanced', None], 'n_jobs': [-1]})

In [184]:
clf.best_estimator_

LogisticRegression(C=1e-05, class_weight='balanced', n_jobs=-1)

In [185]:
clf = LogisticRegression(C=1e-05, class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(C=1e-05, class_weight='balanced')

In [117]:
#clf = LogisticRegression(random_state=0, fit_intercept=False, class_weight="balanced")
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)

LinearSVC(random_state=0)

In [186]:
y_pred = clf.predict(X_test)

In [187]:
accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7068720201692419, 0.6788112955263206, 0.5921011497446941)

In [188]:
#clf.predict_proba(X[:2, :])
#clf.score(X, y)

In [189]:
from collections import Counter
#Counter(y_pred)

In [190]:
[round(elem, 4) for elem in list(clf.coef_[0])]

[0.101, -0.0374, -0.012, 0.0132, 0.0362]

In [191]:
df.columns

Index(['default', 'cancelled', 'pid_CC0001', 'pid_CC0002', 'pid_IS0001',
       'pid_IS0002'],
      dtype='object')

In [192]:
from collections import Counter
Counter(y_test)

Counter({1: 15074, 0: 24987})

In [193]:
24987 / (24987 + 15074)

0.6237238211727116

0.5468033009884828

In [128]:
from random import random

In [130]:
random() < .5

True

In [194]:
f1_score(y_test, [1 if random() > .5 else 0 for _ in range(y_test.shape[0])])

0.4330869812393406

In [195]:
f1_score(y_test, [1 for _ in range(y_test.shape[0])])

0.5468033009884828

In [196]:
f1_score(y_test, y_pred)

0.5921011497446941