In [1]:
# Standard imports
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import joblib

In [2]:
train = pd.read_pickle('data/train_df.pkl')
test = pd.read_pickle('data/test_df.pkl')

In [4]:
cols = list(train.columns.values)
genre_cols = cols[-6:]
print(len(genre_cols))
print(genre_cols)

6
['Abstract', "Children's", 'Family', 'Strategy', 'Thematic', 'Wargames']


In [5]:
X_train = train[train.columns[~train.columns.isin(genre_cols)]]
y_train = train[train.columns[ train.columns.isin(genre_cols)]]

X_test = test[test.columns[~test.columns.isin(genre_cols)]]
y_test = test[test.columns[ test.columns.isin(genre_cols)]]

In [8]:
from sklearn.preprocessing import StandardScaler
my_standard_scaler = StandardScaler().fit(X_train)
X_train_s = my_standard_scaler.transform(X_train)
X_test_s = my_standard_scaler.transform(X_test)

In [10]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [11]:
from sklearn.model_selection import cross_val_score
my_log_model = OneVsRestClassifier(LogisticRegression(random_state=123, solver='lbfgs', max_iter=3000, C=0.01, n_jobs=-1), n_jobs=-1)

scores = cross_val_score(my_log_model, X_train_s, y_train, cv = 5)
print(scores)

for i in range(len(scores)) :
    print(f"Fold {i+1}: {scores[i]}")
print(f"Average Score:{np.mean(scores)}")

[0.65335753 0.65263158 0.63798112 0.65395788 0.65686275]
Fold 1: 0.6533575317604355
Fold 2: 0.6526315789473685
Fold 3: 0.6379811183732752
Fold 4: 0.6539578794480755
Fold 5: 0.6568627450980392
Average Score:0.6509581707254387


In [13]:
my_log_model = OneVsRestClassifier(LogisticRegression(random_state=123, solver='lbfgs', max_iter=3000, C=0.01, n_jobs=-1), n_jobs=-1).fit(X_train_s, y_train)

In [14]:
y_train_pred = my_log_model.predict(X_train_s)
y_train_proba = my_log_model.predict_proba(X_train_s)
y_test_pred = my_log_model.predict(X_test_s)
y_test_proba = my_log_model.predict_proba(X_test_s)

In [15]:
from sklearn.metrics import accuracy_score
print(f'Training score: {accuracy_score(y_train, y_train_pred):0.5f}')
print(f'    Test score: {accuracy_score(y_test, y_test_pred):0.5f}')

Training score: 0.99332
    Test score: 0.64496


In [16]:
y_pred_df = pd.DataFrame(y_test_pred, columns=genre_cols)

# Test set predictions
for g in genre_cols:
    score = accuracy_score(y_test[g], y_pred_df[g])
    print(f'{score:0.4f}  {g}')

0.9072  Abstract
0.9203  Children's
0.8647  Family
0.9090  Strategy
0.9386  Thematic
0.9591  Wargames
