In [1]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# matter (for example deprecation notices) -- confirm that is intended.
warnings.filterwarnings('ignore')

# Pandas display limits used when frames are rendered in this notebook.
for _option, _value in (
    ('display.max_rows', 50),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

In [2]:
# Read the three CSV splits into pandas DataFrames.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_final = pd.read_csv('data/test_final.csv')

# For each split: features are every column except 'target',
# labels are the 'target' column itself.
X_train, y_train = train.drop('target', axis=1), train['target']
X_test, y_test = test.drop('target', axis=1), test['target']
X_test_final, y_test_final = test_final.drop('target', axis=1), test_final['target']


In [3]:
# Rescale the features with a min-max scaler.
# The scaler is fitted on the training split only and the same fitted scaler
# is then applied to both test splits, so no test-set statistics are used
# during preprocessing.

from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler().fit(X_train)
X_train_mmscaled = min_max_scaler.transform(X_train)
X_test_mmscaled = min_max_scaler.transform(X_test)
X_test_final_mmscaled = min_max_scaler.transform(X_test_final)

In [11]:
# Genre names and the integer labels passed to the class-weight computation.
# NOTE(review): presumably class_names[i] is the genre for label classes[i] --
# confirm against the dataset documentation.
class_names = ['Dance-Pop','Rap-Pop','Folk-Pop', 'Electro-Pop','Rock-Pop', 'Indie-Pop','EDM-Pop']
classes = [1, 2, 3, 4, 5, 6, 7]

from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights are computed from the label frequencies in y_train.
# compute_class_weight documents `classes` as an ndarray, and scikit-learn
# releases that validate parameter types reject a plain Python list, so the
# label list is converted explicitly before the call.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.asarray(classes),
                                     y=y_train)

# Map each integer label to its weight; this dict is what the estimators
# below receive through their `class_weight` argument.
cls_wt = dict(zip(classes, class_weights))

{1: 0.7974005244555923,
 2: 0.7592270950933565,
 3: 1.0500713159672697,
 4: 1.2655387677553604,
 5: 1.081908886998221,
 6: 1.4802116402116403,
 7: 0.9204448246364414}

In [12]:
from sklearn.linear_model import LogisticRegression

# Baseline multinomial logistic regression.
# C is the inverse regularisation strength, so C=1e9 makes the L2 penalty
# negligible and the model is effectively unregularised.
# class_weight must be the `cls_wt` dict itself: the original code passed the
# string literal 'cls_wt', which is not a valid value (scikit-learn accepts a
# dict, 'balanced' or None), so the intended per-class weights were not applied.
logreg_base = LogisticRegression(C=1e9,
                                 solver='lbfgs',
                                 multi_class='multinomial',
                                 max_iter=1000,
                                 penalty='l2',
                                 class_weight=cls_wt)
logreg_base.fit(X_train_mmscaled, y_train)

# Predictions on the scaled train and test splits, scored in a later cell.
pred_logreg_base_train = logreg_base.predict(X_train_mmscaled)
pred_logreg_base_test = logreg_base.predict(X_test_mmscaled)


In [13]:
# Print metrics
from sklearn.metrics import accuracy_score

def print_metrics(labels, preds):
    """Print the accuracy of ``preds`` measured against ``labels``.

    Args:
        labels: Ground-truth class labels.
        preds: Predicted class labels, aligned element-wise with ``labels``.

    Returns:
        None. The score is written to standard output.
    """
    accuracy = accuracy_score(labels, preds)
    print("Accuracy Score: {}".format(accuracy))

# Report accuracy of the baseline model on the train split, then the test split.
for _heading, _labels, _preds in (
    ("Train score:", y_train, pred_logreg_base_train),
    ("Test score:", y_test, pred_logreg_base_test),
):
    print(_heading)
    print_metrics(_labels, _preds)


Train score:
Accuracy Score: 0.4984272233342865
Test score:
Accuracy Score: 0.5030025736345439


In [23]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search; penalty, solver, C and class_weight are
# overridden by each grid candidate.
logreg_GS = LogisticRegression(
    random_state=11,
    multi_class='multinomial',
    max_iter=1000,
    class_weight=cls_wt,
)

# Values explored for both penalty types. `cls_wt` holds weights computed once
# from the full y_train; 'balanced' lets the estimator derive them from the
# labels it is fitted on.
_shared_grid = {
    'C': [1e9, 1e5, 1e3, 1, 1e-2],
    'class_weight': [cls_wt, 'balanced'],
}

# Two sub-grids, each pairing a penalty with the solver chosen for it.
param_grid_logreg = [
    dict(penalty=['l1'], solver=['saga'], **_shared_grid),
    dict(penalty=['l2'], solver=['newton-cg'], **_shared_grid),
]

# 5-fold cross-validated search scored on accuracy; the best candidate is
# refitted on the whole training set.
gs_logreg = GridSearchCV(
    estimator=logreg_GS,
    param_grid=param_grid_logreg,
    scoring='accuracy',
    refit='accuracy',
    cv=5,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
)

# Run the search on the scaled training data.
gs_logreg.fit(X_train_mmscaled, y_train)

# Predictions from the refitted best estimator, scored in a later cell.
pred_gs_lr_train = gs_logreg.predict(X_train_mmscaled)
pred_gs_lr_test = gs_logreg.predict(X_test_mmscaled)

# Mean cross-validated accuracy of the best candidate.
print('Best accuracy: %.3f' % gs_logreg.best_score_)

# Hyper-parameters of the best candidate.
print('\nBest params:\n', gs_logreg.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished


Best accuracy: 0.493

Best params:
 {'C': 1000000000.0, 'class_weight': {1: 0.7974005244555923, 2: 0.7592270950933565, 3: 1.0500713159672697, 4: 1.2655387677553604, 5: 1.081908886998221, 6: 1.4802116402116403, 7: 0.9204448246364414}, 'penalty': 'l1', 'solver': 'saga'}


In [24]:
# Report accuracy of the grid-searched model on the train split, then the test split.
for _heading, _labels, _preds in (
    ("Train score:", y_train, pred_gs_lr_train),
    ("Test score:", y_test, pred_gs_lr_test),
):
    print(_heading)
    print_metrics(_labels, _preds)

Train score:
Accuracy Score: 0.4984272233342865
Test score:
Accuracy Score: 0.50328853302831
