In [1]:
# Common modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_curve
from sklearn.model_selection import cross_val_score, cross_validate
import scipy.sparse as sp
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# SVM specific modules
from sklearn import svm

In [2]:
# Paths are resolved relative to the directory the notebook is launched from:
# BASE_PATH is its parent, DATA_PATH the `data` folder underneath that parent.
BASE_PATH, DATA_PATH = (
    os.path.join(os.getcwd(), os.pardir),
    os.path.join(os.getcwd(), os.pardir, 'data'),
)

In [3]:
# Training frame: the imputed_train.json file stored under 01-milestone1.
train_df = pd.read_json(
    path_or_buf=os.path.join(BASE_PATH, '01-milestone1', 'imputed_train.json')
)
# Test frame: the zipped JSON read directly from the data directory.
test_df = pd.read_json(
    path_or_buf=os.path.join(DATA_PATH, 'test.json.zip')
)

In [4]:
# Precomputed sparse feature matrices, stored as .npz archives in DATA_PATH.
train_feats = sp.load_npz(
    file=os.path.join(DATA_PATH, 'training_feats.npz')
)
test_feats = sp.load_npz(
    file=os.path.join(DATA_PATH, 'test_feats.npz')
)

In [None]:
# Preview only the first rows: densifying the whole sparse matrix just to
# display it can exhaust memory. tocsr() guarantees row slicing works
# whatever sparse format the .npz file was saved in.
train_feats.tocsr()[:5].todense()

In [None]:
# Peek at the first five rows of the training frame.
train_df.head(n=5)

In [5]:
# Features: the precomputed sparse matrix is used as-is (the raw DataFrame
# columns are not used as model inputs).
X = train_feats

# Target: the raw interest-level labels from the training frame.
y = train_df.loc[:, 'interest_level']

In [19]:
# Test-set feature matrix, aliased under the X_* naming used for the training features.
X_test = test_feats

In [None]:
# Display the repr of the training feature matrix.
X

## Preprocessing

In [6]:
# Encode the string interest levels as integer class codes.
# LabelEncoder numbers classes in sorted order, so assuming the labels are
# 'high', 'low' and 'medium' the mapping is high -> 0, low -> 1, medium -> 2
# (le.classes_ holds the authoritative order).
le = preprocessing.LabelEncoder()
# Encode straight from the DataFrame column so that re-running this cell
# does not re-fit the encoder on already-encoded integers.
y = le.fit_transform(train_df['interest_level'])

In [12]:
# Feature selection: keep the 25 features with the highest mutual
# information with the target, fitting the selector and reducing the
# training matrix in one step.
kbest = SelectKBest(score_func=mutual_info_classif, k=25)
X_fseltd = kbest.fit_transform(X, y)

In [13]:
# Inspect the feature-selected training matrix (the recorded output reports 49308 x 25).
X_fseltd

<49308x25 sparse matrix of type '<class 'numpy.float64'>'
	with 661838 stored elements in Compressed Sparse Row format>

In [22]:
# Inspect the test feature matrix.
# NOTE(review): the recorded output shows 25 columns, i.e. it was captured
# after the kbest.transform cell further down had already been run; on a
# fresh top-to-bottom run this cell shows the matrix before feature selection.
X_test

<74659x25 sparse matrix of type '<class 'numpy.float64'>'
	with 1000434 stored elements in Compressed Sparse Row format>

In [21]:
# Apply the selector fitted on the training data to the test features.
# Transforming from test_feats (not from X_test itself) keeps this cell
# idempotent: re-running it no longer feeds an already-reduced matrix
# back into the selector.
X_test = kbest.transform(test_feats)

In [None]:
# Scale feature values (currently disabled).
# NOTE(review): if this is re-enabled, scaling the training and test matrices
# independently puts them on different scales; presumably a single scaler
# fitted on the training features should be applied to both -- confirm.
# NOTE(review): the inputs here are sparse; preprocessing.scale is documented
# to need with_mean=False for sparse input -- verify before enabling.
# X_fseltd = preprocessing.scale(X_fseltd)
# X_test = preprocessing.scale(X_test)

## Cross-validation

Instead of a single train-test split, we evaluate the model with 5-fold cross-validation, scored by log loss.

In [23]:
# Linear-kernel SVM with probability estimates enabled, which predict_proba
# and the neg_log_loss scorer rely on.
# random_state pins the internal shuffling scikit-learn performs when
# fitting the probability model, so repeated runs give the same results.
model = svm.SVC(kernel='linear', C=3e2, probability=True, random_state=42)

In [None]:
# 5-fold cross-validation on the selected features, scored by negative log loss.
# cross_validate returns a dict containing 'test_score'; cross_val_score
# returns a bare array, so indexing it with 'test_score' would raise.
scores = cross_validate(model, X_fseltd, y, cv=5, scoring='neg_log_loss')
# Flip the sign so the values are ordinary (positive) log losses.
test_scores = -1 * scores['test_score']
print(test_scores.mean().round(4))

In [None]:
# Fit on the feature-selected training matrix: it has the same 25 columns as
# the cross-validation input and as the kbest-transformed test features, so
# predict_proba on the test set sees a matching feature space.
model.fit(X_fseltd, y)

In [None]:
# Class-probability estimates for the test features: one row per sample,
# one column per class in the order given by model.classes_.
y_pred = model.predict_proba(X_test)

## Raw accuracy

In [None]:
# y_pred holds per-class probabilities, while accuracy_score compares hard
# labels, so first pick the most probable class for every sample.
# NOTE(review): y_test is not defined in any cell shown here -- confirm
# where the held-out labels come from before relying on this metric.
y_pred_labels = model.classes_[y_pred.argmax(axis=1)]
acc = accuracy_score(y_test, y_pred_labels)

In [None]:
# Display the accuracy computed in the previous cell.
acc

## Area Under the Curve

In [None]:
# One-vs-rest ROC for the class encoded as 2.
# roc_curve needs a 1-D score per sample, so take the probability column
# belonging to the positive class instead of the full probability matrix.
# NOTE(review): with LabelEncoder's sorted class order, code 2 is presumably
# 'medium' rather than 'high' -- confirm this is the intended positive class.
# NOTE(review): y_test is not defined in any cell shown here.
pos_label = 2
pos_col = list(model.classes_).index(pos_label)
fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, pos_col], pos_label=pos_label)

In [None]:
# Plot the ROC curve; the title names the model actually trained in this
# notebook (a linear SVM, not a decision tree).
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set(
    title="ROC Curve for Linear SVM (w/o parameter tuning)",
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
)
plt.show()

In [None]:
# Area under the ROC curve defined by the (fpr, tpr) points computed above.
auc_score = auc(fpr, tpr)
# Display the resulting score.
auc_score