In [1]:
# Common modules
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_curve
from sklearn.model_selection import cross_val_score, cross_validate
import scipy.sparse as sp
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# SVM specific modules
from sklearn import svm

In [2]:
BASE_PATH = os.path.join(os.getcwd(), os.pardir)
DATA_PATH = os.path.join(BASE_PATH, 'data')

In [3]:
train_df = pd.read_json(os.path.join(BASE_PATH, '01-milestone1', 'imputed_train.json'))
test_df = pd.read_json(os.path.join(DATA_PATH, 'test.json.zip'))

In [4]:
train_feats = sp.load_npz(os.path.join(DATA_PATH, 'training_feats.npz'))

test_feats = sp.load_npz(os.path.join(DATA_PATH, 'test_feats.npz'))

In [None]:
train_df.head()

In [5]:
# X = train_df.drop(columns=['interest_level'])
y = train_df['interest_level']

X = train_feats

In [6]:
X_test = test_feats

In [7]:
X

<49308x35522 sparse matrix of type '<class 'numpy.float64'>'
	with 2626763 stored elements in Compressed Sparse Row format>

## Preprocessing

In [8]:
# Convert labels from {low, medium high} -> {0, 1, 2}
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [9]:
#feature selection
kbest = SelectKBest(mutual_info_classif, k=20).fit(X,y)
X_fseltd = kbest.transform(X) 

In [10]:
X_fseltd

<49308x20 sparse matrix of type '<class 'numpy.float64'>'
	with 611587 stored elements in Compressed Sparse Row format>

In [11]:
X_test = kbest.transform(X_test)

In [12]:
X_test

<74659x20 sparse matrix of type '<class 'numpy.float64'>'
	with 924809 stored elements in Compressed Sparse Row format>

In [40]:
kbest.get_support(indices=True)

array([    0,     1,     2,     3,     4,     5,     6,     9,    11,
       12283, 35214, 35242, 35258, 35269, 35271, 35316, 35318, 35322,
       35352, 35371])

In [None]:
#Scale feature values
# X_fseltd = preprocessing.scale(X_fseltd)
# X_test = preprocessing.scale(X_test)

## Train-test split

We will do 5-fold cross-validation

In [13]:
model = svm.SVC(probability=True)

In [14]:
scores = cross_val_score(model, X_fseltd, y, cv=5, scoring='neg_log_loss')



IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [54]:
scores
test_scores = scores * -1
print(test_scores.mean())

0.7525721742573035


In [26]:
model.fit(X_fseltd, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [27]:
y_pred = model.predict_proba(X_test)

In [55]:
y_pred2 = model.predict(X_test)

In [None]:
acc = accuracy_score(train_df['interest_levels'], y_pred)

In [31]:
def create_submission_csv(y_pred, X_test_indexes):
    df = pd.DataFrame(y_pred, columns=le.classes_)
    df.index = X_test_indexes
    df.index.name = 'listing_id'
    return df

In [47]:
X_test_indexes = test_df['listing_id']

In [44]:
output = create_submission_csv(y_pred, X_test_indexes)
output.to_csv('svm_predictions_1.csv')