In [244]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [245]:
# Read the raw dataset from a CSV file located in the working directory.
csv_path = 'epi_r.csv'
raw_data = pd.read_csv(csv_path)

In [246]:
# Summary statistics of the target column, to see how the ratings are distributed.
raw_data['rating'].describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [247]:
# Convert the label ('rating') from a regression target into a binary class
# using the mean rating as the threshold: 1 if rating >= mean, else 0.
#
# The mask is computed once from the original values and assigned in a single
# step. Overwriting the column in two passes (first the 1s, then "everything
# still > 1" as 0s) leaves any original rating in (0, 1] unchanged, so a
# below-mean rating of exactly 1.0 would end up labelled as positive.

rating_mean = raw_data.rating.mean()

# astype(float) keeps the column's float dtype (labels are 1.0 / 0.0).
raw_data['rating'] = (raw_data.rating >= rating_mean).astype(float)

In [248]:
# Drop the 'title' column: it is text data and is not used as a feature or label.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second run no longer raises KeyError for the missing column.

raw_data = raw_data.drop(columns=['title'], errors='ignore')

In [249]:
# Replace every missing value (NaN) in the frame with 0.

raw_data = raw_data.fillna(value=0)

In [250]:
# Separate the frame into the label column (y) and all remaining columns (X).

label_column = 'rating'
y = raw_data[label_column]
X = raw_data.drop(columns=[label_column])

In [251]:
from sklearn.model_selection import train_test_split

# Hold out one seventh of the rows for testing, i.e. roughly an 86/14
# train/test split (not 80/20). random_state pins the shuffle so the split is
# reproducible across runs.
TEST_FRACTION = 1 / 7.0

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=0
)

In [252]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, so that nothing
# about the held-out rows leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Standardise both partitions with the training-set statistics.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [253]:
from sklearn.decomposition import PCA

# Project the scaled features onto 30 principal components.
# random_state pins the randomized SVD solver that PCA may select for inputs
# of this size, so the components are reproducible across runs.
pca = PCA(n_components=30, random_state=0)

# fit_transform already fits the model; a separate pca.fit() beforehand only
# repeated the same decomposition.
X_train_pca = pca.fit_transform(X_train)

# Project the held-out rows with the components learned from the training set
# (transform only, never fit) so they can be scored in the same space.
X_test_pca = pca.transform(X_test)

print("original shape:   ", X_train.shape)
print("transformed shape:", X_train_pca.shape)

original shape:    (17187, 678)
transformed shape: (17187, 30)


In [254]:
# Inspect the projected training matrix (one row per sample, one column per component).
print(X_train_pca)

[[ 2.19464719 -0.42740388 -2.11578795 ... -0.4748697  -1.036702
   0.56278038]
 [ 2.2829558  -1.4099328  -1.13260134 ...  0.45742277  0.14786213
   0.07513771]
 [ 1.7511687  -2.32851716 -0.80927018 ...  0.43053751  0.54695509
   0.29107284]
 ...
 [-2.67276838  4.20281644 -3.44871182 ...  0.66715836 -0.01300972
   0.34498622]
 [-2.44972997 -1.37441485  0.34976843 ...  1.59980523 -1.53900513
   0.13233931]
 [-3.84256852 -0.98549957  1.07071244 ...  0.39185179  0.17281273
   0.22217437]]


In [261]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# NOTE: despite the name `svr`, this is a support vector *classifier* (SVC);
# the name is kept so any other cell that refers to it keeps working.
svr = SVC(gamma='auto')
svr.fit(X_train_pca, y_train)

# Predictions on the training rows themselves; reused by the cells below.
predicted_labels = svr.predict(X_train_pca)

# Accuracy on the training data. Previously svr.score(...) was evaluated in the
# middle of the cell and its result silently discarded. It is derived here from
# the predictions already computed, which avoids a second full predict pass.
train_accuracy = np.mean(predicted_labels == y_train.values)
print("training accuracy:", train_accuracy)

# 5-fold cross-validated score of the same estimator on the training data.
cross_val_score(svr, X_train_pca, y_train, cv=5)

array([0.80948226, 0.80628272, 0.80331685, 0.80331685, 0.80826302])

In [262]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the training-set predictions: rows are the true classes,
# columns the predicted classes, so false and true positives can be read off.
train_confusion = confusion_matrix(y_true=y_train, y_pred=predicted_labels)
train_confusion

array([[  838,  2742],
       [   89, 13518]])

In [263]:
# Class balance of the true training labels.

y_train.value_counts()

1.0    13607
0.0     3580
Name: rating, dtype: int64

In [264]:
# Class balance of the labels the model predicted for the training rows.

predicted_counts = pd.Series(predicted_labels).value_counts()
predicted_counts

1.0    16260
0.0      927
dtype: int64