In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the movie dataset (row 0 of the CSV supplies the column names) and
# preview the first five records.
df = pd.read_csv("Movie_classification.csv", header=0)
df.head(n=5)

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,Trailer_views,3D_available,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection,Start_Tech_Oscar
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,527367,YES,109.6,223.84,Thriller,23,494,48000,1
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,494055,NO,146.64,243.456,Drama,42,462,43200,0
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,547051,NO,147.88,2022.4,Comedy,38,458,69400,1
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,516279,YES,185.36,225.344,Drama,45,472,66800,1
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,531448,NO,176.48,225.792,Drama,55,395,72400,1


## Missing Value Imputation

In [3]:
# Replace missing Time_taken values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df["Time_taken"]: that chained in-place call
# acts on an intermediate Series, emits a FutureWarning in pandas 2.2 and
# leaves df untouched once Copy-on-Write is active.
# NOTE(review): the mean is taken over all rows, before the train/test split
# further down, so test rows contribute to the imputed value.
df["Time_taken"] = df["Time_taken"].fillna(df["Time_taken"].mean())

## Dummy Variable Creation

In [4]:
# Expand the two categorical columns into indicator columns; drop_first=True
# leaves out the first level of each variable.
df = pd.get_dummies(
    data=df,
    columns=["3D_available", "Genre"],
    drop_first=True,
)

## X-y split

In [5]:
# Target: the Start_Tech_Oscar column.
y = df["Start_Tech_Oscar"]

# Features: every column whose name is not the target's.
x = df.loc[:, df.columns != "Start_Tech_Oscar"]

## Test-Train Split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Standardizing Data

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to the test split.
sc = StandardScaler()
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)

## Train SVM

In [8]:

from sklearn  import svm 

# Linear-kernel support vector classifier with C=0.01, fitted on the
# standardized training data. The fit call stays as the cell's last
# expression so the estimator repr is displayed.
clf_svm_l = svm.SVC(C=0.01, kernel="linear")
clf_svm_l.fit(X=x_train_std, y=y_train)

SVC(C=0.01, kernel='linear')

## Predict values using trained model

In [9]:
# Predict labels for the training split first, then the test split, using the
# fitted linear SVM.
y_train_pred, y_test_pred = (
    clf_svm_l.predict(split) for split in (x_train_std, x_test_std)
)

## Model Performance 

In [10]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [11]:
# Confusion matrix on the held-out split (true labels vs. predicted labels).
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[11, 33],
       [ 5, 53]], dtype=int64)

In [12]:
# Fraction of test rows the linear SVM labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.6274509803921569

## Grid Search 

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
# Candidate values of C for the linear-kernel search, from 0.001 up to 1000.
params = dict(
    C=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000),
)

In [15]:
# Unfitted linear-kernel SVC that serves as the base estimator for the grid
# search. NOTE: this rebinds clf_svm_l, so the C=0.01 model fitted earlier in
# the notebook is no longer reachable under this name.
clf_svm_l = svm.SVC(kernel="linear")

In [16]:
# 10-fold cross-validated search over the C candidates, scored by accuracy and
# run on all available cores.
svm_grid_lin = GridSearchCV(
    estimator=clf_svm_l,
    param_grid=params,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [17]:
# Run the linear-kernel grid search on the standardized training data.
svm_grid_lin.fit(X=x_train_std, y=y_train)

Fitting 10 folds for each of 13 candidates, totalling 130 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:   43.7s finished


GridSearchCV(cv=10, estimator=SVC(kernel='linear'), n_jobs=-1,
             param_grid={'C': (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50,
                               100, 500, 1000)},
             scoring='accuracy', verbose=1)

In [18]:
# Keep the estimator the linear grid search selected as best (available
# because the search was built with GridSearchCV's refit left at its default).
linsvm_clf = svm_grid_lin.best_estimator_

In [19]:
# Test-set accuracy of the linear SVM chosen by the grid search.
accuracy_score(
    y_true=y_test,
    y_pred=linsvm_clf.predict(x_test_std),
)

0.5980392156862745

## Polynomial

In [21]:
# Polynomial-kernel SVC (degree 2, C=0.1) fitted on the standardized training
# data.
# NOTE(review): the variable name ends in "p3" while degree=2 is passed -
# confirm which degree was intended.
clf_svm_p3 = svm.SVC(C=0.1, kernel="poly", degree=2)
clf_svm_p3.fit(X=x_train_std, y=y_train)

SVC(C=0.1, degree=2, kernel='poly')

In [22]:
# Predictions of the polynomial SVM: training split first, then test split.
y_train_pred, y_test_pred = [
    clf_svm_p3.predict(split) for split in (x_train_std, x_test_std)
]

In [23]:
# Fraction of test rows the polynomial SVM labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.5588235294117647

In [24]:
# Support-vector counts of the fitted polynomial SVC, one entry per class
# (per scikit-learn's SVC documentation).
clf_svm_p3.n_support_

array([185, 194])

## Radial

In [25]:
# RBF-kernel SVC (gamma=0.5, C=10) fitted on the standardized training data.
clf_svm_r = svm.SVC(C=10, kernel="rbf", gamma=0.5)
clf_svm_r.fit(X=x_train_std, y=y_train)

SVC(C=10, gamma=0.5)

In [26]:
# Predictions of the RBF SVM: training split first, then test split.
y_train_pred, y_test_pred = map(clf_svm_r.predict, (x_train_std, x_test_std))

In [27]:
# Fraction of test rows the RBF SVM labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.6176470588235294

In [28]:
# Support-vector counts of the fitted RBF SVC, one entry per class
# (per scikit-learn's SVC documentation).
clf_svm_r.n_support_

array([186, 218])

## Radial Grid

In [39]:
# Search grid for the RBF kernel: 8 values of C crossed with 5 values of gamma.
# The gamma values keep their original listing order (0.1 precedes 0.05) so
# the candidate enumeration order is unchanged.
params = dict(
    C=(0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50),
    gamma=(0.001, 0.01, 0.1, 0.05, 1),
)

In [40]:
# Unfitted RBF-kernel SVC that serves as the base estimator for the grid
# search. NOTE: this rebinds clf_svm_r, so the gamma=0.5 / C=10 model fitted
# earlier in the notebook is no longer reachable under this name.
clf_svm_r = svm.SVC(kernel="rbf")

In [41]:
# 3-fold cross-validated search over the C x gamma grid, scored by accuracy
# and run on all available cores.
svm_grid_rad = GridSearchCV(
    estimator=clf_svm_r,
    param_grid=params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [42]:
# Run the RBF-kernel grid search on the standardized training data.
svm_grid_rad.fit(X=x_train_std, y=y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    0.3s finished


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': (0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50),
                         'gamma': (0.001, 0.01, 0.1, 0.05, 1)},
             scoring='accuracy', verbose=1)

In [43]:
# Keep the estimator the RBF grid search selected as best.
# NOTE(review): "radscm" looks like a typo for "radsvm" - the name is left
# unchanged because the next cell refers to it.
radscm_clf = svm_grid_rad.best_estimator_

In [44]:
# Test-set accuracy of the RBF SVM chosen by the grid search.
accuracy_score(
    y_true=y_test,
    y_pred=radscm_clf.predict(x_test_std),
)

0.6176470588235294