In [1]:
import numpy as np
import cv2
from skimage.color import rgb2hsv, hsv2rgb
from skimage import feature
from matplotlib import pyplot as plt
import scipy.misc
from skimage.io import imread
import os
import fnmatch
from time import time

Load the CSV file into panda dataframe. The column names description:

**Genre** : Genre for the given File

**imdbId**: Image file name (File names are the imdbID for the given movie)

**Feature1**: Label assigned to image file with given Genre

In [2]:
import pandas as pd
df = pd.read_csv("./../Dataset/Genres3Labels.csv",delimiter=",").fillna("-NA-")
df.head()

Unnamed: 0,Genre,imdbId,Feature1
0,Comedy,113101,0
1,Drama,114117,1
2,Drama,110299,1
3,Comedy,115683,0
4,Drama,114753,1


Get the columns of the dataframe. This contains the columns of data contained in .csv file

In [3]:
columns = list(df.columns.values)
print(columns)

['Genre', 'imdbId', 'Feature1']


Get the count of each Genre category in the dataframe

In [4]:
df.Genre.value_counts()

Drama          4205
Comedy         2408
Documentary    1559
Name: Genre, dtype: int64

Filter the unique Genre present in data and their count.

In [5]:
uniqueGenre = df.Feature1.unique()
print(uniqueGenre.shape)

(3,)


Compute the sift featurs

In [6]:
def getSIFTFeatures(image):
    sift = cv2.xfeatures2d.SIFT_create()
    keypoints, descriptors = sift.detectAndCompute(image, None)
    return keypoints, descriptors

In [7]:
def getBOWTrain():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    bow_train = cv2.BOWKMeansTrainer(100)
    detect = cv2.xfeatures2d.SIFT_create()
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename,0)
        keypoints, descriptors = getSIFTFeatures(original_image)
        bow_train.add(descriptors)
    return bow_train 

In [8]:
bow_train = getBOWTrain()

In [9]:
def loadBOWFeatures(bow_train):
    """

    :param bow_train: This contains the SIFT features descriptors added to BOWKMeansTrainer
    :return: feature descriptors and labels
    """
    flann_params = dict(algorithm = 1, trees = 5)
    matcher = cv2.FlannBasedMatcher(flann_params, dict(checks=50))
    voc = bow_train.cluster()
    extract = cv2.xfeatures2d.SIFT_create()
    detect = cv2.xfeatures2d.SIFT_create()
    bow_extract = cv2.BOWImgDescriptorExtractor( extract, cv2.BFMatcher(cv2.NORM_L2) )
    bow_extract.setVocabulary( voc )
    imageDir = "./../Dataset/MovieGenreFullPosters/"
    inputData = []
    labels = []
    for index, row in df.iterrows():
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image =cv2.imread(imageDir+filename,0)
        bowFeatures = bow_extract.compute(original_image, detect.detect(original_image))
        inputData.extend(bowFeatures)
        labels.append(label)
    return inputData,labels

In [None]:
t0 = time()
BOWFeatures,BOWLabels = loadBOWFeatures(bow_train)
test_time = time() - t0
print("Feature Extraction time:  %0.3fs" % test_time)
BOWFeatures,BOWLabels = np.asarray(BOWFeatures),np.asarray(BOWLabels)
print (BOWFeatures.shape , BOWLabels.shape)

Divide the dataset into training data and testing data using sklearn train_test_split. The following is the definition of:

**X_train**: This is a numpy array of feature vectors of the image for train the classifier.

**X_test**:This is numpy array of lables of the images for training the classifier.

**y_train**: This is a numpy array of feature vectors of the image for testing the classifier.

**y_test** This is numpy array of lables of the images for testing the classifier.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(BOWFeatures, BOWLabels, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = np.asarray(X_train), np.asarray(X_test), np.asarray(y_train), np.asarray(y_test)
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [13]:
from __future__ import print_function
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression,chi2,SelectPercentile,SelectFpr
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn import metrics
from time import time

Train RandomForestClassifier classifier and perform cross validation using GridSearchCV and feature selection using recursive cross validation. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel='linear'),scoring='accuracy',cv=StratifiedKFold(),step=1)),
    ('clf', RandomForestClassifier()),
])
parameters = {
    'clf__n_estimators': [10,20,50,100],
    'clf__max_features': ['auto', 'sqrt', 'log2'],
    'clf__random_state':[300,400,700]
}
print("Performing Grid Search to tune the hyper parameters of the model")
RandomForestModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing Pipeline Steps:", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
RandomForestModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time() - t0))
print()

print("Best score {}".format(RandomForestModel.best_score_))
print("Best parameters set:")
best_parameters = RandomForestModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = RandomForestModel.predict(X_test)
print("Time to test the classifier is {}".format(time() - t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Performing grid search...
pipeline: ['rfe', 'clf']
parameters:
{'clf__max_features': ['auto', 'sqrt', 'log2'],
 'clf__n_estimators': [10, 20, 50, 100],
 'clf__random_state': [300, 400, 700]}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   59.9s finished


done in 62.238s

Best score: 0.391
Best parameters set:
	clf__max_features: 'auto'
	clf__n_estimators: 50
	clf__random_state: 700
Calculated Accuracy is 0.360888888889
Precision Score is 0.361624873602
Recall Score is 0.360888888889
F1 Score is 0.36038379701


Train AdaBoostClassifier. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [1]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
print("Training the classifier: ")
print(clf)
t0 = time()
clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
t0 = time()
pred = clf.predict(X_test)
print("Time to test the model is {}".format(time() - t0))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Train AdaBoostClassifier  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [18]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
     ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', AdaBoostClassifier()),
])
parameters = {
    'clf__n_estimators': [50,70,90],
    'clf__random_state': [1,20,40]
}
AdaBoostModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
AdaBoostModel.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print("")

print("Best reported score is {}".format(AdaBoostModel.best_score_))
print("Best parameters set are: "))
best_parameters = AdaBoostModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("{}: {} ".format(param_name, best_parameters[param_name]))

predicted = AdaBoostModel.predict(X_test)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))

Performing grid search...
pipeline: ['rfe', 'clf']
parameters:
{'clf__n_estimators': [50, 70, 90], 'clf__random_state': [1, 20, 40]}
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   12.4s finished


done in 14.724s

Best score: 0.408
Best parameters set:
	clf__n_estimators: 90
	clf__random_state: 1
confusion matrix:
[[149  99 127]
 [113 132 163]
 [ 81 111 150]]
Calculated Accuracy is 0.382222222222
Precision Score is 0.388026640958
Recall Score is 0.382222222222
F1 Score is 0.382603326644


Train MultiLayer Perceptron by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [19]:
from sklearn.neural_network import MLPClassifier
print("Training the classifier...")
t0 = time()
param_grid = {'hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
              'activation' :['logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
                'alpha':[ 0.0001, 0.001, 0.01]}
clf = GridSearchCV(MLPClassifier(), param_grid)
print(clf)
X_train = SelectKBest(chi2, k=2).fit_transform(X_train, y_train)
X_test = SelectKBest(chi2, k=2).fit_transform(X_test, y_test)
clf.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training: 
GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0001, 0.001, 0.01], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'hidden_layer_sizes': [(70, 20, 10), (40, 20, 10), (90, 20, 10)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
train time: 168.683s
test time:  0.002s
confusion matrix:
[[127 106 142]
 [ 71 174 163]
 [ 56 132 154]]
Calculated Accuracy is 0.404444444444
Precision S

Train MultiLayer Perceptron by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', MLPClassifier()),
])
parameters = {
    'clf__hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver':['lbfgs', 'sgd', 'adam'],
    'clf__alpha':[ 0.0001, 0.001, 0.001],
    'clf__learning_rate':['constant', 'invscaling', 'adaptive']
}
print("Performing Grid Search to tune the hyper parameters of the model")

MLPModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline :", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
MLPModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time()-t0))
print()

print("Best score:{}".format( MLPModel.best_score_))
print("Best parameters set:")
best_parameters = MLPModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = MLPModel.predict(X_test)
print("Time to test the classifier is {}".format(time()-t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Performing grid search...
pipeline: ['rfe', 'clf']
parameters:
{'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
 'clf__alpha': [0.0001, 0.001, 0.001],
 'clf__hidden_layer_sizes': [(70, 20, 10), (40, 20, 10), (90, 20, 10)],
 'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
 'clf__solver': ['lbfgs', 'sgd', 'adam']}
Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed:  8.7min finished


done in 526.412s

Best score: 0.428
Best parameters set:
	clf__activation: 'relu'
	clf__alpha: 0.0001
	clf__hidden_layer_sizes: (90, 20, 10)
	clf__learning_rate: 'adaptive'
	clf__solver: 'lbfgs'
Calculated Accuracy is 0.386666666667
Precision Score is 0.413962243581
Recall Score is 0.386666666667
F1 Score is 0.386451447169


Predict Scores using VotingClassifier. The classifiers used are LogisticRegression(),KNeighborsClassifier() and  MLPClassifier().  Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [21]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
clf1 = LogisticRegression()
clf2 = KNeighborsClassifier()
clf3 = MLPClassifier()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3)], voting='hard')
print("Training the classifier...")
print(eclf1)
t0 = time()
eclf1 = eclf1.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = eclf1.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
print("The confusion matrix: {}".format(metrics.confusion_matrix(y_test, pred)))
print("Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training: 
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', KNeighb...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)
train time: 0.650s
test time:  0.022s
confusion matrix:
[[ 77 100 198]
 [ 46 169 193]
 [ 43 123 176]]
Calculated Accuracy is 0.375111111111
Precision Score is 0.405335531089
Recall Score is 0.375111111111
F1 Score is 0.365833252279


Train linear SVM  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [23]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', SVC()),
])
parameters = {
    'clf__kernel':('linear','rbf','sigmoid','poly'),
    'clf__C': (0.001,0.0001,0.01)
}
SVC_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score:{}".format( SVC_clf.best_score)_)
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Performing grid search...
pipeline: ['rfe', 'clf']
parameters:
{'clf__C': (0.001, 0.0001, 0.01),
 'clf__kernel': ('linear', 'rbf', 'sigmoid', 'poly')}
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    9.6s finished


done in 10.955s

Best score: 0.343
Best parameters set:
	clf__C: 0.001
	clf__kernel: 'linear'
Calculated Accuracy is 0.304
Precision Score is 0.092416
Recall Score is 0.304
F1 Score is 0.141742331288
confusion matrix:
[[ 77 100 198]
 [ 46 169 193]
 [ 43 123 176]]


Train SVC by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.svm import SVC
parameters = {
    'C': (0.001,0.0001,0.01,1)
}
print("Performing Grid Search to tune the hyper parameters of the model")
SVC_clf = GridSearchCV(SVC(kernel="linear"), parameters, n_jobs=-1, verbose=1)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print(" ")
print("Best score: %0.3f" % SVC_clf.best_score_)
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))