In [17]:
import numpy as np
import cv2
from skimage.color import rgb2hsv, hsv2rgb
from skimage import feature
from matplotlib import pyplot as plt
import scipy.misc
from skimage.io import imread
import os
import fnmatch
from time import time

In [18]:
def getMoments(image):
    """

    :param image: input image
    :return: numpy array which contains the mean,standard deviation,skewness and heuMoments of the image
    """

    #read the image in HSV color space
    imageHSV = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)

    #calculate the mean and standard deviation of the image
    (means, stds) = cv2.meanStdDev(imageHSV)

    #calculate the image moments
    moments = cv2.moments(imageHSV[:,:,0])
    skew0 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    moments = cv2.moments(imageHSV[:,:,1])
    skew1 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    moments = cv2.moments(imageHSV[:,:,2])
    skew2 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    skew = np.asarray([skew0,skew1,skew2]).reshape((3, 1))

    #concatenate the mean,standard deviation,skew features into a single numpy array
    stats = np.concatenate([means, stds,skew]).flatten()

    #calculate the HuMoments of the image
    imageGray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    Moments = cv2.HuMoments(cv2.moments(imageGray)).flatten()
    moments = np.asarray(Moments)

    #form the final feature vector which contains the mean,standard deviation, skewness and HueMoments of the image
    finalFeatureVector = np.append(stats,moments)

    #return the feature vector
    return finalFeatureVector

In [19]:
def getHSVFeatures(image,bins):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([image], [0, 1, 2], None, bins,
            [0, 180, 0, 256, 0, 256])
    features = cv2.normalize(hist,hist).flatten()
    return features

In [20]:
def getLBPFeatures(image,numPoints, radius):
    """

    :param image: input image (greyscale) as numpy array
    :param numPoints:number of points p in a circularly symmetric neighborhood of central pixel
    :param radius: radius of the circle
    :return: histogram of Local binary pattern of given image
    """
    lbp = feature.local_binary_pattern(image, numPoints,
            radius, method="uniform")
    (hist, _) = np.histogram(lbp.ravel(),
            bins=np.arange(0, numPoints + 3),
            range=(0, numPoints + 2))
    # normalize the histogram
    eps=1e-7
    hist = hist.astype("float")
    hist /= (hist.sum() + eps)
    return hist

In [22]:
def load_Features():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = getHSVFeatures(original_image,(8, 12, 3))   
        moments = getMoments(original_image)
        cFeatures = np.append(features,moments)
        original_image = imread(imageDir+filename,as_grey= True)
        hist = getLBPFeatures(original_image,24,8)
        inputData.append(np.append(cFeatures,hist))
        labels.append(label)
    return inputData,labels

Load the CSV file into panda dataframe. The column names description:

**Genre** : Genre for the given File

**imdbId**: Image file name (File names are the imdbID for the given movie)

**Feature1**: Label assigned to image file with given Genre

In [23]:
import pandas as pd
df = pd.read_csv("./../Dataset/BalancedTop3FullGenre.csv",delimiter=",").fillna("-NA-")
df.head()

Unnamed: 0,Genre,imdbId,Feature1,Feature2
0,Action|Comedy|Crime,71216,Action,0
1,Drama|Romance,245238,Drama,1
2,Drama|War,263366,Drama,1
3,Comedy|Family|Horror,61550,Comedy,2
4,Action|Adventure|Drama,422091,Action,0


Get the columns of the dataframe. This contains the columns of data contained in .csv file

In [24]:
columns = list(df.columns.values)
print(columns)

['Genre', 'imdbId', 'Feature1', 'Feature2']


Get the count of each Genre category in the dataframe

In [25]:
df.Genre.value_counts()

Drama                        1638
Drama|Romance                 826
Comedy                        731
Comedy|Drama|Romance          686
Comedy|Drama                  676
Action|Crime|Drama            546
Comedy|Romance                496
Action|Comedy|Crime           269
Action|Adventure|Drama        256
Action|Crime|Thriller         243
Action|Adventure|Comedy       239
Drama|Thriller                216
Comedy|Crime|Drama            182
Drama|War                     180
Action|Drama                  180
Action|Thriller               161
Action|Adventure|Sci-Fi       161
Action|Drama|Thriller         159
Comedy|Crime                  149
Action|Adventure|Fantasy      135
Action                        125
Action|Comedy                 123
Action|Horror|Sci-Fi          107
Drama|Mystery|Thriller        100
Action|Sci-Fi                  95
Comedy|Drama|Music             92
Drama|Romance|War              89
Comedy|Musical|Romance         88
Comedy|Drama|Fantasy           86
Action|Sci-Fi|

Filter the unique Genre present in data and their count.

In [26]:
uniqueGenre = df.Genre.unique()
print (uniqueGenre.shape)

(378,)


In [27]:
cFeatures,cLabels = load_Features()
cFeatures,cLabels = np.asarray(cFeatures),np.asarray(cLabels)
print (cFeatures.shape,cLabels.shape)

(13800, 330) (13800,)


Divide the dataset into training data and testing data using sklearn train_test_split. The following is the definition of:

**X_train**: This is a numpy array of feature vectors of the image for train the classifier.

**X_test**:This is numpy array of lables of the images for training the classifier.

**y_train**: This is a numpy array of feature vectors of the image for testing the classifier.

**y_test** This is numpy array of lables of the images for testing the classifier.

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cFeatures, cLabels, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = np.asarray(X_train), np.asarray(X_test), np.asarray(y_train), np.asarray(y_test)
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((10350, 2930), (3450, 2930), (10350,), (3450,))


In [15]:
from __future__ import print_function
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression,chi2,SelectPercentile,SelectFpr
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn import metrics
from time import time

Train RandomForestClassifier classifier and perform cross validation using GridSearchCV and feature selection using recursive cross validation. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    #('rfe', RFECV(estimator=SVC(kernel='linear'),scoring='accuracy',cv=StratifiedKFold(),step=1)),
    ('clf', RandomForestClassifier()),
])
parameters = {
    'clf__n_estimators': [10,20,50,100],
    'clf__max_features': ['auto', 'sqrt', 'log2'],
    'clf__random_state':[300,400,700]
}
print("Performing Grid Search to tune the hyper parameters of the model")
RandomForestModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing Pipeline Steps:", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
RandomForestModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time() - t0))
print()

print("Best score {}".format(RandomForestModel.best_score_))
print("Best parameters set:")
best_parameters = RandomForestModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = RandomForestModel.predict(X_test)
print("Time to test the classifier is {}".format(time() - t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Performing Grid Search to tune the hyper parameters of the model
Performing Pipeline Steps: ['clf']
parameters are :
{'clf__max_features': ['auto', 'sqrt', 'log2'],
 'clf__n_estimators': [10, 20, 50, 100],
 'clf__random_state': [300, 400, 700]}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  6.7min finished


Time to train the classifier is 428.899471045

Best score 0.53806763285
Best parameters set:
	clf__max_features: 'auto'
	clf__n_estimators: 100
	clf__random_state: 700
Time to test the classifier is 0.272365093231
Calculated Accuracy is 0.53652173913
Precision Score is 0.536992111399
Recall Score is 0.53652173913
F1 Score is 0.53551514228
confusion matrix:
[[578 253 302]
 [238 636 290]
 [320 309 524]]


Train AdaBoostClassifier. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [29]:
from time import time
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
print("Training the classifier: ")
print(clf)
t0 = time()
clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
t0 = time()
pred = clf.predict(X_test)
print("Time to test the model is {}".format(time() - t0))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training the classifier: 
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Time to train the model is 111.384582043
Time to test the model is 0.302144050598
confusion matrix:
[[578 253 302]
 [238 636 290]
 [320 309 524]]
Calculated Accuracy is 0.503768115942
Precision Score is 0.503129353024
Recall Score is 0.503768115942
F1 Score is 0.503369729456


Train AdaBoostClassifier  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
     ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', AdaBoostClassifier()),
])
parameters = {
    'clf__n_estimators': [50,70,90],
    'clf__random_state': [1,20,40]
}
AdaBoostModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
AdaBoostModel.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print("")

print("Best reported score is {}".format(AdaBoostModel.best_score_))
print("Best parameters set are: "))
best_parameters = AdaBoostModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("{}: {} ".format(param_name, best_parameters[param_name]))

predicted = AdaBoostModel.predict(X_test)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))

Train MultiLayer Perceptron by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [72]:
from sklearn.neural_network import MLPClassifier
print("Training the classifier...")
t0 = time()
param_grid = {'hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
              'activation' :['logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
                'alpha':[ 0.0001, 0.001, 0.01]}
clf = GridSearchCV(MLPClassifier(), param_grid)
print(clf)
X_train = SelectKBest(chi2, k=2).fit_transform(X_train, y_train)
X_test = SelectKBest(chi2, k=2).fit_transform(X_test, y_test)
clf.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training: 
MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(70, 20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
train time: 23.896s
test time:  0.020s
confusion matrix:
[[251 339   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [165 894   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 63 343   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  7 184   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  8  71   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  5   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 

Train MultiLayer Perceptron by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [41]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', MLPClassifier()),
])
parameters = {
    'clf__hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver':['lbfgs', 'sgd', 'adam'],
    'clf__alpha':[ 0.0001, 0.001, 0.001],
    'clf__learning_rate':['constant', 'invscaling', 'adaptive']
}
print("Performing Grid Search to tune the hyper parameters of the model")

MLPModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline :", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
MLPModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time()-t0))
print()

print("Best score:{}".format( MLPModel.best_score_))
print("Best parameters set:")
best_parameters = MLPModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = MLPModel.predict(X_test)
print("Time to test the classifier is {}".format(time()-t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Training: 
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', KNeighb...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)
train time: 2.429s
test time:  0.113s
confusion matrix:
[[262 325   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [182 867   2   8   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 83 311   2  10   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 14 166   1  10   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  7  70   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  4   6   0   0   0   0   0   0   0 

Predict Scores using VotingClassifier. The classifiers used are LogisticRegression(),KNeighborsClassifier() and  MLPClassifier().  Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
clf1 = LogisticRegression()
clf2 = KNeighborsClassifier()
clf3 = MLPClassifier()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3)], voting='hard')
print("Training the classifier...")
print(eclf1)
t0 = time()
eclf1 = eclf1.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = eclf1.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
print("The confusion matrix: {}".format(metrics.confusion_matrix(y_test, pred)))
print("Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Train linear SVM  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', SVC()),
])
parameters = {
    'clf__kernel':('linear','rbf','sigmoid','poly'),
    'clf__C': (0.001,0.0001,0.01)
}
SVC_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score:{}".format( SVC_clf.best_score)_)
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Train SVC by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.svm import SVC
parameters = {
    'C': (0.001,0.0001,0.01,1)
}
print("Performing Grid Search to tune the hyper parameters of the model")
SVC_clf = GridSearchCV(SVC(kernel="linear"), parameters, n_jobs=-1, verbose=1)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print(" ")
print("Best score: %0.3f" % SVC_clf.best_score_)
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))