In [2]:
import numpy as np
import cv2
from skimage.color import rgb2hsv, hsv2rgb
from skimage import feature
from matplotlib import pyplot as plt
import scipy.misc
from skimage.io import imread
import os
import fnmatch

In [3]:
def getMoments(image):
    """

    :param image: input image
    :return: numpy array which contains the mean,standard deviation,skewness and heuMoments of the image
    """

    #read the image in HSV color space
    imageHSV = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)

    #calculate the mean and standard deviation of the image
    (means, stds) = cv2.meanStdDev(imageHSV)

    #calculate the image moments
    moments = cv2.moments(imageHSV[:,:,0])
    skew0 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    moments = cv2.moments(imageHSV[:,:,1])
    skew1 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    moments = cv2.moments(imageHSV[:,:,2])
    skew2 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    skew = np.asarray([skew0,skew1,skew2]).reshape((3, 1))

    #concatenate the mean,standard deviation,skew features into a single numpy array
    stats = np.concatenate([means, stds,skew]).flatten()

    #calculate the HuMoments of the image
    imageGray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    Moments = cv2.HuMoments(cv2.moments(imageGray)).flatten()
    moments = np.asarray(Moments)

    #form the final feature vector which contains the mean,standard deviation, skewness and HueMoments of the image
    finalFeatureVector = np.append(stats,moments)

    #return the feature vector
    return finalFeatureVector

In [4]:
def getTopHSVFeatures(image,bins):
    image = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)
    chans = cv2.split(image)
    features = []
#     for ind,channel in enumerate(chans):
#         print ind
#         if ind == 0:
#             hist = cv2.calcHist([channel], [0], None, [180], [0, 180]) 
#         else:
#             hist = cv2.calcHist([channel], [0], None, [256], [0, 256]) 
#         histFeatures = cv2.normalize(hist,hist).flatten()
#         temp = np.partition(-histFeatures, 48)
#         result = -temp[:36]
#         features.extend(histFeatures)
    hist = cv2.calcHist([image], [0, 1, 2], None, bins,
            [0, 180, 0, 256, 0, 256])
    histFeatures = cv2.normalize(hist,hist).flatten()
    temp = np.partition(-histFeatures, 48)
    result = -temp[:48]
    features.extend(result)
    return features

In [7]:
def getHSVFeatures(image,bins):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([image], [0, 1, 2], None, bins,
            [0, 180, 0, 256, 0, 256])
    features = cv2.normalize(hist,hist).flatten()
    return features

In [8]:
def refinedHSVFeatures(image,bins):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    features = []
    
    #compute the center of the move image poster.
    (height, width) = image.shape[:2]
    (centerX, centerY) = (int(width * 0.5), int(height * 0.5))
    
    #entire image is segmented into four rectangeles i.e top-left,top-right, bottom-left, bottom-right
    Imagesegments = [(0, centerX, 0, centerY), (centerX, width, 0, centerY), (centerX, width, centerY, height),
            (0, centerX, centerY, height)]
    
    #an elliptal mask covering the center of the image is created, this will be used to compute the histogram of
    #5 region i.e the mask will be substracted from 5 rectangular regions created above

    (axesX, axesY) = (int(width * 0.75) / 2, int(height * 0.75) / 2)
    ellipMask = np.zeros(image.shape[:2], dtype = "uint8")
    cv2.ellipse(ellipMask, (centerX, centerY), (axesX, axesY), 0, 0, 360, 255, -1)
    
    # loop over the created rectange image segements 
    for (startX, endX, startY, endY) in Imagesegments:
        # a mask is created for each corner of the image by sustracting the elliptical mask created above
        cornerMask = np.zeros(image.shape[:2], dtype = "uint8")
        cv2.rectangle(cornerMask, (startX, startY), (endX, endY), 255, -1)
        cornerMask = cv2.subtract(cornerMask, ellipMask)

        # color histogram from each sub region of the image is computed and then feature vector is updated accordingly.
        hist = cv2.calcHist([image], [0, 1, 2], cornerMask, bins,
            [0, 180, 0, 256, 0, 256])
        hist = cv2.normalize(hist,hist).flatten()
        features.extend(hist)

    #at last the color histogram from the elliptical region of the image is computer and 
    # feature vector is updated accordingle.
    hist = cv2.calcHist([image], [0, 1, 2], ellipMask, bins,
            [0, 180, 0, 256, 0, 256])
    hist = cv2.normalize(hist,hist).flatten()
    features.extend(hist)
    
    # return the feature vector
    return features

Load the CSV file into panda dataframe. The column names description:

**Genre** : Genre for the given File

**imdbId**: Image file name (File names are the imdbID for the given movie)

**Feature1**: Label assigned to image file with given Genre

In [64]:
import pandas as pd
df = pd.read_csv("./../Dataset/Top3FullGenre.csv",delimiter=",").fillna("-NA-")
df.head()

Unnamed: 0,Genre,imdbId,Feature1,Feature2
0,Action|Adventure|Drama,35153,Action,0
1,Drama,1630036,Drama,1
2,Comedy|Romance,1195478,Comedy,2
3,Comedy|Action|Crime,79966,Comedy,2
4,Drama|History,1029364,Drama,1


Get the columns of the dataframe. This contains the columns of data contained in .csv file

In [65]:
columns = list(df.columns.values)
print(columns)

['Genre', 'imdbId', 'Feature1', 'Feature2']


Get the count of each Genre category in the dataframe

In [66]:
df.Genre.value_counts()

Drama                       4205
Comedy                      2408
Comedy|Drama                1474
Drama|Romance               1448
Comedy|Drama|Romance        1130
Comedy|Romance              1002
Action|Crime|Drama           558
Drama|Thriller               509
Drama|War                    321
Comedy|Crime|Drama           289
Action|Comedy|Crime          275
Action|Adventure|Drama       264
Action|Crime|Thriller        256
Comedy|Crime                 247
Action|Adventure|Comedy      245
Drama|Mystery|Thriller       199
Comedy|Horror                194
Action|Drama                 183
Comedy|Drama|Family          169
Action|Thriller              167
Drama|History                166
Action|Drama|Thriller        164
Action|Adventure|Sci-Fi      163
Comedy|Drama|Music           147
Drama|Horror|Thriller        141
Action|Adventure|Fantasy     137
Comedy|Musical|Romance       136
Drama|Horror|Mystery         135
Drama|Family                 134
Action                       131
          

Filter the unique Genre present in data and their count.

In [67]:
uniqueGenre = df.Genre.unique()
print (uniqueGenre.shape)

(454,)


In [68]:
def loadHSVFeatures():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = getHSVFeatures(original_image,(8, 24, 3))    
        inputData.append(features)
        labels.append(label)
    return inputData,labels

In [69]:
def loadTopHSVFeatures():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = getTopHSVFeatures(original_image,(8, 24, 3))    
        inputData.append(features)
        labels.append(label)
    return inputData,labels

In [70]:
def loadMomentFeatures():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = getMoments(original_image)
        inputData.append(features)
        labels.append(label)
    return inputData,labels

In [71]:
def loadRefinedHSVFeatures():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = refinedHSVFeatures(original_image,(8, 12, 3))    
        inputData.append(features)
        labels.append(label)
    return inputData,labels

In [72]:
def loadTopHSVMomentFeatures():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = getTopHSVFeatures(original_image,(8, 24, 3))
        moments = getMoments(original_image)
        inputData.append(np.append(features,moments))
        labels.append(label)
    return inputData,labels

In [59]:
def LoadHSVMomentFeatures():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = cv2.imread(imageDir+filename)
        features = getHSVFeatures(original_image,(8, 24, 3))
        moments = getMoments(original_image)
        inputData.append(np.append(features,moments))
        labels.append(label)
    return inputData,labels

In [26]:
HSVFeatures,HSVLabels = loadHSVFeatures()
HSVFeatures,HSVLabels = np.asarray(HSVFeatures),np.asarray(HSVLabels)
print (HSVFeatures.shape,HSVLabels.shape)

((13800, 576), (13800,))


In [42]:
RefinedHSVFeatures,Labels = loadRefinedHSVFeatures()
RefinedHSVFeatures,Labels = np.asarray(RefinedHSVFeatures),np.asarray(Labels)
print (RefinedHSVFeatures.shape,Labels.shape)

((13800, 1440), (13800,))


In [27]:
MomentFeatues,MomentLabels = loadMomentFeatures()
MomentFeatues,MomentLabels = np.asarray(MomentFeatues),np.asarray(MomentLabels)
print (MomentFeatues.shape,MomentLabels.shape)

((8172, 16), (8172,))


In [73]:
cFeatures,cLabels = LoadHSVMomentFeatures()
cFeatures,cLabels = np.asarray(cFeatures),np.asarray(cLabels)
print (cFeatures.shape,cLabels.shape)

(24161, 592) (24161,)


In [45]:
combinedFeatures,combinedLabels = loadTopHSVMomentFeatures()
combinedFeatures,combinedLabels = np.asarray(combinedFeatures),np.asarray(combinedLabels)
print (combinedFeatures.shape,combinedLabels.shape)

In [46]:
TopHSVFeatures,Labels = loadTopHSVFeatures()
TopHSVFeatures,Labels = np.asarray(TopHSVFeatures),np.asarray(Labels)
print (TopHSVFeatures.shape,Labels.shape)

Divide the dataset into training data and testing data using sklearn train_test_split. The following is the definition of:

**X_train**: This is a numpy array of feature vectors of the image for train the classifier.

**X_test**:This is numpy array of lables of the images for training the classifier.

**y_train**: This is a numpy array of feature vectors of the image for testing the classifier.

**y_test** This is numpy array of lables of the images for testing the classifier.

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cFeatures, cLabels, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = np.asarray(X_train), np.asarray(X_test), np.asarray(y_train), np.asarray(y_test)
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(18120, 592) (6041, 592) (18120,) (6041,)


In [75]:
from __future__ import print_function
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression,chi2,SelectPercentile,SelectFpr
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn import metrics
from time import time

Train RandomForestClassifier classifier and perform cross validation using GridSearchCV and feature selection using recursive cross validation. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel='linear'),scoring='accuracy',cv=StratifiedKFold(),step=1)),
    ('clf', RandomForestClassifier()),
])
parameters = {
    'clf__n_estimators': [10,20,50,100],
    'clf__max_features': ['auto', 'sqrt', 'log2'],
    'clf__random_state':[300,400,700]
}
print("Performing Grid Search to tune the hyper parameters of the model")
RandomForestModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing Pipeline Steps:", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
RandomForestModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time() - t0))
print()

print("Best score {}".format(RandomForestModel.best_score_))
print("Best parameters set:")
best_parameters = RandomForestModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = RandomForestModel.predict(X_test)
print("Time to test the classifier is {}".format(time() - t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, predicted))

Performing Grid Search to tune the hyper parameters of the model
Performing Pipeline Steps: ['clf']
parameters are :
{'clf__max_features': ['auto', 'sqrt', 'log2'],
 'clf__n_estimators': [10, 20, 50, 100],
 'clf__random_state': [300, 400, 700]}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 17.5min finished


Time to train the classifier is 1103.87059689

Best score 0.551876379691
Best parameters set:
	clf__max_features: 'log2'
	clf__n_estimators: 100
	clf__random_state: 700
Time to test the classifier is 1.82891011238
Calculated Accuracy is 0.555206091707
Precision Score is 0.555168510035
Recall Score is 0.555206091707
F1 Score is 0.533445641052
confusion matrix:
[[ 219  410  579]
 [  80 1571  754]
 [  97  767 1564]]


Train AdaBoostClassifier. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [37]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
print("Training the classifier: ")
print(clf)
t0 = time()
clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
t0 = time()
pred = clf.predict(X_test)
print("Time to test the model is {}".format(time() - t0))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training the classifier: 
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Time to train the model is 10.706553936
Time to test the model is 0.0874669551849
confusion matrix:
[[205  99  77]
 [134 152  97]
 [ 81  60 220]]
Calculated Accuracy is 0.512888888889
Precision Score is 0.510868978121
Recall Score is 0.512888888889
F1 Score is 0.509486475557


Train AdaBoostClassifier  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
     ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', AdaBoostClassifier()),
])
parameters = {
    'clf__n_estimators': [50,70,90],
    'clf__random_state': [1,20,40]
}
AdaBoostModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
AdaBoostModel.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print("")

print("Best reported score is {}".format(AdaBoostModel.best_score_))
print("Best parameters set are: "))
best_parameters = AdaBoostModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("{}: {} ".format(param_name, best_parameters[param_name]))

predicted = AdaBoostModel.predict(X_test)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))

Train MultiLayer Perceptron by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [77]:
from sklearn.neural_network import MLPClassifier
print("Training the classifier...")
t0 = time()
param_grid = {'hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
              'activation' :['logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
                'alpha':[ 0.0001, 0.001, 0.01]}
clf = GridSearchCV(MLPClassifier(), param_grid)
print(clf)
X_train = SelectKBest(chi2, k=2).fit_transform(X_train, y_train)
X_test = SelectKBest(chi2, k=2).fit_transform(X_test, y_test)
clf.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Train MultiLayer Perceptron by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
    #('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', MLPClassifier()),
])
parameters = {
    'clf__hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver':['lbfgs', 'sgd', 'adam'],
    'clf__alpha':[ 0.0001, 0.001, 0.001],
    'clf__learning_rate':['constant', 'invscaling', 'adaptive']
}
print("Performing Grid Search to tune the hyper parameters of the model")

MLPModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline :", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
MLPModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time()-t0))
print()

print("Best score:{}".format( MLPModel.best_score_))
print("Best parameters set:")
best_parameters = MLPModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = MLPModel.predict(X_test)
print("Time to test the classifier is {}".format(time()-t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Performing Grid Search to tune the hyper parameters of the model
pipeline : ['clf']
parameters are :
{'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
 'clf__alpha': [0.0001, 0.001, 0.001],
 'clf__hidden_layer_sizes': [(70, 20, 10), (40, 20, 10), (90, 20, 10)],
 'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
 'clf__solver': ['lbfgs', 'sgd', 'adam']}
Fitting 3 folds for each of 324 candidates, totalling 972 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 62.8min
[Parallel(n_jobs=-1)]: Done 972 out of 972 | elapsed: 75.2min finished


Time to train the classifier is 4526.29595399

Best score:0.485333333333
Best parameters set:
	clf__activation: 'logistic'
	clf__alpha: 0.0001
	clf__hidden_layer_sizes: (90, 20, 10)
	clf__learning_rate: 'constant'
	clf__solver: 'adam'
Time to test the classifier is 0.0880341529846
Calculated Accuracy is 0.497777777778
Precision Score is 0.490777728344
Recall Score is 0.497777777778
F1 Score is 0.473092479685
confusion matrix:
[[205  99  77]
 [134 152  97]
 [ 81  60 220]]


Predict Scores using VotingClassifier. The classifiers used are LogisticRegression(),KNeighborsClassifier() and  MLPClassifier().  Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
clf1 = LogisticRegression()
clf2 = KNeighborsClassifier()
clf3 = MLPClassifier()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3)], voting='hard')
print("Training the classifier...")
print(eclf1)
t0 = time()
eclf1 = eclf1.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = eclf1.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
print("The confusion matrix: {}".format(metrics.confusion_matrix(y_test, pred)))
print("Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Train linear SVM  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    #('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', SVC()),
])
parameters = {
    'clf__kernel':('linear','rbf','sigmoid','poly'),
    'clf__C': (0.001,0.0001,0.01,1)
}
SVC_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score:{}".format( SVC_clf.best_score_))
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

pipeline Parameters: ['clf']
Chosen Parameters: are
{'clf__C': (0.001, 0.0001, 0.01, 1),
 'clf__kernel': ('linear', 'rbf', 'sigmoid', 'poly')}
Fitting 3 folds for each of 16 candidates, totalling 48 fits


Train SVC by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [None]:
from sklearn.svm import SVC
parameters = {
    'C': (0.001,0.0001,0.01,1)
}
print("Performing Grid Search to tune the hyper parameters of the model")
SVC_clf = GridSearchCV(SVC(kernel="linear"), parameters, n_jobs=-1, verbose=1)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print(" ")
print("Best score: %0.3f" % SVC_clf.best_score_)
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, predicted))

Performing Grid Search to tune the hyper parameters of the model
Fitting 3 folds for each of 4 candidates, totalling 12 fits
