In [1]:
from skimage import feature
import numpy as np
import cv2
import scipy.misc
from skimage.io import imread
import os
import fnmatch

Define the getLBPFeatures method to fetch the local Binary patterns of the image. Local Binary Patterns compute the local representation of the texture.

In [2]:
def getLBPFeatures(image,numPoints, radius):
    """

    :param image: input image (greyscale) as numpy array
    :param numPoints:number of points p in a circularly symmetric neighborhood of central pixel
    :param radius: radius of the circle
    :return: histogram of Local binary pattern of given image
    """
    lbp = feature.local_binary_pattern(image, numPoints,
            radius, method="uniform")
    (hist, _) = np.histogram(lbp.ravel(),
            bins=np.arange(0, numPoints + 3),
            range=(0, numPoints + 2))
    # normalize the histogram
    eps=1e-7
    hist = hist.astype("float")
    hist /= (hist.sum() + eps)
    return hist

Compute the mean,standard deviation,skewness and HUEMoments of the input image.It returns a feature vector as numpy array which embeds the mentioned features. The image is read into HSV color space. 

In [3]:
def getMoments(image):
    """

    :param image: input image
    :return: numpy array which contains the mean,standard deviation,skewness and heuMoments of the image
    """

    #read the image in HSV color space
    imageHSV = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)

    #calculate the mean and standard deviation of the image
    (means, stds) = cv2.meanStdDev(imageHSV)

    #calculate the image moments
    moments = cv2.moments(imageHSV[:,:,0])
    skew0 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    moments = cv2.moments(imageHSV[:,:,1])
    skew1 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    moments = cv2.moments(imageHSV[:,:,2])
    skew2 = moments['mu11'] / moments['mu02'] if moments['mu02'] !=0 else moments['mu11']
    skew = np.asarray([skew0,skew1,skew2]).reshape((3, 1))

    #concatenate the mean,standard deviation,skew features into a single numpy array
    stats = np.concatenate([means, stds,skew]).flatten()

    #calculate the HuMoments of the image
    imageGray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    Moments = cv2.HuMoments(cv2.moments(imageGray)).flatten()
    moments = np.asarray(Moments)

    #form the final feature vector which contains the mean,standard deviation, skewness and HueMoments of the image
    finalFeatureVector = np.append(stats,moments)

    #return the feature vector
    return finalFeatureVector

Load the CSV file into panda dataframe. The column names description:

**Genre** : Genre for the given File

**imdbId**: Image file name (File names are the imdbID for the given movie)

**Feature1**: Label assigned to image file with given Genre

In [16]:
import pandas as pd
df = pd.read_csv("./../Dataset/Top3FullGenre.csv",delimiter=",").fillna("-NA-")
df.head()

Unnamed: 0,Genre,imdbId,Feature1,Feature2
0,Action|Adventure|Drama,35153,Action,0
1,Drama,1630036,Drama,1
2,Comedy|Romance,1195478,Comedy,2
3,Comedy|Action|Crime,79966,Comedy,2
4,Drama|History,1029364,Drama,1


List the counts of each Genre category.

In [17]:
df.Genre.value_counts()

Drama                       4205
Comedy                      2408
Comedy|Drama                1474
Drama|Romance               1448
Comedy|Drama|Romance        1130
Comedy|Romance              1002
Action|Crime|Drama           558
Drama|Thriller               509
Drama|War                    321
Comedy|Crime|Drama           289
Action|Comedy|Crime          275
Action|Adventure|Drama       264
Action|Crime|Thriller        256
Comedy|Crime                 247
Action|Adventure|Comedy      245
Drama|Mystery|Thriller       199
Comedy|Horror                194
Action|Drama                 183
Comedy|Drama|Family          169
Action|Thriller              167
Drama|History                166
Action|Drama|Thriller        164
Action|Adventure|Sci-Fi      163
Comedy|Drama|Music           147
Drama|Horror|Thriller        141
Action|Adventure|Fantasy     137
Comedy|Musical|Romance       136
Drama|Horror|Mystery         135
Drama|Family                 134
Action                       131
          

Print the columns of the csv file

In [18]:
columns = list(df.columns.values)
print(columns)

['Genre', 'imdbId', 'Feature1', 'Feature2']


Fetch the unique feature and print it's count.

In [19]:
uniqueGenre = df.Feature1.unique()
print(uniqueGenre.shape)

(3,)


Itertate over all the movie posters, compute their Local Binary Patterns features and associated labels.
It return the feature vector containing the local binary patterns and lables indicating the Genre associated with the movie poster.

In [20]:
def load_LBP_features():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = imread(imageDir+filename,as_grey= True)
        hist = getLBPFeatures(original_image,40,8)
        inputData.append(hist)
        labels.append(label)
    return inputData,labels

Itertate over all the movie posters, compute their moments  and associated labels. It also computes their Local Binary patterns features and associated labels.
It return the feature vector containing image moments and local binary patterns and labels indicating the Genre associated with the movie poster.

In [21]:
def load_moment_features():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = imread(imageDir+filename)
        hist = getMoments(original_image)
        inputData.append(hist)
        labels.append(label)
    return inputData,labels

Itertate over all the movie posters, compute their moments,Local Binary pattersn  and associated labels.
It return the feature vector containing image moments, local binary pattern features and lables indicating the Genre associated with the movie poster.

In [22]:
def load_combined_features():
    imageDir = "./../Dataset/MovieGenreFullPosters/"  
    inputData = []
    labels = []
    for index, row in df.iterrows(): 
        filename = str(row[columns[1]])+".jpg"
        label = row[columns[2]]
        original_image = imread(imageDir+filename)
        Moments = getMoments(original_image)
        original_image = imread(imageDir+filename,as_grey=True)
        LBPFeatures = getLBPFeatures(original_image,40,8)
        inputData.append(np.append(LBPFeatures,Moments))
        labels.append(label)
    return inputData,labels

Get the LBP Features of all the movie posters in the dataset and associated label indicating the Genre. The lbpFeatures is a numpy array of N X NUMBER OF LBP FEATURES. N denotes the total number of movie posters. The labels are the numpy array of N X 1.

In [35]:
lbpFeatures,labels = load_LBP_features()
lbpFeatures,labels = np.asarray(lbpFeatures),np.asarray(labels)
print (lbpFeatures.shape,labels.shape)

Get the LBP Features and image moments(mean,standard deviation,skewness and HUEMoments) of all the movie posters in the dataset and associated label indicating the Genre. The lbpFeatures is a numpy array of N X Number of LBP Features +Image moments N denotes the total number of movie posters. The labels are the numpy array of N X 1.

In [None]:
cFeatures,labels = load_combined_features()
cFeatures,labels = np.asarray(cFeatures),np.asarray(labels)
print (cFeatures.shape,labels.shape)

Divide the dataset into training data and testing data using sklearn train_test_split. The following is the definition of:

**X_train**: This is a numpy array of feature vectors of the image for train the classifier.

**X_test**:This is numpy array of lables of the images for training the classifier.

**y_train**: This is a numpy array of feature vectors of the image for testing the classifier.

**y_test** This is numpy array of lables of the images for testing the classifier.

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cFeatures, labels, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = np.asarray(X_train), np.asarray(X_test), np.asarray(y_train), np.asarray(y_test)
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((7642, 58), (2548, 58), (7642,), (2548,))


Defined all the modules to be imported.

In [14]:
from __future__ import print_function
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression,chi2
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn import metrics
from time import time

Train RandomForestClassifier classifier and perform cross validation using GridSearchCV. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    ('rfe', RFECV(estimator=AdaBoostClassifier(),scoring='accuracy',cv=StratifiedKFold(3),step=1)),
    ('clf', RandomForestClassifier()),
])
parameters = {
    'clf__n_estimators': [10,20,50,100],
    'clf__max_features': ['auto', 'sqrt', 'log2'],
    'clf__random_state':[300,400,700]
}
print("Performing Grid Search to tune the hyper parameters of the model")
RandomForestModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing Pipeline Steps:", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
RandomForestModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time() - t0))
print()

print("Best score {}".format(RandomForestModel.best_score_))
print("Best parameters set:")
best_parameters = RandomForestModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = RandomForestModel.predict(X_test)
print("Time to test the classifier is {}".format(time() - t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, predicted))

Performing Grid Search to tune the hyper parameters of the model
Performing Pipeline Steps: ['clf']
parameters are :
{'clf__max_features': ['auto', 'sqrt', 'log2'],
 'clf__n_estimators': [10, 20, 50, 100],
 'clf__random_state': [300, 400, 700]}
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  2.0min finished


Time to train the classifier is 126.436920166

Best score 0.468071185554
Best parameters set:
	clf__max_features: 'auto'
	clf__n_estimators: 100
	clf__random_state: 400
Time to test the classifier is 0.262647151947
Calculated Accuracy is 0.472135007849
Precision Score is 0.489173307935
Recall Score is 0.472135007849
F1 Score is 0.402710564222
confusion matrix:
[[275 300  14   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [168 851  38   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 63 278  63   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  7 158  15  11   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  8  67   3   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  5   3   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 20  38   2   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  1   5   0   0   0   0   0   0   0   0 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Predict Scores using VotingClassifier. The classifiers used are LogisticRegression(),KNeighborsClassifier() and  MLPClassifier().  Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [23]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
clf1 = LogisticRegression()
clf2 = KNeighborsClassifier()
clf3 = MLPClassifier()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3)], voting='hard')
print("Training the classifier...")
print(eclf1)
t0 = time()
eclf1 = eclf1.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = eclf1.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
print("The confusion matrix: {}".format(metrics.confusion_matrix(y_test, pred)))
print("Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training the classifier...
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', KNeighb...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)
Time to train the classifier is 6.37550592422
Time to test the classifier is 1.25316596031
The confusion matrix: [[692 312 129]
 [346 709 109]
 [513 439 201]]
Accuracy is 0.464347826087
Precision Score is 0.463383258573
Recall Score is 0.464347826087
F1 Score is 0.436057343358


Train MultiLayer Perceptron by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [16]:
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
    #('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', MLPClassifier()),
])
parameters = {
    'clf__hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(90, 20, 10)],
    'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'clf__solver':['lbfgs', 'sgd', 'adam'],
    'clf__alpha':[ 0.0001, 0.001, 0.001]
}
print("Performing Grid Search to tune the hyper parameters of the model")

MLPModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline :", [name for name, _ in pipeline.steps])
print("parameters are :")
pprint(parameters)
t0 = time()
MLPModel.fit(X_train, y_train)
print("Time to train the classifier is {}".format(time()-t0))
print()

print("Best score:{}".format( MLPModel.best_score_))
print("Best parameters set:")
best_parameters = MLPModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
t0 = time()
predicted = MLPModel.predict(X_test)
print("Time to test the classifier is {}".format(time()-t0))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Performing Grid Search to tune the hyper parameters of the model
pipeline : ['clf']
parameters are :
{'clf__activation': ['identity', 'logistic', 'tanh', 'relu'],
 'clf__alpha': [0.0001, 0.001, 0.001],
 'clf__hidden_layer_sizes': [(70, 20, 10), (40, 20, 10), (90, 20, 10)],
 'clf__solver': ['lbfgs', 'sgd', 'adam']}
Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  8.5min finished


Time to train the classifier is 518.967584848

Best score:0.443961352657
Best parameters set:
	clf__activation: 'tanh'
	clf__alpha: 0.001
	clf__hidden_layer_sizes: (40, 20, 10)
	clf__solver: 'lbfgs'
Time to test the classifier is 0.0170249938965
Calculated Accuracy is 0.446376811594
Precision Score is 0.444648781614
Recall Score is 0.446376811594
F1 Score is 0.428113344498
confusion matrix:
[[694 288 151]
 [438 560 166]
 [486 419 248]]


Train MultiLayer Perceptron by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [42]:
from sklearn.neural_network import MLPClassifier
print("Training the classifier...")
t0 = time()
param_grid = {'hidden_layer_sizes': [(70, 20, 10),(40, 20, 10),(50, 20, 10)],
              'activation' :['logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
                'alpha':[ 0.0001, 0.001, 0.01]}
clf = GridSearchCV(MLPClassifier(), param_grid)
print(clf)
clf.fit(X_train, y_train)
train_time = time() - t0
print("Time to train the classifier is {}".format(train_time))
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("Time to test the classifier is {}".format(test_time))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))

Training the classifier...
GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0001, 0.001, 0.01], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'sgd', 'adam'], 'hidden_layer_sizes': [(70, 20, 10), (40, 20, 10), (50, 20, 10)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)




Time to train the classifier is 1447.73412108
Time to test the classifier is 0.0211679935455
confusion matrix:
[[236 354   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [184 873   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 69 335   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 12 176   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  4  74   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  4   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 20  41   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  1   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  3   6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2  18   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2  26   0   0   0   0   0   0   0  

Train AdaBoostClassifier  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [41]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import RFECV
pipeline = Pipeline([
     #('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', AdaBoostClassifier()),
])
parameters = {
    'clf__n_estimators': [50,70,90],
    'clf__random_state': [1,20,40]
}
AdaBoostModel = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
AdaBoostModel.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print("")

print("Best reported score is {}".format(AdaBoostModel.best_score_))
print("Best parameters set are: ")
best_parameters = AdaBoostModel.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("{}: {} ".format(param_name, best_parameters[param_name]))

predicted = AdaBoostModel.predict(X_test)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))

pipeline Parameters: ['clf']
Chosen Parameters: are
{'clf__n_estimators': [50, 70, 90], 'clf__random_state': [1, 20, 40]}
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   45.5s finished


Time to train the model is 49.6613788605

Best reported score is 0.380266945826
Best parameters set are: 
clf__n_estimators: 50 
clf__random_state: 1 
confusion matrix:
[[298 289   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3
    0   0]
 [289 758   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  12
    0   0]
 [ 89 309   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   8
    0   0]
 [ 18 172   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
    0   0]
 [  9  67   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3
    0   0]
 [  6   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 29  32   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2   7   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2  18   0   0   0   0   0   0   0   0   0   0   0   0   0   0 

Train AdaBoostClassifier. Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [40]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
print("Training the classifier: ")
print(clf)
t0 = time()
clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
t0 = time()
pred = clf.predict(X_test)
print("Time to test the model is {}".format(time() - t0))
score = metrics.accuracy_score(y_test, pred)
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, pred)))
print("Precision Score is {}".format(metrics.precision_score(y_test, pred, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, pred, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, pred, average='weighted')))


Training the classifier: 
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Time to train the model is 2.84198689461
Time to test the model is 0.0790550708771
confusion matrix:
[[298 289   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3
    0   0]
 [289 758   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  12
    0   0]
 [ 89 309   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   8
    0   0]
 [ 18 172   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
    0   0]
 [  9  67   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3
    0   0]
 [  6   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [ 29  32   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  2   7   0   0   0   0   0   0   0   0   0   0   0   0   0   0  

Train linear SVM  by performing feature selection and cross validation. The feature selection is perform using Recursive feature selection with cross validation and hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [1]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
pipeline = Pipeline([
    ('rfe', RFECV(estimator=SVC(kernel="linear"),scoring='accuracy',cv=StratifiedKFold(2),step=1)),
    ('clf', SVC()),
])
parameters = {
    'clf__kernel':('linear','rbf','sigmoid','poly'),
    'clf__C': (0.001,0.0001,0.01)
}
SVC_clf = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("pipeline Parameters:", [name for name, _ in pipeline.steps])
print("Chosen Parameters: are")
pprint(parameters)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score:{}".format( SVC_clf.best_score_))
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, pred))

Train SVC by performing cross validation to tune the hyper parameters of the classifier. The hyper parameter tuning is performed GridSearchCV. 
Compute confusion matrix, Precision score,accuracy,recall score and f1 score

In [1]:
from sklearn.svm import SVC
parameters = {
    'C': (0.001,0.0001,0.01,1)
}
print("Performing Grid Search to tune the hyper parameters of the model")
SVC_clf = GridSearchCV(SVC(kernel="linear"), parameters, n_jobs=-1, verbose=1)
t0 = time()
SVC_clf.fit(X_train, y_train)
print("Time to train the model is {}".format(time() - t0))
print(" ")
print("Best score: %0.3f" % SVC_clf.best_score_)
print("Best parameters set:")
best_parameters = SVC_clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
predicted = SVC_clf.predict(X_test)
print("Calculated Accuracy is {}".format(metrics.accuracy_score(y_test, predicted)))
print("Precision Score is {}".format(metrics.precision_score(y_test, predicted, average='weighted')))
print("Recall Score is {}".format(metrics.recall_score(y_test, predicted, average='weighted')))
print("F1 Score is {}".format(metrics.f1_score(y_test, predicted, average='weighted')))
print("confusion matrix:")
print(metrics.confusion_matrix(y_test, predicted))