In [1]:
%matplotlib inline
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from time import time   
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import Binarizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_svmlight_files 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

from __future__ import print_function

  from numpy.core.umath_tests import inner1d


## Load data and preprocessing

In [2]:
# Read the IMDB bag-of-words features (svmlight format) for the train and test
# splits in a single call, so both matrices come back from the same loader run.
(training_data, raw_training_target,
 testing_data, raw_testing_target) = load_svmlight_files(
    files=[
        "../data/imdb/train/labeledBow.feat",
        "../data/imdb/test/labeledBow.feat",
    ],
    n_features=None,
    dtype=None,
)

In [3]:
# TF-IDF re-weighting of the raw term counts: sub-linear (log-scaled) term
# frequency, with every document vector normalised to unit L2 length.
tfidf = TfidfTransformer(
    norm='l2',
    sublinear_tf=True,
)

In [4]:
# Learn the IDF weights from the training split only, then apply the same
# fitted transformer to both splits.
# NOTE: this cell overwrites its own inputs, so re-running it without re-running
# the loading cell applies the weighting a second time.
tfidf.fit(training_data)
training_data = tfidf.transform(training_data)
testing_data = tfidf.transform(testing_data)
print(training_data.shape, testing_data.shape)

(25000, 89527) (25000, 89527)


In [None]:
# Reduce the TF-IDF matrices to 1500 NMF components.
# The factorisation is fitted on the training split ONLY and the test split is
# then projected onto those same components with `transform`. Fitting a second,
# independent NMF on the test data would yield components unrelated to the
# training ones, so column k of the test matrix would not mean the same thing as
# column k of the training matrix and the classifier scores would be meaningless.
# It would also repeat an expensive fit on the test data.
factorize_matrix = NMF(n_components=1500)
training_data = factorize_matrix.fit_transform(training_data)
testing_data = factorize_matrix.transform(testing_data)

In [None]:
# Binarize the labels: a raw target above 5 becomes True, anything else False.
training_target = np.greater(raw_training_target, 5)
testing_target = np.greater(raw_testing_target, 5)
print(training_target.shape, testing_target.shape)

In [None]:
# Class balance check: distinct training labels and how often each occurs.
labels, counts = np.unique(training_target, return_counts=True)
print(labels, counts, sep="\n")

## Classification

In [None]:
class Classifiers(object):
    """Train, score and export the classifiers compared in this notebook.

    Every train/test method fits a fresh scikit-learn estimator on the training
    split, scores it on the testing split and returns a list whose first item
    is the fitted estimator, whose second item is the accuracy in percent
    (rounded to 2 decimals) and whose last item is the elapsed wall-clock time
    in seconds, formatted as a string.
    """

    def lrc(self, training_data, training_target, testing_data, testing_target):
        """Train and test a Logistic Regression classifier.

        Returns:
            list: ``[estimator, accuracy_percent, elapsed_seconds_str]``; the
            elapsed time covers both fitting and scoring.
        """
        start = time()
        logreg = LogisticRegression()
        logreg.fit(training_data, training_target)
        logreg_accuracy = logreg.score(testing_data, testing_target) * 100
        end = time()
        return [logreg, round(logreg_accuracy, 2), str(round((end - start), 2))]

    def lSVC(self, training_data, training_target, testing_data, testing_target, parameter=False):
        """Train and test a Linear SVM classifier, optionally tuning ``C``.

        Args:
            parameter: when true, ``C`` is selected by cross-validation through
                :meth:`lSVC_para`; otherwise ``LinearSVC`` defaults are used.

        Returns:
            list: ``[estimator, accuracy_percent, elapsed_seconds_str]`` when
            ``parameter`` is false, or
            ``[estimator, accuracy_percent, best_C, elapsed_seconds_str]`` when
            it is true (note the extra ``best_C`` item).
        """
        start = time()
        if parameter:
            tuned_clf, tuned_accuracy, best_c = self.lSVC_para(
                training_data, training_target, testing_data, testing_target)
            end = time()
            return [tuned_clf, round(tuned_accuracy, 2), best_c, str(round((end - start), 2))]

        clf_linear = LinearSVC()
        clf_linear.fit(training_data, training_target)
        result_lSVC = clf_linear.score(testing_data, testing_target) * 100
        end = time()
        return [clf_linear, round(result_lSVC, 2), str(round((end - start), 2))]

    def lSVC_para(self, training_data, training_target, testing_data, testing_target):
        """Select ``C`` for ``LinearSVC`` by 3-fold cross-validation, then refit.

        Candidate values are the powers of two from 2**-2 to 2**9. The
        candidate with the highest mean cross-validation accuracy wins (ties go
        to the larger ``C``), and a final model is trained on the whole
        training set with that value.

        Returns:
            list: ``[estimator, accuracy_percent (unrounded), best_C]``.
        """
        clist = 2 ** np.array(range(-2, 10), dtype='float')
        cvscores = []
        for c in clist:
            clf = LinearSVC(C=c)
            scores = cross_val_score(clf, training_data, training_target, cv=3)
            cvscores.append(scores.mean() * 100)

        # Pick the winner once, after every candidate has been scored, instead
        # of recomputing the maximum on each loop iteration.
        _, bestC = max(zip(cvscores, clist))

        # Retrain on the whole training set using the best C from cross-validation.
        clf = LinearSVC(C=bestC)
        clf.fit(training_data, training_target)
        accu = clf.score(testing_data, testing_target) * 100
        return [clf, accu, bestC]

    def random_forest(self, training_data, training_target, testing_data, testing_target):
        """Train and test a Random Forest classifier.

        Returns:
            list: ``[estimator, accuracy_percent, elapsed_seconds_str]``.
        """
        start = time()
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # removed in scikit-learn 1.3, so spell the setting out explicitly.
        clf_forest = RandomForestClassifier(
            n_estimators=100, min_samples_leaf=5, max_features='sqrt', max_depth=16)
        clf_forest.fit(training_data, training_target)
        clf_forest_accuracy = clf_forest.score(testing_data, testing_target) * 100
        end = time()
        return [clf_forest, round(clf_forest_accuracy, 2), str(round((end - start), 2))]

    def kernel_SVM(self, training_data, training_target, testing_data, testing_target):
        """Train and test a kernel SVM (``SVC`` with default settings).

        Returns:
            list: ``[estimator, accuracy_percent, elapsed_seconds_str]``; the
            elapsed time covers both fitting and scoring.
        """
        start = time()
        clf_kernel = SVC()
        clf_kernel.fit(training_data, training_target)
        clf_kernel_accuracy = clf_kernel.score(testing_data, testing_target) * 100
        end = time()
        return [clf_kernel, round(clf_kernel_accuracy, 2), str(round((end - start), 2))]

    def prediction(self, obj_clf, fileName, labels, data=None):
        """Predict with a fitted classifier and save the result as a CSV file.

        Args:
            obj_clf: fitted estimator exposing ``predict``.
            fileName: output path without the ``.csv`` extension.
            labels: column names written as the CSV header row.
            data: samples to predict on. When omitted, the notebook-level
                ``testing_data`` variable is used, which keeps existing
                three-argument calls working.

        Each output row is ``"<row index>, negative"`` when the prediction
        equals 0 (or False) and ``"<row index>, positive"`` otherwise.
        """
        if data is None:
            # Backward-compatible fallback to the global the original code read.
            data = testing_data
        pre = obj_clf.predict(data)

        prediction_result = [
            str(i) + (", negative" if predicted == 0 else ", positive")
            for i, predicted in enumerate(pre)
        ]
        self.save_csv(prediction_result, fileName, labels)

    def save_csv(self, prediction_result, fileName, labels):
        """Write the header ``labels`` and one prediction row per line to ``fileName + '.csv'``."""
        print("Creating CSV file")
        # The context manager closes the file even if a write fails.
        with open(fileName + ".csv", 'w') as output_file:
            output_file.write(','.join(labels) + "\n")
            for r in prediction_result:
                output_file.write(r + "\n")
        print("File saved!")

In [None]:
sa = Classifiers()

# Fit and score each baseline in turn. Every call returns a list whose item 1
# is the accuracy in percent and whose item 2 is the elapsed time in seconds.
# sa.kernel_SVM is deliberately left out of this run: it is too slow here.
for accuracy_format, run_model in (
    ("Logistic Regression, Accuracy = %f", sa.lrc),
    ("Linear SVM, Accuracy = %f", sa.lSVC),
    ("Random Forest, Accuracy = %f", sa.random_forest),
):
    result = run_model(training_data, training_target, testing_data, testing_target)
    print(accuracy_format % result[1], " Time =", result[2], "seconds\n")


In [None]:
# Linear SVM again, this time with C chosen by cross-validation.
# With the flag set, the returned list is [model, accuracy, best C, seconds].
print("Linear SVM Classifier With Parameter Selection")
result = sa.lSVC(
    training_data,
    training_target,
    testing_data,
    testing_target,
    parameter=True,
)
obj_lSVC_para = result[0]
print("Accuracy = %f" % result[1], " at Best C =", result[2], "Time =", result[3], "seconds")

