In [4]:
#Import Necessary Modules
import os
import sys
import re
import time
import shutil
import collections
import pandas as pd
from sklearn import tree
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import scipy
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import svm
from sklearn.model_selection import KFold

In [5]:
# Test set
# Every case is a 4-tuple:
#   (label, [words before the label], [words after the label], is_positive_example)
test1 = (
    'HAMLET',
    ["Mann's", 'Laurie', '1991'],
    ['is', 'a', 'superbly-produced'],
    True,
)
test2 = (
    'HAM',
    ["Am'd", 'film', '1998'],
    ['isss', 'aaa', '2001'],
    True,
)
test3 = (
    'Ham',
    ["Am's", 'Laur', ' (1998) '],
    ['isss', "aaa'd", 'super'],
    False,
)
test4 = (
    'HAM 2',
    ["'", 'Laur', '198'],
    ['isss', 'aaa', 'super'],
    True,
)
test5 = (
    'HAM ',
    ['Am', 'review', 'test'],
    ['isss', '*', 'super'],
    True,
)
test6 = (
    'HAM ',
    ['Am', 'Laur', 'test'],
    ['isss', 'II', 'super'],
    True,
)
test7 = (
    'HAM ',
    [')', 'Laur', 'test'],
    ['isss', 'The', 'super'],
    True,
)
test8 = (
    'BATMANT',
    ["nice", '(', '1991'],
    ['is', 'a', 'superbly'],
    True,
)
test9 = (
    'Batman',
    ["Am'd", 'review', '1998'],
    ['isss', 'aaa', '2001'],
    True,
)
test10 = (
    'Batman',
    ["Am's", '"', ' (1998) '],
    ['isss', "aaa'd", 'super'],
    False,
)

# All cases, in order, as consumed by the feature generation code.
cases = [
    test1, test2, test3, test4, test5,
    test6, test7, test8, test9, test10,
]

# Feature Generation

In [26]:
def isitcapitalized(testcase):
    """Tell whether the label (first element of the test case) is upper case.

    The answer is whatever ``str.isupper`` reports for ``testcase[0]``.
    Returns True or False.
    """
    label = testcase[0]
    return label.isupper()


def isthereayear(testcase):
    """Is a year (four consecutive digits, YYYY) present in one of the surrounding words?

    The words before the label (``testcase[1]``) and after it (``testcase[2]``)
    are each searched individually for a run of four digits.

    Returns True if any single word contains such a run, otherwise False
    (including when there are no surrounding words at all).
    """
    surrounding = testcase[1] + testcase[2]
    year_pattern = re.compile(r"\d\d\d\d")
    # Search word by word. Gluing the words together first would let digits
    # from neighbouring words (e.g. '19' followed by '91') merge into a
    # four-digit run that never appeared in the text.
    return any(year_pattern.search(word) for word in surrounding)
    
def hasCueWords(testcase, cuewords, cueword=None):
    """Check the words around the label for cue words.

    The words before (``testcase[1]``) and after (``testcase[2]``) the label
    are examined. When a (non-empty) ``cueword`` is supplied, only an exact
    match with that single word counts; otherwise any word contained in
    ``cuewords`` counts.

    Returns True on a match, False otherwise.
    """
    surrounding = testcase[1] + testcase[2]
    if cueword:
        # Single-word mode: ``cuewords`` is not consulted.
        return any(token == cueword for token in surrounding)
    return any(token in cuewords for token in surrounding)

def hasCueSymbols(testcase, cuesymbols, cuesymbol=None):
    """Are any of the cue symbols we determined found in the surrounding text?

    The words before (``testcase[1]``) and after (``testcase[2]``) the label
    are examined. When a (non-empty) ``cuesymbol`` is supplied, only an exact
    match with that single symbol counts; otherwise any token contained in
    ``cuesymbols`` counts.

    Returns True on a match, False otherwise.
    """
    surrounding = testcase[1] + testcase[2]
    if cuesymbol:
        # Single-symbol mode: ``cuesymbols`` is not consulted.
        return any(token == cuesymbol for token in surrounding)
    # The check must be made against the surrounding tokens; merely having a
    # non-empty symbol collection says nothing about this test case.
    return any(token in cuesymbols for token in surrounding)

def hasAppostrophe(testcase):
    """Does "'s" (sign of film ownership) appear in the surrounding text?

    Every word before (``testcase[1]``) and after (``testcase[2]``) the label
    is checked for the substring "'s", matching the notion of "surrounding
    text" used by the other feature functions.

    Returns True if any surrounding word contains "'s", otherwise False
    (including when there are no surrounding words at all).
    """
    surrounding = testcase[1] + testcase[2]
    return any("'s" in word for word in surrounding)

def isPositive(testcase):
    """Return the positive/negative flag stored as the fourth element of the test case."""
    positive_flag = testcase[3]
    return positive_flag

def featureGen(cases):
    """Run every feature function over ``cases`` and collect the results in one table.

    The resulting pandas DataFrame has one row per case and holds:
      * one boolean column per individual cue word,
      * one boolean column per individual cue symbol,
      * the summary columns 'hasYear', 'isCap', 'hasCueWords',
        'hasApostrophe' and 'isPositive'.
    It can be fed into the ML classifiers.
    """
    # Key words and key symbols (dict keys keep a stable column order and
    # give the membership test used by hasCueWords).
    cuewords = dict.fromkeys(
        ['in', 'made', 'review', 'film', 'recommend', "showing", 'released', "The"],
        True,
    )
    cuesymbols = dict.fromkeys(
        ['*', '(', ')', '"', "'", ':', 'II', 'III'],
        True,
    )

    # One column per individual cue word / cue symbol.
    word_columns = {
        word: [hasCueWords(case, cuewords, word) for case in cases]
        for word in cuewords
    }
    symbol_columns = {
        symbol: [hasCueSymbols(case, cuesymbols, symbol) for case in cases]
        for symbol in cuesymbols
    }

    # Summary features, one value per case.
    summary_columns = {
        'hasYear': [isthereayear(case) for case in cases],
        'isCap': [isitcapitalized(case) for case in cases],
        'hasCueWords': [hasCueWords(case, cuewords) for case in cases],
        'hasApostrophe': [hasAppostrophe(case) for case in cases],
        'isPositive': [isPositive(case) for case in cases],
    }

    # Put it all together into the master dataframe (side by side).
    parts = [
        pd.DataFrame(word_columns),
        pd.DataFrame(symbol_columns),
        pd.DataFrame(summary_columns),
    ]
    return pd.concat(parts, axis=1)
    
    

In [28]:
# Build the feature table for the hand-made test cases and display it.
df = featureGen(cases)
df

Unnamed: 0,The,film,in,made,recommend,released,review,showing,"""",',...,),*,:,II,III,hasApostrophe,hasCueWords,hasYear,isCap,isPositive
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,True,True
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
3,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,True
4,False,False,False,False,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,True,True
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,True
6,True,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,True,True
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,True
8,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,True,True,False,True
9,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False


# ML Models

In [29]:
def returnPrecision(y_truth, y_pred, classifier):
    """Compute four precision figures for one set of predictions.

    ``precision_score`` is evaluated on the true labels ``y_truth`` and the
    predicted labels ``y_pred`` with average='macro', 'micro' and 'weighted',
    plus the binary precision with True as the positive label.

    Returns a one-row DataFrame indexed by ``classifier`` with the columns
    'p_macro', 'p_micro', 'p_weight' and 'p'.
    """
    averaged = (("p_macro", 'macro'), ("p_micro", 'micro'), ("p_weight", 'weighted'))
    scores = {
        column: precision_score(y_truth, y_pred, average=mode)
        for column, mode in averaged
    }
    scores["p"] = precision_score(y_truth, y_pred, pos_label=True, average='binary')
    return pd.DataFrame(scores, index=[classifier])

def returnRecall(y_truth, y_pred, classifier):
    """Compute four recall figures for one set of predictions.

    ``recall_score`` is evaluated on the true labels ``y_truth`` and the
    predicted labels ``y_pred`` with average='macro', 'micro' and 'weighted',
    plus the binary recall with True as the positive label.

    Returns a one-row DataFrame indexed by ``classifier`` with the columns
    'r_macro', 'r_micro', 'r_weight' and 'r'.
    """
    averaged = (("r_macro", 'macro'), ("r_micro", 'micro'), ("r_weight", 'weighted'))
    scores = {
        column: recall_score(y_truth, y_pred, average=mode)
        for column, mode in averaged
    }
    scores["r"] = recall_score(y_truth, y_pred, pos_label=True, average="binary")
    return pd.DataFrame(scores, index=[classifier])

def returnF1(y_truth, y_pred, classifier):
    """Compute four F1 figures for one set of predictions.

    ``f1_score`` is evaluated on the true labels ``y_truth`` and the
    predicted labels ``y_pred`` with average='macro', 'micro' and 'weighted',
    plus the binary F1 with True as the positive label.

    Returns a one-row DataFrame indexed by ``classifier`` with the columns
    'f1_macro', 'f1_micro', 'f1_weight' and 'f1'.
    """
    averaged = (("f1_macro", 'macro'), ("f1_micro", 'micro'), ("f1_weight", 'weighted'))
    scores = {
        column: f1_score(y_truth, y_pred, average=mode)
        for column, mode in averaged
    }
    scores["f1"] = f1_score(y_truth, y_pred, pos_label=True, average='binary')
    return pd.DataFrame(scores, index=[classifier])

def returnAccuracy(y_truth, y_pred, classifier):
    """Compute the accuracy of ``y_pred`` against ``y_truth`` via ``accuracy_score``.

    Returns a one-row DataFrame indexed by ``classifier`` with the single
    column 'accu'.
    """
    return pd.DataFrame(
        {"accu": accuracy_score(y_truth, y_pred)},
        index=[classifier],
    )

def returnMetrics(y_truth, y_pred, classifier):
    """Gather recall, precision, F1 and accuracy for one classifier.

    The one-row frames produced by returnRecall, returnPrecision, returnF1
    and returnAccuracy are placed side by side, in that order.

    Returns the combined one-row DataFrame indexed by ``classifier``.
    """
    metric_builders = (returnRecall, returnPrecision, returnF1, returnAccuracy)
    pieces = [build(y_truth, y_pred, classifier) for build in metric_builders]
    return pd.concat(pieces, axis=1)

def runModels(X, y, kf, names, classifiers):
    """Run k-fold cross-validation for every classifier and collect the metrics.

    For each (name, classifier) pair and each train/test split produced by
    ``kf.split(X)``, the classifier is fitted on the training rows, used to
    predict the test rows, and evaluated with returnMetrics plus the
    classifier's own ``score`` method.

    Parameters:
        X: feature data; must support indexing with the index arrays yielded
           by ``kf.split`` (``X[train_index]``).
        y: labels; indexed the same way as ``X``.
        kf: splitter object exposing ``split(X)``.
        names: display names, paired positionally with ``classifiers``.
        classifiers: objects exposing ``fit``, ``predict`` and ``score``.

    Returns a DataFrame with one row per classifier per fold, indexed by the
    classifier name; an empty DataFrame if nothing was run.
    """
    fold_frames = []
    for name, clf in zip(names, classifiers):
        for train_index, test_index in kf.split(X):
            clf.fit(X[train_index], y[train_index])
            predicted = clf.predict(X[test_index])
            metrics = returnMetrics(y[test_index], predicted, name)
            score = pd.DataFrame(
                {"score": clf.score(X[test_index], y[test_index])},
                index=[name],
            )
            fold_frames.append(pd.concat([metrics, score], axis=1))

    # pd.concat refuses an empty list, so keep returning an empty frame
    # when there are no classifiers or no folds.
    if not fold_frames:
        return pd.DataFrame([])
    # Concatenate once: DataFrame.append no longer exists in pandas 2.x and
    # re-copied the whole frame on every fold.
    return pd.concat(fold_frames)

def MLmodels(X, Y, nsplits):
    """Run the fixed set of classifiers with k-fold cross-validation.

    Each classifier is evaluated by runModels on ``nsplits`` folds created by
    ``KFold(n_splits=nsplits)``.

    Parameters:
        X: feature data, passed through to runModels (which indexes it with
           the fold index arrays).
        Y: labels, passed through to runModels and indexed like ``X``.
        nsplits: number of cross-validation folds.

    Returns the DataFrame produced by runModels: one row per classifier per
    fold, indexed by the classifier name.
    """
    ###########################################
    #Define classifiers
    ###########################################
    # Each name is declared next to its estimator so that a label can never
    # drift away from the model it describes.
    models = [
        ("Decision Tree", tree.DecisionTreeClassifier()),
        ("Logistic", LogisticRegression()),
        ("SVM", svm.SVC()),
        ("Random Forest", RandomForestClassifier()),
        ("ExtraTrees", ExtraTreesClassifier()),
    ]
    names = [name for name, _ in models]
    classifiers = [clf for _, clf in models]

    ###########################################
    #Create Train/Test Split
    ###########################################
    kf = KFold(n_splits=nsplits)

    ###########################################
    #Run Models
    ###########################################
    # Use the Y argument itself, not a notebook-level variable named y.
    return runModels(X, Y, kf, names, classifiers)
    
    

  