## Import Modules

In [64]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from scipy.sparse import csr_matrix
from scipy.sparse import hstack, vstack

## Function Definitions

In [68]:
# Input CSV, given relative to the current working directory; loaded into `df`,
# which the function calls in the cells below read from.
DATA_PATH = "../info-status/games-data-20180412.csv"
df = pd.read_csv(DATA_PATH)

def get_labels(df):
    """Derive binary labels from the ``word_tobi_pitch_accent`` column.

    A row is labelled 0 when its ``word_tobi_pitch_accent`` value is exactly
    ``"*?"`` or ``"_"``; every other value (missing values included, since
    they match neither marker) is labelled 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``word_tobi_pitch_accent`` column.

    Returns
    -------
    list of int
        One 0/1 label per row of ``df``, in row order.
    """
    # Vectorised membership test; avoids a Python-level loop over iterrows().
    is_zero_label = df['word_tobi_pitch_accent'].isin(["*?", "_"])
    return (~is_zero_label).astype(int).tolist()

def filter_referring_expressions(df, columns):
    """Join ``columns`` onto the POS-tag column and keep only the "NN" rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``word_pos_tag`` column; no other column of ``df``
        is carried into the result.
    columns : pandas.DataFrame or pandas.Series
        Extra columns concatenated column-wise (aligned on the index).

    Returns
    -------
    pandas.DataFrame
        ``word_pos_tag`` followed by ``columns``, restricted to rows whose
        ``word_pos_tag`` equals ``"NN"``.
    """
    pos_tags = df[['word_pos_tag']]
    combined = pd.concat([pos_tags, columns], axis=1)

    is_nn = combined['word_pos_tag'] == "NN"
    return combined.loc[is_nn]

def get_input_has_been_mentioned(df):
    """Return a 0/1 Series flagging rows whose ``Most_Recent_Mention`` is > 0.

    Missing values compare as not greater than zero and therefore yield 0.
    The result keeps the ``Most_Recent_Mention`` name and the index of ``df``.
    """
    most_recent = df['Most_Recent_Mention']
    return most_recent.gt(0).astype(int)

def get_input_num_mentions(df):
    """Return ``Number_Of_Coref_Mentions`` as non-negative integers.

    Negative values are raised to 0, missing values are replaced by 0, and
    the result is cast to ``int``.
    """
    raw_counts = df['Number_Of_Coref_Mentions']
    # Element-wise floor at zero; NaN survives this step and is filled next.
    floored = np.maximum(raw_counts, 0)
    return floored.fillna(value=0).astype(int)

def get_input_far_back_mentioned(df):
    """Return ``word_end_time`` minus ``Most_Recent_Mention`` for every row.

    A missing ``Most_Recent_Mention`` is treated as 0, so for those rows the
    result equals ``word_end_time``.
    """
    last_mention = df['Most_Recent_Mention'].fillna(value=0)
    return df['word_end_time'].sub(last_mention)

def num_tokens_intonational_phrase_prev_mention(df):
    """Count, per intonational phrase, the rows flagged as previously mentioned.

    Every row receives the number of rows sharing its
    ``Intonational_Phrase_ID`` for which ``get_input_has_been_mentioned``
    yields 1.

    The result is aligned to ``df.index``, so it lines up with the other
    per-row feature Series when concatenated column-wise. Phrase IDs do not
    need to be the integers 1..N, nor sorted or contiguous. Rows with a
    missing phrase ID belong to no group and receive a count of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Intonational_Phrase_ID`` and
        ``Most_Recent_Mention`` columns.

    Returns
    -------
    pandas.DataFrame
        Single integer column named ``Num_in_IP_Prev_Mention``.
    """
    mentioned = get_input_has_been_mentioned(df)

    # Summing the 0/1 flags within each phrase gives the count; transform()
    # broadcasts that count back onto every row of the phrase.
    per_row_counts = mentioned.groupby(df['Intonational_Phrase_ID']).transform('sum')

    # Rows excluded from every group (missing phrase ID) come back as NaN.
    return per_row_counts.fillna(0).astype(int).to_frame("Num_in_IP_Prev_Mention")

def classifyNB(x_train, x_test, y_train, y_test):
    """Fit a ``BernoulliNB`` model and print its accuracy and F1 on the test split.

    Parameters
    ----------
    x_train, x_test : array-like
        Training and test feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Training and test labels.

    Returns
    -------
    None
        Results are printed via ``print_accuracy_f1_scores``.
    """
    # Default hyper-parameters. A GridSearchCV over
    # {'alpha': np.logspace(-2., 2., 50)} with cv=10, n_jobs=-1 was tried
    # here previously and is currently disabled.
    clf = BernoulliNB()
    clf.fit(x_train, y_train)

    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)
    print_accuracy_f1_scores("Naive Bayes", clf, x_test, y_test, y_pred)
    
def classifyRF(x_train, x_test, y_train, y_test, random_state=None):
    """Fit a random forest and print its accuracy and F1 on the test split.

    Parameters
    ----------
    x_train, x_test : array-like
        Training and test feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Training and test labels.
    random_state : int or None, optional
        Forwarded to the classifier constructor. The default ``None`` keeps
        the constructor's own default (same as passing nothing); pass an
        integer to make repeated runs reproducible.

    Returns
    -------
    None
        Results are printed via ``print_accuracy_f1_scores``.
    """
    clf = RFC(random_state=random_state)
    clf.fit(x_train, y_train)

    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)
    print_accuracy_f1_scores("Random Forest", clf, x_test, y_test, y_pred)
    
def classifyLR(x_train, x_test, y_train, y_test):
    """Fit a ``LogisticRegression`` model and print its accuracy and F1 on the test split.

    Parameters
    ----------
    x_train, x_test : array-like
        Training and test feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Training and test labels.

    Returns
    -------
    None
        Results are printed via ``print_accuracy_f1_scores``.
    """
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)
    print_accuracy_f1_scores("LR", clf, x_test, y_test, y_pred)
    
def classifySVM(x_train, x_test, y_train, y_test):
    """Fit a ``LinearSVC`` model and print its accuracy and F1 on the test split.

    Parameters
    ----------
    x_train, x_test : array-like
        Training and test feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Training and test labels.

    Returns
    -------
    None
        Results are printed via ``print_accuracy_f1_scores``.
    """
    clf = LinearSVC()
    clf.fit(x_train, y_train)

    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)
    print_accuracy_f1_scores("SVM", clf, x_test, y_test, y_pred)
    
def print_accuracy_f1_scores(classifier_name, clf, x_test, y_test, y_pred):
    """Print accuracy (as a percentage) and F1 for a fitted classifier.

    Accuracy comes from ``clf.score(x_test, y_test)``; F1 is computed by
    ``f1_score`` from ``y_test`` and the supplied ``y_pred``. A line of 80
    ``=`` characters is printed afterwards as a separator.
    """
    accuracy_pct = clf.score(x_test, y_test) * 100
    accuracy_line = classifier_name + " Accuracy : {:.2f}%".format(accuracy_pct)
    print(accuracy_line)

    f1 = f1_score(y_test, y_pred)
    f1_line = classifier_name + " F1-Score: {:.2f}".format(f1)
    print(f1_line)

    separator = "=" * 80
    print(separator)


In [69]:
# Build every per-row feature plus the labels, then place them side by side.
# NOTE(review): the far-back Series and the label Series carry no name, so
# they are expected to land under the integer column names 0 and 1 (the
# cells below index the labels as filtered_df[1]) - confirm after changes.
feature_and_label_series = [
    get_input_has_been_mentioned(df),
    get_input_num_mentions(df),
    get_input_far_back_mentioned(df),
    num_tokens_intonational_phrase_prev_mention(df),
    pd.Series(get_labels(df)),
]
columns = pd.concat(feature_and_label_series, axis=1)

# Keep only the rows whose POS tag is "NN", together with that tag.
filtered_df = filter_referring_expressions(df, columns)



In [80]:
# Single feature evaluated in this run. Alternatives tried earlier by swapping
# the column: 'Most_Recent_Mention', 'Number_Of_Coref_Mentions', and 0.
feature_column = 'Num_in_IP_Prev_Mention'
label_column = 1

features = filtered_df[feature_column]
labels = filtered_df[label_column]
x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=1)

# The classifiers are given a 2-D matrix, so turn the single feature into one column.
x_train = x_train.values.reshape(-1, 1)
x_test = x_test.values.reshape(-1, 1)

for classify in (classifyLR, classifyNB, classifyRF, classifySVM):
    classify(x_train, x_test, y_train, y_test)

LR Accuracy : 88.92%
LR F1-Score: 0.94
Naive Bayes Accuracy : 88.92%
Naive Bayes F1-Score: 0.94
Random Forest Accuracy : 88.92%
Random Forest F1-Score: 0.94
SVM Accuracy : 88.92%
SVM F1-Score: 0.94


In [76]:
# Share of the most frequent label among all rows of filtered_df, i.e. the
# accuracy of always predicting that label on this set.
label_counts = np.bincount(filtered_df[1])
majority_share = label_counts.max() / label_counts.sum()
print("Majority class baseline classifier accuracy: {:.2f}%".format(majority_share * 100))

Majority class baseline classifier accuracy: 88.80%
