## Import Modules

In [106]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from scipy.sparse import csr_matrix
from scipy.sparse import hstack, vstack
from sklearn.preprocessing import StandardScaler

## Data Loading and Function Definitions

In [107]:
# Load the data set from a CSV file; the path is relative to the notebook's working directory.
df = pd.read_csv("../info-status/games-data-20180420.csv")
# Part-of-speech tags that filter_referring_expressions keeps (matched against the 'word_pos_tag' column).
REFERRING_EXP_POS = ["NN", "NNS", "NNP", "NNPS", "PDT", "CD", "POS", "PRP", "PRP$"]

def filter_referring_expressions(df):
    """Return the rows of ``df`` whose ``word_pos_tag`` is listed in ``REFERRING_EXP_POS``."""
    is_referring = df['word_pos_tag'].isin(REFERRING_EXP_POS)
    return df.loc[is_referring]

# Keep only the rows selected by filter_referring_expressions. This rebinds `df`,
# so the unfiltered frame is no longer reachable after this line.
df = filter_referring_expressions(df)

def get_labels(df):
    """Build the binary accent label for every row of ``df``.

    A row is labelled 0 when its ``word_tobi_pitch_accent`` value is ``"*?"``
    or ``"_"`` and 1 for any other value (missing values included).

    :param df: DataFrame with a ``word_tobi_pitch_accent`` column.
    :return: single-column DataFrame ``Labels`` of ints with a fresh 0..n-1
        index, in the same row order as ``df``.
    """
    # Vectorised membership test instead of a Python-level iterrows() loop.
    unaccented = df['word_tobi_pitch_accent'].isin(["*?", "_"])

    # to_numpy() discards the input index so the result is purely positional
    # (0..n-1), which is what a plain list of labels would produce.
    labels = (~unaccented).to_numpy().astype(int)
    return pd.Series(labels, dtype=int).to_frame("Labels")

def get_input_has_been_mentioned(df):
    """Flag rows whose ``Most_Recent_Mention`` is strictly positive.

    :return: single-column DataFrame ``Has_Been_Mentioned`` holding 1 or 0,
        indexed like ``df``.
    """
    was_mentioned = df['Most_Recent_Mention'] > 0
    flags = was_mentioned.astype(int)
    return flags.to_frame("Has_Been_Mentioned")

def get_input_num_mentions(df):
    """Return ``Number_Of_Coref_Mentions`` as a non-negative integer feature.

    Negative counts are raised to 0, missing counts are replaced by 0, and the
    result is cast to int.

    :return: single-column DataFrame ``Num_Mentions``, indexed like ``df``.
    """
    raw_counts = df['Number_Of_Coref_Mentions']
    non_negative = np.maximum(raw_counts, 0)
    filled = non_negative.fillna(value=0)
    return filled.astype(int).to_frame('Num_Mentions')

def get_input_far_back_mentioned(df):
    """Return ``word_end_time - Most_Recent_Mention`` for every row.

    Rows where the difference is missing are filled with the most negative
    finite float32 value.

    :return: single-column DataFrame ``Far_Back_Mentioned``, indexed like ``df``.
    """
    # NOTE(review): the sentinel is about -3.4e38; a magnitude this large may
    # dominate or overflow sums in downstream estimators - confirm it is intended.
    missing_fill = np.finfo('float32').min
    gap = df['word_end_time'] - df['Most_Recent_Mention']
    return gap.fillna(value=missing_fill).to_frame('Far_Back_Mentioned')

def num_tokens_intonational_phrase_prev_mention(df, use_percentage=False):
    '''
    Get percentage of all referring expressions in an intonational phrase that have been mentioned before
    or determine whether all referring expressions in an IP have been mentioned before.
    The per-phrase value is repeated on every row that belongs to the phrase.

    :param df Dataframe to filter from; must contain ``Intonational_Phrase_ID`` and
        whatever ``get_input_has_been_mentioned`` reads.
    :param bool use_percentage Flag that determines whether to calculate percentage or not.
        When False, a row gets 1 if every row of its phrase has been mentioned, else 0.
    :return Single-column DataFrame ``IP_Prev_Mentions`` of floats with a fresh 0..n-1 index,
        one row per row of ``df`` in the same order. Rows with a missing phrase id get 0.
    '''
    phrase_ids = df['Intonational_Phrase_ID']
    mentioned = get_input_has_been_mentioned(df)['Has_Been_Mentioned']

    # One aggregate per phrase: the mean of a 0/1 flag is the mentioned
    # fraction, and its minimum is 1 only when every flag in the phrase is 1.
    flags_by_phrase = mentioned.groupby(phrase_ids)
    per_phrase = flags_by_phrase.mean() if use_percentage else flags_by_phrase.min()

    # Look up each row's own phrase so the output follows the row order of
    # ``df``. Concatenating group by group would instead follow sorted phrase
    # ids and skip rows without an id, misaligning any positional join.
    per_row = phrase_ids.map(per_phrase).astype(float).fillna(0)

    return pd.Series(per_row.to_numpy()).to_frame("IP_Prev_Mentions")

def classifyNB(x_train, x_test, y_train, y_test):
    """Fit a Bernoulli naive Bayes model on the training split and print its test report.

    :param x_train: training feature matrix passed to ``fit``.
    :param x_test: test feature matrix used for prediction and scoring.
    :param y_train: training labels.
    :param y_test: test labels the predictions are compared against.
    """
    clf = BernoulliNB()
    clf.fit(x_train, y_train)
    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)

    print_accuracy_f1_scores("Naive Bayes", clf, x_test, y_test, y_pred)
    
def classifyRF(x_train, x_test, y_train, y_test, random_state=1):
    """Fit a random forest on the training split and print its test report.

    :param x_train: training feature matrix passed to ``fit``.
    :param x_test: test feature matrix used for prediction and scoring.
    :param y_train: training labels.
    :param y_test: test labels the predictions are compared against.
    :param random_state: seed forwarded to the forest so repeated runs print
        the same numbers; pass ``None`` for an unseeded model.
    """
    clf = RFC(random_state=random_state)
    clf.fit(x_train, y_train)
    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)

    print_accuracy_f1_scores("Random Forest", clf, x_test, y_test, y_pred)
    
def classifyLR(x_train, x_test, y_train, y_test):
    """Fit a logistic regression on the training split and print its test report.

    :param x_train: training feature matrix passed to ``fit``.
    :param x_test: test feature matrix used for prediction and scoring.
    :param y_train: training labels.
    :param y_test: test labels the predictions are compared against.
    """
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)

    print_accuracy_f1_scores("LR", clf, x_test, y_test, y_pred)
    
def classifySVM(x_train, x_test, y_train, y_test, random_state=1):
    """Fit a linear SVM on the training split and print its test report.

    :param x_train: training feature matrix passed to ``fit``.
    :param x_test: test feature matrix used for prediction and scoring.
    :param y_train: training labels.
    :param y_test: test labels the predictions are compared against.
    :param random_state: seed forwarded to ``LinearSVC`` so repeated runs print
        the same numbers; pass ``None`` for an unseeded model.
    """
    clf = LinearSVC(random_state=random_state)
    clf.fit(x_train, y_train)
    # Predict once and hand the same predictions to the report.
    y_pred = clf.predict(x_test)

    print_accuracy_f1_scores("SVM", clf, x_test, y_test, y_pred)
    
def print_accuracy_f1_scores(classifier_name, clf, x_test, y_test, y_pred):
    """Print accuracy, F1, the classification report and per-class prediction counts.

    :param str classifier_name: label prefixed to the accuracy and F1 lines.
    :param clf: fitted estimator; ``clf.score(x_test, y_test)`` supplies the accuracy.
    :param x_test: test feature matrix passed to ``clf.score``.
    :param y_test: true labels.
    :param y_pred: predicted labels as non-negative ints (0 is reported as
        unaccented, 1 as accented).
    """
    print(classifier_name + " Accuracy : {:.2f}%".format(clf.score(x_test, y_test) * 100))
    print(classifier_name + " F1-Score: {:.2f}".format(f1_score(y_test, y_pred)))
    print(classification_report(y_test, y_pred))
    # minlength=2 guarantees a slot for class 1 even when every prediction is
    # 0; without it count[1] raises IndexError in that case.
    count = np.bincount(y_pred, minlength=2)
    print("Unaccented Count:", count[0])
    print("Accented Count:", count[1])

    print(80 * "=")


In [108]:
# Assemble the feature/label table. The first three features keep df's index,
# so they are joined before the index is reset; the phrase-level feature and
# the labels come back with a 0..n-1 index and are joined afterwards.
columns = (
    get_input_has_been_mentioned(df)
    .join(get_input_num_mentions(df))
    .join(get_input_far_back_mentioned(df))
    .reset_index(drop=True)
    .join(num_tokens_intonational_phrase_prev_mention(df))
    .join(get_labels(df))
)



In [109]:
def run_classifiers_on_columns(features, labels):
    """Train and report all four classifiers on the given feature columns.

    :param features: non-empty list of named pandas Series, one per feature,
        all sharing the index of ``labels``.
    :param labels: Series of class labels.
    :raises ValueError: if ``features`` is empty.
    """
    if len(features) == 0:
        raise ValueError("features must contain at least one column")

    # Put the Series side by side so one feature and many features take the
    # same path; for a single Series this yields the same (n, 1) matrix as
    # reshape(-1, 1) and, with the same seed, the same split.
    feature_frame = pd.concat(features, axis=1)
    x_train, x_test, y_train, y_test = train_test_split(feature_frame, labels, random_state = 1)
    x_train = x_train.values
    x_test = x_test.values

    print()
    print("Features:", [f.name for f in features])
    classifyLR(x_train, x_test, y_train, y_test)
    classifyNB(x_train, x_test, y_train, y_test)
    classifyRF(x_train, x_test, y_train, y_test)
    classifySVM(x_train, x_test, y_train, y_test)
    print()

In [110]:
# Evaluate every feature on its own against the same labels.
for feature_name in ['Has_Been_Mentioned', 'Num_Mentions', 'Far_Back_Mentioned', 'IP_Prev_Mentions']:
    run_classifiers_on_columns([columns[feature_name]], columns['Labels'])


Features: ['Has_Been_Mentioned']
LR Accuracy : 78.73%
LR F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       975
          1       0.79      1.00      0.88      3609

avg / total       0.62      0.79      0.69      4584

Unaccented Count: 0
Accented Count: 4584
Naive Bayes Accuracy : 78.73%
Naive Bayes F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       975
          1       0.79      1.00      0.88      3609

avg / total       0.62      0.79      0.69      4584

Unaccented Count: 0
Accented Count: 4584
Random Forest Accuracy : 78.73%
Random Forest F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       975
          1       0.79      1.00      0.88      3609

avg / total       0.62      0.79      0.69      4584

Unaccented Count: 0
Accented Count: 4584
SVM Accuracy : 78.73%
SVM F1-Score: 0.88


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


SVM Accuracy : 77.84%
SVM F1-Score: 0.87
             precision    recall  f1-score   support

          0       0.19      0.01      0.02       975
          1       0.79      0.99      0.87      3609

avg / total       0.66      0.78      0.69      4584

Unaccented Count: 67
Accented Count: 4517


Features: ['Far_Back_Mentioned']
LR Accuracy : 61.26%
LR F1-Score: 0.72
             precision    recall  f1-score   support

          0       0.27      0.49      0.35       975
          1       0.82      0.65      0.72      3609

avg / total       0.71      0.61      0.64      4584

Unaccented Count: 1759
Accented Count: 2825
Naive Bayes Accuracy : 78.73%
Naive Bayes F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       975
          1       0.79      1.00      0.88      3609

avg / total       0.62      0.79      0.69      4584

Unaccented Count: 0
Accented Count: 4584
Random Forest Accuracy : 73.89%
Random Forest F1-Score: 

  'precision', 'predicted', average, warn_for)
  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)


SVM Accuracy : 40.64%
SVM F1-Score: 0.51
             precision    recall  f1-score   support

          0       0.17      0.47      0.25       975
          1       0.73      0.39      0.51      3609

avg / total       0.61      0.41      0.45      4584

Unaccented Count: 2658
Accented Count: 1926


Features: ['IP_Prev_Mentions']
LR Accuracy : 78.73%
LR F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       975
          1       0.79      1.00      0.88      3609

avg / total       0.62      0.79      0.69      4584

Unaccented Count: 0
Accented Count: 4584
Naive Bayes Accuracy : 78.73%
Naive Bayes F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       975
          1       0.79      1.00      0.88      3609

avg / total       0.62      0.79      0.69      4584

Unaccented Count: 0
Accented Count: 4584
Random Forest Accuracy : 78.73%
Random Forest F1-Score: 0.8

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [111]:
# Share of the most frequent label: the accuracy of always predicting that class.
label_counts = np.bincount(columns['Labels'])
print(
    "Majority class baseline classifier accuracy: {:.2f}%".format(
        (label_counts.max() / label_counts.sum()) * 100
    )
)

Majority class baseline classifier accuracy: 78.03%
