## Import Modules

In [94]:
import sklearn
import copy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from scipy.sparse import csr_matrix
from scipy.sparse import hstack, vstack
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint as sp_randint

## Function Definitions

In [95]:
df = pd.read_csv("../info-status/games-data-20180420.csv")

# Part of Speech tags that are considered to be referring expressions
REFERRING_EXP_POS = ["NN", "NNS", "NNP", "NNPS", "PDT", "CD", "POS", "PRP", "PRP$"]

# List of features to extract from the table
FEATURE_LIST = ['tree_depth', 'tree_width', 'word_depth', 'constituent_width', 'constituent_label',
                'constituent_forward_position', 'constituent_backward_position',
                'word_end_time', 'Intonational_Phrase_ID', 'Number_Of_Coref_Mentions',
                'Most_Recent_Mention', 'word_pos_tag',
                'word_tobi_break_index', 'word_tobi_pitch_accent']


def filter_referring_expressions(df):
    '''
    Returns a dataframe only containing row with referring expressions
    
    :param df dataframe to be filtered
    '''
    return df.loc[df['word_pos_tag'].isin(REFERRING_EXP_POS)]

df = df[FEATURE_LIST]
# Comment out the line below to run experiments on all tokens instead of just referring expressions
df = filter_referring_expressions(df)

def get_labels(df):
    '''
    Retrieves accented labels from each row in the table
    Token is considered to be accented if its value for column 'word_tobi_pitch_accent' is not '*?' or '_'
    
    :param df dataframe to retrieve labels from 
    '''
    accent_columns = df[['word_tobi_break_index', 'word_tobi_pitch_accent']]
    
    y_labels = []
    
    for index, row in accent_columns.iterrows():
        
        if row['word_tobi_pitch_accent'] == "*?":
            y_labels.append(0)
            
        elif row['word_tobi_pitch_accent'] == "_":
            y_labels.append(0)
            
        else:
            y_labels.append(1)
            
    return pd.Series(y_labels, dtype=int).to_frame("Labels")

def convert_pos_to_feature(df):
    '''
    Converts Part of Speech feature within a dataframe into integer representation in order to be used as a 
    feature for classifiers
    
    :param df dataframe that contains PoS column
    '''   
    pos_set = set()
    pos_dict = dict()
    converted_pos = np.array([])
    
    for i, row in df.iterrows():
        pos_set.add(row['word_pos_tag'])
        
    for idx, item in enumerate(pos_set):
        pos_dict[item] = idx
        
    for i, row in df.iterrows():
        converted_pos = np.append(converted_pos, pos_dict[row['word_pos_tag']])
        
    return pd.Series(converted_pos).to_frame('word_pos_tag')

def get_input_has_been_mentioned(df):
    '''
    Generates binary value feature column that determines whether a token as been mentioned before or not.
    '''
    return (df['Most_Recent_Mention'] > 0).astype(int).to_frame("Has_Been_Mentioned")

def get_input_num_mentions(df):
    '''
    Generates a feature with a count for how many Coreferences a token has
    '''
    return np.maximum(df['Number_Of_Coref_Mentions'], 0).fillna(value=0).astype(int).to_frame('Num_Mentions')

def get_input_far_back_mentioned(df):
    '''
    Generates a feature that determines how far back temporally a token was mentioned
    If it wasn't previously mentioned the feature value is set to the max value for float32
    '''
    return (df['word_end_time'] - df['Most_Recent_Mention']).to_frame('Far_Back_Mentioned').fillna(value=np.finfo('float32').min)

def num_tokens_intonational_phrase_prev_mention(df, use_percentage=False):
    '''
    Get percentage of all referring expressions in an intonational phrase that have been mentioned before
    or determine whehther all referring expressions in an IP have been mentioned before.
    
    Generates feature with length of the Intonational Phrase the token is a part of
    
    Generates features with the normalized position of the token within its intonational phrase
    
    :param df Dataframe to filter from
    :param bool use_percentage Flag that determines whether to calculate percentage or not.    
    '''
    
    df = pd.concat([df['Intonational_Phrase_ID'], get_input_has_been_mentioned(df)], axis=1)

    ip_id = -1
    counter = 0
    curr_ip_id = 1
    ip_series = np.array([])
    ip_length = np.array([])
    ip_pos_norm = np.array([])

    ip_groups = df.groupby(['Intonational_Phrase_ID'])
    
    group_keys = ip_groups.groups.keys()
    
    for i in group_keys:
        counter = 0
        value = 1
        for idx, row in enumerate(ip_groups.get_group(i)['Has_Been_Mentioned']):

            # Count how many referring expressions have been previously mentioned             
            if row == 1:
                counter += 1

            # Detect if a referring expression in IP has not been previously mentioned
            if row != 1:
                value = 0

            if idx > 0:
                ip_pos_norm = np.append(ip_pos_norm, idx / len(ip_groups.get_group(i)))                    
            else:
                ip_pos_norm = np.append(ip_pos_norm, 0)

        if use_percentage:
            ip_series = np.append(ip_series, np.full(len(ip_groups.get_group(i)), counter / len(ip_groups.get_group(i))))
        else:
            ip_series = np.append(ip_series, np.full(len(ip_groups.get_group(i)), value))

        ip_length = np.append(ip_length, np.full(len(ip_groups.get_group(i)), len(ip_groups.get_group(i))))

    df_ip_mentions = pd.Series(ip_series).to_frame("IP_Prev_Mentions").fillna(0)
    df_ip_length = pd.Series(ip_length).to_frame("IP_Length").fillna(0)
    df_ip_pos_norm = pd.Series(ip_pos_norm).to_frame("IP_Pos_Normalized").fillna(0)           
            
    return df_ip_pos_norm.join(df_ip_mentions.join(df_ip_length))

def get_syntactic_features(df):
    '''
    Return syntactic features from the dataframe
    
    Converts the 'constituent_label' feature to integer representation for use in classification
    '''
    
    df = df[['tree_depth', 'tree_width', 'word_depth', 'constituent_width',
             'constituent_label', 'constituent_forward_position', 'constituent_backward_position']]
    
    label_set = set()
    label_dict = dict()
    converted_labels = np.array([])
    
    for idx, row in df.iterrows():
        label_set.add(row['constituent_label'])
        
    for i, label in enumerate(label_set):
        label_dict[label] = i
                     
    for idx, row in df.iterrows():
        converted_labels = np.append(converted_labels, label_dict[row['constituent_label']])
        
    df['constituent_label'] = pd.Series(converted_labels).values
    
    return df

def classifyNB(x_train, x_test, y_train, y_test):
    '''
    Naive Bayes classifier
    '''
    
    gsclf = BernoulliNB()
    gsclf.fit(x_train, y_train)
    y_pred = gsclf.predict(x_test)

    print_accuracy_f1_scores("Naive Bayes", gsclf, x_test, y_test, gsclf.predict(x_test))
    
def classifyRF(x_train, x_test, y_train, y_test):
    '''
    Random Forest classifier
    '''
    
    # Create the random grid
    param_grid = { 
        'n_estimators': [100, 900],
        'max_features': ['auto', 'sqrt', 'log2']
    }
    
    clf = RFC(max_features = 'auto', n_estimators = 700)
    
#     clf = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 5)

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    
#     print(clf.best_params_)

    print_accuracy_f1_scores("Random Forest", clf, x_test, y_test, clf.predict(x_test))
    
def classifyLR(x_train, x_test, y_train, y_test):
    '''
    Logistic Regression classifier
    '''
    
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
    print_accuracy_f1_scores("LR", clf, x_test, y_test, clf.predict(x_test))
    
def classifySVM(x_train, x_test, y_train, y_test):
    '''
    Support vector classifier
    '''
    
    gsclf = LinearSVC()
    gsclf.fit(x_train, y_train)
    y_pred = gsclf.predict(x_test)

    print_accuracy_f1_scores("SVM", gsclf, x_test, y_test, gsclf.predict(x_test))
    
def print_accuracy_f1_scores(classifier_name, clf, x_test, y_test, y_pred):
    '''
    Print a classifier's accuracy, F1 score, and classification report
    '''
    
    print(classifier_name + " Accuracy : {:.2f}%".format(clf.score(x_test, y_test) * 100))
    print(classifier_name + " F1-Score: {:.2f}".format(f1_score(y_test, y_pred)))
    print(classification_report(y_test, y_pred))
    count = np.bincount(y_pred)
    print("Unaccented Count:", count[0])
    print("Accented Count:", count[1])
    
    print(80 * "=")


In [96]:
# Generate and join all features into one dataframe

columns = get_input_has_been_mentioned(df).join(get_input_num_mentions(df))
columns = columns.join(get_input_far_back_mentioned(df))
columns = columns.reset_index(drop=True)
columns = columns.join(num_tokens_intonational_phrase_prev_mention(df, use_percentage=False))
columns = columns.join(get_syntactic_features(df))
columns = columns.join(get_labels(df))
columns = columns.join(convert_pos_to_feature(df))
columns = columns.fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [97]:
# Run classifiers on a single feature
def run_classifiers_on_single_feature(features, labels):
    
    x_train, x_test, y_train, y_test = train_test_split(features[0], labels, random_state = 1)
    x_train = x_train.values.reshape(-1, 1)
    x_test = x_test.values.reshape(-1, 1)

    print()
    print("Features:", [f.name for f in features])
    classifyLR(x_train, x_test, y_train, y_test)
    classifyNB(x_train, x_test, y_train, y_test)
    classifyRF(x_train, x_test, y_train, y_test)
    classifySVM(x_train, x_test, y_train, y_test)
    print()

# Run classifiers on multiple features
def run_classifiers_on_multiple_features(ref_df, columns):

    x_train, x_test, y_train, y_test = train_test_split(ref_df[columns], ref_df['Labels'], random_state = 1)
    
    print()
    print("Features:", columns)
    classifyLR(x_train, x_test, y_train, y_test)
    classifyNB(x_train, x_test, y_train, y_test)
    classifyRF(x_train, x_test, y_train, y_test)
    classifySVM(x_train, x_test, y_train, y_test)
    print()
    

In [98]:
# run_classifiers_on_single_feature([columns['Has_Been_Mentioned']], columns['Labels'])
# run_classifiers_on_single_feature([columns['Num_Mentions']], columns['Labels'])
# run_classifiers_on_single_feature([columns['Far_Back_Mentioned']], columns['Labels'])
# run_classifiers_on_single_feature([columns['IP_Prev_Mentions']], columns['Labels'])

In [99]:
# run_classifiers_on_multiple_features(columns, ['Has_Been_Mentioned', 'Num_Mentions'])
# run_classifiers_on_multiple_features(columns, ['Has_Been_Mentioned', 'Far_Back_Mentioned'])
# run_classifiers_on_multiple_features(columns, ['Has_Been_Mentioned', 'IP_Prev_Mentions'])

In [100]:
# run_classifiers_on_multiple_features(columns, ['Num_Mentions', 'Far_Back_Mentioned'])
# run_classifiers_on_multiple_features(columns, ['Num_Mentions', 'IP_Prev_Mentions'])

In [101]:
# run_classifiers_on_multiple_features(columns, ['Far_Back_Mentioned', 'IP_Prev_Mentions'])

In [102]:
# run_classifiers_on_multiple_features(columns, ['Has_Been_Mentioned', 'Far_Back_Mentioned', 'IP_Prev_Mentions'])
# run_classifiers_on_multiple_features(columns, ['Has_Been_Mentioned', 'Num_Mentions', 'IP_Prev_Mentions'])
# run_classifiers_on_multiple_features(columns, ['Num_Mentions', 'Far_Back_Mentioned', 'IP_Prev_Mentions'])

In [103]:
# run_classifiers_on_multiple_features(columns, ['Has_Been_Mentioned', 'Num_Mentions', 'Far_Back_Mentioned', 'IP_Prev_Mentions'])

In [104]:
# Complete features list for use with classifiers
f_list = ['tree_depth', 'tree_width', 'word_depth', 'constituent_width',
         'Has_Been_Mentioned', 'Num_Mentions', 'Far_Back_Mentioned', 'IP_Prev_Mentions',
          'constituent_label', 'constituent_forward_position', 'constituent_backward_position', 'word_pos_tag']

run_classifiers_on_multiple_features(columns, f_list)


Features: ['tree_depth', 'tree_width', 'word_depth', 'constituent_width', 'Has_Been_Mentioned', 'Num_Mentions', 'Far_Back_Mentioned', 'IP_Prev_Mentions', 'constituent_label', 'constituent_forward_position', 'constituent_backward_position', 'word_pos_tag']
LR Accuracy : 61.26%
LR F1-Score: 0.72
             precision    recall  f1-score   support

          0       0.27      0.49      0.35       975
          1       0.82      0.65      0.72      3609

avg / total       0.71      0.61      0.64      4584

Unaccented Count: 1759
Accented Count: 2825
Naive Bayes Accuracy : 80.52%
Naive Bayes F1-Score: 0.88
             precision    recall  f1-score   support

          0       0.56      0.38      0.45       975
          1       0.85      0.92      0.88      3609

avg / total       0.79      0.81      0.79      4584

Unaccented Count: 656
Accented Count: 3928


  return umr_sum(a, axis, dtype, out, keepdims)
  return umr_sum(a, axis, dtype, out, keepdims)


Random Forest Accuracy : 84.01%
Random Forest F1-Score: 0.90
             precision    recall  f1-score   support

          0       0.66      0.51      0.58       975
          1       0.88      0.93      0.90      3609

avg / total       0.83      0.84      0.83      4584

Unaccented Count: 752
Accented Count: 3832
SVM Accuracy : 80.87%
SVM F1-Score: 0.89
             precision    recall  f1-score   support

          0       0.89      0.11      0.20       975
          1       0.81      1.00      0.89      3609

avg / total       0.82      0.81      0.74      4584

Unaccented Count: 126
Accented Count: 4458



In [105]:
label_counts = np.bincount(columns['Labels'])
print("Majority class baseline classifier accuracy: {:.2f}%".format((max(label_counts) / np.sum(label_counts)) * 100))

Majority class baseline classifier accuracy: 78.03%


In [106]:
mention_counts = np.bincount(columns['Has_Been_Mentioned'])
print(mention_counts)
print("Percentage of tokens with previous mention: {:.2f}%".format((mention_counts[1] / np.sum(mention_counts)) * 100))

[10360  7973]
Percentage of tokens with previous mention: 43.49%


In [107]:
accent_groups = columns.groupby(['Labels'])

for i in accent_groups.groups.keys():
    accent_counts = np.bincount(accent_groups.get_group(i)['Has_Been_Mentioned'])
    print(accent_counts)
    print("Percentage of {} label with previous mention: {:.2f}%".format(i, (accent_counts[1] / np.sum(accent_counts)) * 100))

[1811 2217]
Percentage of 0 label with previous mention: 55.04%
[8549 5756]
Percentage of 1 label with previous mention: 40.24%


In [108]:
accent_groups = columns.groupby(['Has_Been_Mentioned'])

accent_counts = np.bincount(accent_groups.get_group(1)['Labels'])
print(accent_counts)
print("Percentage of previously mentioned tokens that are unaccented: {:.2f}%".format((accent_counts[0] / np.sum(accent_counts)) * 100))
print("Percentage of previously mentioned tokens that are accented: {:.2f}%".format((accent_counts[1] / np.sum(accent_counts)) * 100))

[2217 5756]
Percentage of previously mentioned tokens that are unaccented: 27.81%
Percentage of previously mentioned tokens that are accented: 72.19%
