In [1]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

In [2]:
import pandas as pd
import numpy as np
import math

In [61]:
# Get all documents as every debate paragraphs over all debates
all_documents = []
import os
for filename in os.listdir('../For Against Scripts'):
    file = pd.read_csv('../For Against Scripts/' + filename)
    for script in file.script:
        all_documents.append(script)

In [62]:
all_documents = pd.Series(all_documents).apply(lambda x: x.lower()).tolist()

In [63]:
def idf(w, all_documents):
    df = sum([w in x for x in all_documents])
    N = len(all_documents)
    try:
        return math.log(N/df)
    except:
        return 0

In [122]:
brown_ic = wordnet_ic.ic('ic-brown.dat')
def sim(w1, w2, brown_ic):
    ## error will occur when words passed in is invalid word
    try:
        w1_syn = wn.synsets(w1, pos=wn.NOUN)[0]
        w2_syn = wn.synsets(w2, pos=wn.NOUN)[0]
        sim_value = w1_syn.res_similarity(w2_syn, brown_ic)
        if sim_value > 100:
            return 0
        else:
            return sim_value
    except:
        return 0

In [66]:
## Calculate the max similarity between a word and a sentence
##     - w: a word in string
##     - T: a sentence in string
def maxSim(w, T, brown_ic):
    return pd.Series([sim(w, word, brown_ic) for word in T.split()]).max()

In [123]:
## Calculate the similarity between two sentences
##     - T1: a sentence in string
##     - T2: a sentence in string
def sim_Sentences(T1, T2, idfs1, idfs2):
    T1 = T1.lower()
    T2 = T2.lower()
    max_sims_1 = np.array([maxSim(w1, T2, brown_ic) for w1 in T1.split()])
    numberator1 = sum(max_sims_1*np.array(idfs1))
    denominator1 = sum(idfs1)
    max_sims_2 = np.array([maxSim(w2, T1, brown_ic) for w2 in T2.split()])
    numberator2 = sum(max_sims_2*np.array(idfs2))
    denominator2 = sum(idfs2)

    toReturn = (numberator1/denominator1 + numberator2/denominator2)/2
    return toReturn

In [68]:
live_result = pd.read_csv('../results_data/final_live.csv')
online_result = pd.read_csv('../results_data/final_online.csv')

In [165]:
pred_results = []
online_winners = []
live_winners = []

for test_id in live_result.id:
    print(test_id)
    main_points = open('../Main Points/main_points_' + test_id).read().split('\n')

    break_point = main_points.index("Against The Motion")
    for_main_points = main_points[1:break_point]
    against_main_points = main_points[break_point+1:]
    script_df = pd.read_csv('../For Against Scripts/' + 'for_against_scripts_' + test_id + '.csv')

    try:
        for_main_points.remove('')
    except:
        print('Nothing to be removed from for')
    try:
        against_main_points.remove('')
    except:
        print('Nothing to be removed from against')
    
    ## create idfs ahead to avoid duplicated computation in sim_Sentences
    ##     - S: the sentence whose idf of each word is to be calculated 
    ##     - all_documents: document corpus that idf is relied upon.
    def idfs_cal(S, all_documents):
        idfs = [idf(w, all_documents) for w in S.split()]
        return idfs

    idfs_for = [idfs_cal(s, all_documents) for s in for_main_points]
    idfs_against = [idfs_cal(s, all_documents) for s in against_main_points]
    idfs_passage = [idfs_cal(s, all_documents) for s in script_df.script]
    
    # We have a maximum of 6 main points for each side
    sim_results = []
    for i in script_df.index:
        sim_result_one_row = []
        side = script_df.loc[i].side

        target_passage = script_df.loc[i].script

        for j in range(len(for_main_points)):
            point = for_main_points[j]
            sentence_sim_score = sim_Sentences(point, target_passage, idfs_for[j], idfs_passage[i])
            if side == 'for':
                sim_result_one_row.append(sentence_sim_score)
            else:
                sim_result_one_row.append(-sentence_sim_score)

        for j in range(len(against_main_points)):
            point = against_main_points[j]
            sentence_sim_score = sim_Sentences(point, target_passage, idfs_against[j], idfs_passage[i])
            if side == 'for':
                sim_result_one_row.append(-sentence_sim_score)
            else:
                sim_result_one_row.append(sentence_sim_score) 
        sim_results.append(sim_result_one_row)
        
    online_winner = online_result[online_result['id'] == test_id].winner
    live_winner = live_result[live_result['id'] == test_id].winner
    pred_result = pd.Series([sum(row) for row in sim_results]).dropna().sum()
    
    online_winners.append(online_winner)
    live_winners.append(live_winner)
    pred_results.append(pred_result)

d20191112
Nothing to be removed from for
d20191029
Nothing to be removed from for




d20191022
Nothing to be removed from for
d20190917
Nothing to be removed from for
d20190802
Nothing to be removed from for




d20190628
Nothing to be removed from for
d20190418
Nothing to be removed from for
d20190413
Nothing to be removed from for
d20190328
Nothing to be removed from for
d20190304
Nothing to be removed from for
d20190211
Nothing to be removed from for
d20190131
Nothing to be removed from for
d20190116
Nothing to be removed from for
d20181111
Nothing to be removed from for
d20181101
Nothing to be removed from for
d20181011
Nothing to be removed from for
d20180926
Nothing to be removed from for
d20180628
Nothing to be removed from for
d20180626
Nothing to be removed from for
d20180623
Nothing to be removed from for
d20180514
Nothing to be removed from for
d20180502
Nothing to be removed from for
d20180421
Nothing to be removed from for
d20180417
Nothing to be removed from for
d20180327
Nothing to be removed from for
d20180309
Nothing to be removed from for
d20180206
Nothing to be removed from for
d20171207
Nothing to be removed from for
d20171024
Nothing to be removed from for
d20171003
Nothin

ZeroDivisionError: division by zero

In [177]:
online_winners = [x.tolist() for x in online_winners]

In [184]:
online_winners = np.array(online_winners).reshape(1,-1).tolist()

In [186]:
online_winners = online_winners[0]

In [189]:
filter_online = [x != 'undecided' for x in online_winners]

In [217]:
pred_results_bool = (np.array(pred_results) > 0)
pred_results_bool = np.where(pred_results_bool == True, 'against', pred_results_bool)
pred_results_bool = np.where(pred_results_bool == 'False', 'for', pred_results_bool)

In [218]:
## online accuracy
valid_online = pd.Series(online_winners)[filter_online]
valid_result = pd.Series(pred_results_bool)[filter_online]

In [219]:
sum(valid_online == valid_result)/len(valid_online)

0.5428571428571428

In [220]:
#live_winners = [x.tolist() for x in live_winners]
#live_winners = np.array(live_winners).reshape(1,-1).tolist()
#live_winners = live_winners[0]
filter_live = [x != 'undecided' for x in live_winners]
## live accuracy
valid_live = pd.Series(live_winners)[filter_live]
valid_result = pd.Series(pred_results_bool)[filter_live]

In [221]:
sum(valid_live == valid_result)/len(valid_live)

0.5213675213675214