In [1]:
import numpy as np
import pandas as pd
import glob 
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import tensorflow as tf

## GloVe embedding + cosine similarity

In [3]:
# Load the pre-trained GloVe vectors into a dict: token -> 1-D float32 array.
# Each line of the file is "<token> <v1> <v2> ... <vN>", whitespace separated.
embedding_index = {}

# The GloVe text file contains non-ASCII tokens, so decode it explicitly as
# UTF-8 instead of relying on the platform default encoding. The `with` block
# also guarantees the handle is closed if parsing fails part-way through.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print('found word vecs: ',len(embedding_index))

found word vecs:  400000


In [4]:
# Per-debate metadata. The For_Main_Points / against_Main_Points columns hold
# Python list literals stored as strings (parsed later with ast.literal_eval)
# and are empty (NaN) for debates without annotated main points.
main_points =pd.read_csv('DebateStar/Meta Data/metadata_appended_main_points.csv') 
main_points.head()

Unnamed: 0,id,title,date,for,against,For_Main_Points,against_Main_Points
0,d20191112,Capitalism Is a Blessing,2019-11-12,"['John Mackey', 'Katherine Mangu-Ward']","['Bhaskar Sunkara', 'Richard D. Wolff']",['By promoting market competition and rewardin...,['Capitalism serves the interests of large cor...
1,d20191029,Parenting Is Overrated,2019-10-29,"['Robert Plomin', 'Nancy Segal']","['Paige Harden', 'Ann Pleshette Murphy']","[""We're in the midst of a DNA revolution: Whil...","['While DNA is important, factors like familia..."
2,d20191022,Europe Has Declared War on American Tech Compa...,2019-10-22,"['Roslyn Layton', 'Berin Szóka']","['Marietje Schaake', 'Ramesh Srinivasan']",['European regulators have declared war on Ame...,['Brussels isn’t waging war on Silicon Valley....
3,d20190917,Replace Private Insurance with Medicare for All,2019-09-17,"['Dr. Adam Gaffney', 'Joseph Sanberg']","['Nick Gillespie', 'Sally Pipes']",['The United States government should follow t...,['Individuals should have the freedom to choos...
4,d20190912,Unresolved: Shifting Power in the Middle East,2019-09-12,"['Michael Doran', 'Reuel Marc Gerecht', 'Berna...","['Brett McGurk', 'Barbara Slavin']",,


In [5]:
# Audience-vote results for the live and the online polls. Later cells read
# the debate `id` and the recorded `winner` from both frames.
result_live = pd.read_csv('results_data/final_live.csv')
result_online = pd.read_csv('results_data/final_online.csv')
result_live.head()

Unnamed: 0,for the against side to against,for the for side to for,from the against side to for,from the against side to undecided,from the for side to against,from the for side to undecided,from undecided to against,from undecided to for,post-debate-against,post-debate-for,post-debate-undecided,pre-debate-against,pre-debate-for,pre-debate-undecided,undecided to undecided,url,winner,id
0,17,49,2,2,5,2,6,11,28,62,10,21,56,23,6,https://www.intelligencesquaredus.org/debates/...,against,d20191112
1,40,15,8,4,9,3,10,9,59,32,9,52,27,21,2,https://www.intelligencesquaredus.org/debates/...,against,d20191029
2,19,14,3,1,15,1,30,13,64,30,6,23,30,47,4,https://www.intelligencesquaredus.org/debates/...,against,d20191022
3,29,26,5,1,6,4,16,9,51,40,9,35,36,29,4,https://www.intelligencesquaredus.org/debates/...,against,d20190917
4,49,12,1,1,14,0,20,2,83,15,2,51,26,23,1,https://www.intelligencesquaredus.org/debates/...,against,d20190802


In [7]:
# Load the transcript of a single debate as a worked example; each row pairs
# a `side` label ('for' / 'against') with a `script` text.
fid = 'd20061129'
df = pd.read_csv('For Against Scripts/for_against_scripts_' + fid + '.csv')
df.head()

Unnamed: 0,side,script
0,for,Thank you very much for being here this evenin...
1,against,Providing arms to the other side. The bottom l...
2,for,"Madam Chairperson, ladies and gentlemen. Uh we..."
3,against,Thank you very much Ms. Woodruff. You can add ...
4,for,Thank you very much. Can you hear me? Now star...


In [8]:
# Characters deleted from text before tokenising (string.punctuation).
punctuations = string.punctuation
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [9]:
def cleanText(text):
    """Tokenise `text` into lowercase words without punctuation or stop words.

    Every character listed in the module-level `punctuations` string is
    deleted, the remainder is lowercased and split on whitespace, and tokens
    present in the module-level `stop_words` set are dropped.

    Returns a list of str, in their original order.
    """
    strip_table = str.maketrans('', '', punctuations)
    tokens = text.translate(strip_table).lower().split()
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [10]:
def get_side_script(df):
    """Concatenate the `script` texts of each side of a debate.

    Parameters
    ----------
    df : DataFrame with a `side` column ('for' / 'against') and a `script`
        column of strings.

    Returns
    -------
    (for_script, against_script) : tuple of str. Each is the side's scripts in
    row order, every script followed by a single space (so a non-empty result
    ends with a trailing space, and a side with no rows yields '').
    """
    def _joined(label):
        # Select the rows of one side and glue their scripts together.
        pieces = df.loc[df['side'] == label]['script']
        return ''.join(piece + ' ' for piece in pieces)

    return _joined('for'), _joined('against')

In [11]:
def sentence_vec(sentence):
    """Embed `sentence` as the mean embedding of its known tokens.

    The sentence is tokenised with `cleanText`; every token present in the
    module-level `embedding_index` contributes its vector, and the vectors are
    averaged. Tokens without an embedding are ignored.

    Returns
    -------
    numpy.ndarray of shape (1, dim), where dim is the length of the embedding
    stored for 'the'. If no token has an embedding the row is all zeros.
    """
    vector = np.zeros(embedding_index['the'].shape)
    count = 0
    for token in cleanText(sentence):
        # Explicit lookup instead of a bare `except:` so that only
        # out-of-vocabulary tokens are skipped and real errors still surface.
        coefs = embedding_index.get(token)
        if coefs is None:
            continue
        vector += coefs
        count += 1
    if count:
        vector = vector / count
    return vector.reshape(1, len(vector))

In [12]:
def get_main_points(fid, main_points):
    """Return the parsed main points of both sides for debate `fid`.

    Parameters
    ----------
    fid : debate id to look up in the `id` column.
    main_points : DataFrame with `id`, `For_Main_Points` and
        `against_Main_Points` columns; the latter two hold Python literals
        stored as strings.

    Returns
    -------
    (for_points, against_points) : the two cells parsed with
    ast.literal_eval.

    Raises
    ------
    ValueError if `fid` does not match exactly one row (from `.item()`), or if
    a cell is not a valid Python literal.
    """
    row = main_points.loc[main_points['id'] == fid]
    parsed = []
    for column in ('For_Main_Points', 'against_Main_Points'):
        parsed.append(ast.literal_eval(row[column].item()))
    return parsed[0], parsed[1]

In [18]:
def sim_vector(sentence, side, for_points, against_points, max_len=6):
    """Cosine similarity of `sentence` to every main point, signed by side.

    Parameters
    ----------
    sentence : text to compare (a single turn or a whole concatenated script).
    side : 'for' returns the similarities as-is; any other value returns them
        negated.
    for_points, against_points : sequences of main-point strings.
    max_len : number of slots reserved per side (default 6). Sides with fewer
        points are right-padded with 0 so the result has a fixed length.

    Returns
    -------
    list of length 2 * max_len: the `max_len` "for" slots followed by the
    `max_len` "against" slots.

    Raises
    ------
    AssertionError if either side has more than `max_len` points.
    """
    for_len = len(for_points)
    against_len = len(against_points)
    assert max_len >= for_len, str(for_len)
    assert max_len >= against_len, str(against_len)

    # Embed the sentence once: it does not change between main points, and
    # re-tokenising a whole debate script per point is the dominant cost.
    sent_vec = sentence_vec(sentence)

    def _padded_sims(points):
        # Similarity to each point, padded with zeros up to max_len slots.
        sims = [cosine_similarity(sent_vec, sentence_vec(point)).item()
                for point in points]
        return sims + [0] * (max_len - len(points))

    vec = _padded_sims(for_points) + _padded_sims(against_points)
    if side == 'for':
        return vec
    return [-a for a in vec]

In [22]:
# Net score for one debate: "for" similarities plus the (negated) "against"
# similarities, summed over all slots.
# NOTE(review): for_script, against_script, for_points and against_points are
# only assigned inside the batch loop that appears further down this notebook,
# so this cell relies on that loop having been run first (it would raise
# NameError on a fresh top-to-bottom run) and uses whatever debate the loop
# processed last.
(np.array(sim_vector(for_script, 'for', for_points, against_points)) + \
np.array(sim_vector(against_script, 'against', for_points, against_points))).sum()

0.021241144491519925

In [19]:
# For every debate that has annotated main points and a readable transcript,
# collect the similarity scores of both sides and the recorded winners.
# The three lists are index-aligned: entry k of each refers to the same debate.
totals = []
winners_live = []
winners_online = []
for _, row in main_points.iterrows():
    # Debates without annotated main points (NaN cells) cannot be scored.
    if pd.isnull(row['For_Main_Points']) or pd.isnull(row['against_Main_Points']):
        continue
    fid = row['id']
    try:
        scripts = pd.read_csv('For Against Scripts/for_against_scripts_' + fid + '.csv')
    except (OSError, ValueError):
        # OSError: transcript file missing or unreadable. ValueError: pandas'
        # EmptyDataError / ParserError (both ValueError subclasses).
        # Report the id and move on; any other error should surface.
        print(fid)
        continue
    if len(scripts) == 0:
        continue
    for_script, against_script = get_side_script(scripts)
    for_points, against_points = get_main_points(fid, main_points)
    # List concatenation, not element-wise addition: `total` holds the "for"
    # script's vector followed by the negated "against" script's vector.
    # Downstream code only uses sum(total), which is the same either way.
    total = sim_vector(for_script, 'for', for_points, against_points) + \
        sim_vector(against_script, 'against', for_points, against_points)
    # Resolve both winners before appending anything, so a failed lookup
    # cannot leave the three lists with different lengths.
    winner_live = result_live.loc[result_live['id'] == fid]['winner'].item()
    winner_online = result_online.loc[result_online['id'] == fid]['winner'].item()
    totals.append(total)
    winners_live.append(winner_live)
    winners_online.append(winner_online)

In [33]:
# Inspect the collected live-audience winners (one label per scored debate).
winners_live

['against',
 'against',
 'against',
 'against',
 'against',
 'for',
 'against',
 'for',
 'for',
 'against',
 'for',
 'against',
 'against',
 'against',
 'against',
 'for',
 'against',
 'against',
 'against',
 'for',
 'against',
 'against',
 'against',
 'for',
 'against',
 'against',
 'against',
 'for',
 'for',
 'against',
 'against',
 'for',
 'for',
 'against',
 'against',
 'against',
 'against',
 'for',
 'against',
 'against',
 'against',
 'for',
 'against',
 'against',
 'against',
 'against',
 'for',
 'for',
 'against',
 'for',
 'against',
 'against',
 'against',
 'for',
 'for',
 'against',
 'against',
 'against',
 'against',
 'against',
 'for',
 'against',
 'against',
 'against',
 'for',
 'for',
 'against',
 'for',
 'against',
 'against',
 'for',
 'for',
 'for',
 'against',
 'against',
 'for',
 'for',
 'against',
 'for',
 'for',
 'for',
 'for',
 'for',
 'against',
 'for',
 'against',
 'for',
 'against',
 'against',
 'for',
 'for',
 'for',
 'against',
 'against',
 'for',
 'against',


In [34]:
def prediction_hits(totals, winners):
    """Score the sign rule against recorded winners.

    A debate is predicted 'for' when the sum of its similarity scores is
    positive and 'against' otherwise. Debates whose recorded winner is
    'undecided' are skipped.

    Parameters
    ----------
    totals : sequence of per-debate score lists.
    winners : sequence of 'for' / 'against' / 'undecided', aligned with
        `totals`.

    Returns
    -------
    list of bool, one per decided debate: True where the prediction matches
    the recorded winner.

    Raises
    ------
    ValueError if `totals` and `winners` differ in length.
    """
    if len(totals) != len(winners):
        raise ValueError('totals and winners must have the same length')
    hits = []
    for total, winner in zip(totals, winners):
        if winner == 'undecided':
            continue
        score = sum(total)
        if score > 0:
            hits.append(winner == 'for')
        elif score <= 0:
            hits.append(winner == 'against')
        else:
            # Only reachable when the score is NaN (both comparisons False).
            print('error')
    return hits


# Hits of the sign rule against the live-audience winners.
predictions = prediction_hits(totals, winners_live)

In [35]:
# Accuracy of the sign rule on decided live debates (mean of the boolean hits).
np.mean(predictions)

0.5882352941176471

In [38]:
# Share of each live outcome among the scored debates, as a baseline to read
# the accuracy against.
pd.Series(winners_live).value_counts(normalize = True)

against      0.516556
for          0.384106
undecided    0.099338
dtype: float64

In [39]:
# Sign-rule hits against the online-audience winners: a positive score sum
# predicts 'for', anything else predicts 'against'; undecided debates are
# left out of the tally.
predictions = []
for i, total in enumerate(totals):
    winner = winners_online[i]
    if winner != 'undecided':
        score = sum(total)
        if score > 0:
            predictions.append(winner == 'for')
        elif score <= 0:
            predictions.append(winner == 'against')
        else:
            print('error')

In [40]:
# Accuracy of the sign rule on decided online debates (mean of the boolean hits).
np.mean(predictions)

0.603448275862069

In [41]:
# Share of each online outcome among the scored debates, as a baseline to
# read the accuracy against.
pd.Series(winners_online).value_counts(normalize = True)

against      0.397351
for          0.370861
undecided    0.231788
dtype: float64

In [144]:
# Accumulate the per-turn similarity vectors of the example debate in `df`
# and report turns whose vector is entirely zero.
# NOTE(review): for_points / against_points are whatever an earlier cell left
# in the namespace -- confirm they belong to the same debate as `df`.
total = None
for i, (side, script) in enumerate(zip(df['side'], df['script'])):
    if not isinstance(script, str):
        # e.g. a NaN cell for an empty turn: report the row and skip it
        # explicitly instead of hiding every failure behind a bare except.
        print(i)
        continue
    # Take the width from sim_vector's own output rather than hard-coding it,
    # so this cell keeps working when the number of slots per side changes.
    vec = np.array(sim_vector(script, side, for_points, against_points),
                   dtype=float).reshape((1, -1))
    if vec.sum() == 0:
        print(side, i)
    total = vec if total is None else total + vec
if total is None:
    # No usable turns at all: keep `total` an array so total.sum() still works.
    total = np.zeros((1, 0))


against 13
for 48
for 62


In [146]:
# Net score of the example debate summed over all turns and all slots.
total.sum()

0.29452330592472986

In [122]:
# Inspect turn 13, one of the turns reported above with an all-zero vector.
# NOTE(review): the recorded output has 8 entries, some of them negative
# zeros, although side='for'. The sim_vector defined in this notebook returns
# 2 * max_len = 12 entries and does not negate for side='for', so this output
# appears to come from an earlier version of the function -- re-run to confirm.
sim_vector(df['script'][13], 'for', for_points, against_points)

[0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0]