In [1]:
import ast
import numpy as np
import tensorflow as tf
from bert_serving.client import BertClient
import pandas as pd
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
# import fse

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
# Create the client used to talk to the BERT service.
# Set check_length=False because, after combining the scripts for the "for" and
# "against" sides, the text sent for encoding is very long,
# and the default setting has a max_length of 25.
# NOTE(review): check_length presumably only disables the client-side length
# check; the server may still truncate input to its own maximum sequence
# length -- confirm how the server was started.
bc = BertClient(check_length=False)

In [18]:
# Load the debate metadata table; the preview below shows one row per debate
# with the id, title, speakers and the stringified main-point lists per side.
main_points =pd.read_csv('DebateStar/Meta Data/metadata_appended_main_points.csv') 
# Preview the first rows as the cell output.
main_points.head()

Unnamed: 0,id,title,date,for,against,For_Main_Points,against_Main_Points
0,d20191112,Capitalism Is a Blessing,2019-11-12,"['John Mackey', 'Katherine Mangu-Ward']","['Bhaskar Sunkara', 'Richard D. Wolff']",['By promoting market competition and rewardin...,['Capitalism serves the interests of large cor...
1,d20191029,Parenting Is Overrated,2019-10-29,"['Robert Plomin', 'Nancy Segal']","['Paige Harden', 'Ann Pleshette Murphy']","[""We're in the midst of a DNA revolution: Whil...","['While DNA is important, factors like familia..."
2,d20191022,Europe Has Declared War on American Tech Compa...,2019-10-22,"['Roslyn Layton', 'Berin Szóka']","['Marietje Schaake', 'Ramesh Srinivasan']",['European regulators have declared war on Ame...,['Brussels isn’t waging war on Silicon Valley....
3,d20190917,Replace Private Insurance with Medicare for All,2019-09-17,"['Dr. Adam Gaffney', 'Joseph Sanberg']","['Nick Gillespie', 'Sally Pipes']",['The United States government should follow t...,['Individuals should have the freedom to choos...
4,d20190912,Unresolved: Shifting Power in the Middle East,2019-09-12,"['Michael Doran', 'Reuel Marc Gerecht', 'Berna...","['Brett McGurk', 'Barbara Slavin']",,


In [19]:
# Load the two result tables (presumably live-audience and online votes,
# judging by the file names); each row carries a debate `id` and a `winner`.
result_live = pd.read_csv('results_data/final_live.csv')
result_online = pd.read_csv('results_data/final_online.csv')
# Preview the first rows of the live results as the cell output.
result_live.head()

Unnamed: 0,for the against side to against,for the for side to for,from the against side to for,from the against side to undecided,from the for side to against,from the for side to undecided,from undecided to against,from undecided to for,post-debate-against,post-debate-for,post-debate-undecided,pre-debate-against,pre-debate-for,pre-debate-undecided,undecided to undecided,url,winner,id
0,17,49,2,2,5,2,6,11,28,62,10,21,56,23,6,https://www.intelligencesquaredus.org/debates/...,against,d20191112
1,40,15,8,4,9,3,10,9,59,32,9,52,27,21,2,https://www.intelligencesquaredus.org/debates/...,against,d20191029
2,19,14,3,1,15,1,30,13,64,30,6,23,30,47,4,https://www.intelligencesquaredus.org/debates/...,against,d20191022
3,29,26,5,1,6,4,16,9,51,40,9,35,36,29,4,https://www.intelligencesquaredus.org/debates/...,against,d20190917
4,49,12,1,1,14,0,20,2,83,15,2,51,26,23,1,https://www.intelligencesquaredus.org/debates/...,against,d20190802


In [21]:
# Combine all scripts of each debate side into one string per side
def get_side_script(df):
    """Concatenate the script fragments of each side.

    Args:
        df: DataFrame with a 'side' column ('for' / 'against') and a
            'script' column holding text fragments.

    Returns:
        A tuple (for_script, against_script). Every fragment is followed by
        a single space, so a non-empty result ends with a trailing space and
        a side without fragments yields ''.
    """
    def _joined(side):
        # Keep the original row order; append one space after each fragment.
        fragments = df.loc[df['side'] == side, 'script']
        return ''.join(fragment + ' ' for fragment in fragments)

    return _joined('for'), _joined('against')

# Look up the main points for and against the motion of one debate
def get_main_points(fid, main_points):
    """Return the parsed main-point lists of the debate with id `fid`.

    Args:
        fid: Value to match against the 'id' column.
        main_points: DataFrame with 'id', 'For_Main_Points' and
            'against_Main_Points' columns; the latter two hold Python
            literals stored as strings.

    Returns:
        A tuple (for_points, against_points) of the evaluated literals.

    Raises:
        ValueError: from `.item()` when `fid` does not match exactly one
            row, or from `ast.literal_eval` when a cell is not a valid
            literal.
    """
    debate_row = main_points.loc[main_points['id'] == fid]
    # Cells are stored as text, so turn them back into Python objects.
    for_points = ast.literal_eval(debate_row['For_Main_Points'].item())
    against_points = ast.literal_eval(debate_row['against_Main_Points'].item())
    return for_points, against_points

In [23]:
# Split a text into cleaned sentences
def cleanText(text):
    """Break `text` into lower-cased, punctuation-free sentences.

    Sentences are delimited by '.' and '?'. Every character in
    `string.punctuation` is removed, the remainder is lower-cased and
    stripped of surrounding whitespace, and empty pieces are dropped.

    Args:
        text: The raw text.

    Returns:
        A list of cleaned sentence strings in their original order.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    sentences = []
    # Treat '?' like '.', so a single split yields the same pieces as
    # splitting on '.' first and on '?' afterwards.
    for piece in text.replace('?', '.').split('.'):
        cleaned = piece.translate(drop_punctuation).lower().strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences

In [24]:
# Clean a single sentence
def cleanSentence(text):
    """Return `text` lower-cased with all `string.punctuation` characters removed.

    Whitespace is left untouched (no stripping).
    """
    kept = [ch for ch in text if ch not in string.punctuation]
    return ''.join(kept).lower()

In [51]:
# Take the id of the first debate in main_points and load its script file,
# which holds one row per script fragment with its 'side' and 'script' text.
fid = main_points.iloc[0]['id']
df = pd.read_csv('For Against Scripts/for_against_scripts_' + fid + '.csv')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,side,script
0,for,"Thanks. Thanks, John. It's great to be here. T..."
1,against,Thank you.
2,for,"So, let me start out by saying, is capitalism ..."
3,against,"Well, let's be clear about what the question o..."
4,for,"So, I was going to start by thanking my partne..."


In [52]:
# Build one combined script string per side, and parse that debate's
# main points for and against the motion.
for_script, against_script = get_side_script(df)
for_points, against_points = get_main_points(fid, main_points)

In [25]:
# Split each side's script into cleaned sentences and encode every sentence
# with the BERT client; for_vecs[i] corresponds to cleaned_for_script[i]
# (likewise for the against side).
cleaned_for_script = cleanText(for_script)
for_vecs = bc.encode(cleaned_for_script)
cleaned_against_script = cleanText(against_script)
against_vecs = bc.encode(cleaned_against_script)

In [27]:
# Encode the cleaned main points of each side (one vector per main point).
for_points_vec = bc.encode([cleanSentence(text) for text in for_points])
against_points_vec = bc.encode([cleanSentence(text) for text in against_points])

In [31]:
# Return an array of similarity scores between the vector of interest and all target vectors
def score(query_vec, vecs, normalize_query=False):
    """Score one query embedding against a batch of target embeddings.

    By default this is the dot product of the query with each target,
    divided by the norm of that target only. Such scores rank targets for
    a *fixed* query, but they scale with the query's norm, so scores
    obtained from different queries are not directly comparable. Pass
    `normalize_query=True` to also divide by the query norm, which yields
    the cosine similarity.

    Args:
        query_vec: Array-like of shape (dim,) or (1, dim).
        vecs: Array-like of shape (n, dim) holding the target vectors.
        normalize_query: When True, additionally divide by the norm of
            `query_vec`. Defaults to False, which keeps the original
            behaviour.

    Returns:
        Array of shape (n,) with one score per target vector.
    """
    query = np.asarray(query_vec)
    targets = np.asarray(vecs)
    # The query broadcasts across the rows of `targets`.
    scores = np.sum(query * targets, axis=1) / np.linalg.norm(targets, axis=1)
    if normalize_query:
        # axis=-1 handles a flat (dim,) query and a (1, dim) query alike.
        scores = scores / np.linalg.norm(query, axis=-1)
    return scores

In [28]:
# A demo of finding the sentences closest to 'i do not think it is the case'
query_vec = bc.encode(['i do not think it is the case'])[0]
topk = 5
# Compute the normalized dot product as the score. The result is stored under
# its own name: binding it to `score` would replace the score() function that
# other cells call and break a top-to-bottom run of the notebook.
demo_scores = np.sum(query_vec * for_vecs, axis=1) / np.linalg.norm(for_vecs, axis=1)
# Indices of the `topk` highest scores, best first.
topk_idx = np.argsort(demo_scores)[::-1][:topk]
for idx in topk_idx:
    # for_vecs was encoded from cleaned_for_script, so the indices line up;
    # reusing the list avoids re-cleaning the whole script on every iteration.
    print('> %s\t%s' % (demo_scores[idx], cleaned_for_script[idx]))

> 9.72066	but this is not the subject were examining here this evening
> 9.590052	that certainly didn’t happen with the plo once it returned to gaza i dont expect it will happen with hamas
> 9.475156	um i think that there’s really no condition
> 9.369792	note though that is a very limited concession
> 9.320732	my critics will say this is old news


In [32]:
# Embed each side's whole cleaned script as a single string before first use,
# so this cell does not rely on a cell further down having been run already.
for_total_vec = bc.encode([' '.join(cleaned_for_script)])
against_total_vec = bc.encode([' '.join(cleaned_against_script)])
# Scores of the for side's script against the main points for the motion,
# and the negated scores against the main points against the motion.
print(score(for_total_vec, for_points_vec))
print(-score(for_total_vec, against_points_vec))

[7.8843465 7.7781625 7.5085626 8.14621  ]
[-8.010284  -7.9231772 -8.058084  -7.9358616]


In [33]:
# Same comparison for the against side's script: negated scores against the
# for-motion main points, then scores against the against-motion main points.
# Requires against_total_vec (whole-script embedding) to already exist in the kernel.
print(-score(against_total_vec, for_points_vec))
print(score(against_total_vec, against_points_vec))

[-10.793051 -10.563604 -10.708664 -10.981587]
[10.512351 10.854763 10.285327 11.047818]


In [34]:
# Per main point, the for side's score minus the against side's score:
# first for the for-motion points, then for the against-motion points.
# Requires for_total_vec and against_total_vec to already exist in the kernel.
# NOTE(review): score() divides by the target norm only, so these differences
# also reflect any difference in norm between the two script embeddings.
print(score(for_total_vec, for_points_vec)-score(against_total_vec, for_points_vec))
print(score(for_total_vec, against_points_vec)-score(against_total_vec, against_points_vec))

[-2.9087043 -2.7854419 -3.2001014 -2.8353777]
[-2.5020666 -2.9315858 -2.2272434 -3.1119566]


In [29]:
# Encode each side's whole cleaned script, joined into one string, so that
# bc.encode receives a single-item list per side.
# NOTE(review): the BERT server presumably truncates input to its maximum
# sequence length, so this embedding may only reflect the start of the
# script -- confirm against the server configuration.
for_total_vec = bc.encode([' '.join(cleaned_for_script)])
against_total_vec = bc.encode([' '.join(cleaned_against_script)])

In [36]:
# defend/attack score lists, one entry per processed debate
# for_for: a list of scores of similarities between for side script and main points for the motion
for_for = []
# for_against: a list of scores of similarities between for side script and main points against the motion
for_against = []
# against_for / against_against: the same two lists for the against side script
against_for = []
against_against = []

# Winner labels aligned index-by-index with the score lists above
winners_live = []
winners_online = []
for i in range(len(main_points)):
    row = main_points.iloc[i]
    # Skip debates lacking main points for either side: get_main_points()
    # parses both columns and cannot evaluate a missing (NaN) cell.
    if pd.isnull(row['For_Main_Points']) or pd.isnull(row['against_Main_Points']):
        continue
    fid = row['id']
    try:
        scripts = pd.read_csv('For Against Scripts/for_against_scripts_' + fid + '.csv')
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or an empty/unparsable CSV (pandas
        # raises ValueError subclasses). Report the debate id and move on; unlike
        # a bare `except`, this still lets KeyboardInterrupt stop the loop.
        print(fid)
        continue
    if len(scripts) == 0:
        continue
    # get scripts and main points
    for_script, against_script = get_side_script(scripts)
    for_points, against_points = get_main_points(fid, main_points)
    for_points_vec = bc.encode([cleanSentence(text) for text in for_points])
    against_points_vec = bc.encode([cleanSentence(text) for text in against_points])
    # get cleaned scripts
    cleaned_for_script = cleanText(for_script)
    cleaned_against_script = cleanText(against_script)
    # get the document embedding (whole cleaned script encoded as one string)
    for_total_vec = bc.encode([' '.join(cleaned_for_script)])
    against_total_vec = bc.encode([' '.join(cleaned_against_script)])
    # add scores to the lists
    for_for.append(score(for_total_vec, for_points_vec))
    for_against.append(score(for_total_vec, against_points_vec))
    against_for.append(score(against_total_vec, for_points_vec))
    against_against.append(score(against_total_vec, against_points_vec))
    # add results to the lists; .item() raises if the debate id does not match
    # exactly one row of the results table
    winners_live.append(result_live.loc[result_live['id'] == fid]['winner'].item())
    winners_online.append(result_online.loc[result_online['id'] == fid]['winner'].item())

In [37]:
# Prediction signal and true label per debate, using the online results.
sums = []
winners = []
for idx, outcome in enumerate(winners_online):
    if outcome != 'undecided':
        # predict by the sum of mean differences (for side minus against side)
        for_points_gap = (for_for[idx] - against_for[idx]).mean()
        against_points_gap = (for_against[idx] - against_against[idx]).mean()
        sums.append(for_points_gap + against_points_gap)
        winners.append(outcome)

In [38]:
# Fraction of debates where "sum > 0" agrees with "the winner is 'for'".
(pd.Series(sums).gt(0) == pd.Series(winners).eq('for')).mean()

0.5862068965517241

In [39]:
# Share of each outcome among the online winners (a baseline for the accuracy above).
dist = pd.Series(winners_online).value_counts(normalize = True)
# A bare assignment displays nothing; end on the name so the table is rendered.
dist

against      0.397351
for          0.370861
undecided    0.231788
dtype: float64

In [40]:
# Prediction signal and true label per debate, using the live results.
sums = []
winners = []
for idx, outcome in enumerate(winners_live):
    if outcome != 'undecided':
        # sum of the mean score differences (for side minus against side)
        for_points_gap = (for_for[idx] - against_for[idx]).mean()
        against_points_gap = (for_against[idx] - against_against[idx]).mean()
        sums.append(for_points_gap + against_points_gap)
        winners.append(outcome)

In [41]:
# Fraction of debates where "sum > 0" agrees with "the winner is 'for'".
(pd.Series(sums).gt(0) == pd.Series(winners).eq('for')).mean()

0.5735294117647058

In [42]:
# Share of each outcome among the live winners, displayed as the cell output.
pd.Series(winners_live).value_counts(normalize = True)

against      0.516556
for          0.384106
undecided    0.099338
dtype: float64

In [114]:
# Close the BERT client once all encoding is done.
bc.close()