In [198]:
import pandas as pd
import string
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import re

In [199]:
test_components_data = pd.read_csv('./data/test_components.csv')
train_components_data = pd.read_csv('./data/train_components.csv')
validation_components_data = pd.read_csv('./data/validation_components.csv')

In [200]:
test_speeches_data = pd.read_csv('./data/test_speeches.csv')
train_speeches_data = pd.read_csv('./data/train_speeches.csv')
validation_speeches_data = pd.read_csv('./data/validation_speeches.csv')

In [201]:
# open relation graph
relation_graph = pd.read_csv('./data/relation_graph.csv')

In [202]:
relation_graph.head()

Unnamed: 0,year,date,sectionID,rel_label,C_out,C_in,Speaker1,Speaker2,Comp1,Comp2,ID_x,text_out,ID_y,text_in
0,1960,07Oct,1,Support,T11,T13,NIXON,NIXON,Claim,Claim,T11,For me to have made such a statement would bee...,T13,"As a matter of fact in his book, The Strategy ..."
1,1960,07Oct,1,Support,T13,T12,NIXON,NIXON,Claim,Claim,T13,"As a matter of fact in his book, The Strategy ...",T12,"Now I'm very surprised that Senator Kennedy, w..."
2,1960,07Oct,1,Support,T15,T12,NIXON,NIXON,Premise,Claim,T15,Senator Kennedy also indicated with regard to ...,T12,"Now I'm very surprised that Senator Kennedy, w..."
3,1960,07Oct,1,Support,T14,T2,NIXON,NIXON,Claim,Claim,T14,We think that's pretty good progress,T2,"I look at Cuba today, I believe that we are fo..."
4,1960,07Oct,1,Attack,T3,T2,NIXON,NIXON,Premise,Claim,T3,a course which is difficult,T2,"I look at Cuba today, I believe that we are fo..."


In [203]:
len(relation_graph)

25595

In [204]:
# print le premier element de la relation graph
relation_graph.iloc[0]

year                                                      1960
date                                                     07Oct
sectionID                                                    1
rel_label                                              Support
C_out                                                      T11
C_in                                                       T13
Speaker1                                                 NIXON
Speaker2                                                 NIXON
Comp1                                                    Claim
Comp2                                                    Claim
ID_x                                                       T11
text_out     For me to have made such a statement would bee...
ID_y                                                       T13
text_in      As a matter of fact in his book, The Strategy ...
Name: 0, dtype: object

In [205]:
# print line when C_out is not egual to ID_x
print(relation_graph[relation_graph['C_out'] != relation_graph['ID_x']])
# same for C_in and ID_y
print(relation_graph[relation_graph['C_in'] != relation_graph['ID_y']])
# où est l'intérêt d'avoir des informations redondantes ?
# d'ailleur, serait il intérressant d'avoir l'id du speech ? Car pour l'instant nous n'avons que l'ID du component
# certains speechs contiennent des retours à la ligne tel que \n ou \r, est ce que cela peut être un problème ?

Empty DataFrame
Columns: [year, date, sectionID, rel_label, C_out, C_in, Speaker1, Speaker2, Comp1, Comp2, ID_x, text_out, ID_y, text_in]
Index: []
Empty DataFrame
Columns: [year, date, sectionID, rel_label, C_out, C_in, Speaker1, Speaker2, Comp1, Comp2, ID_x, text_out, ID_y, text_in]
Index: []


In [206]:
# Etant donné que les colonnes C_out et C_in sont redondantes avec ID_x et ID_y, on peut les supprimer (afin de ne pas avoir de confusion, peuvent être remis au besoin)
relation_graph.drop(['ID_x', 'ID_y'], axis=1, inplace=True)
relation_graph.head()

Unnamed: 0,year,date,sectionID,rel_label,C_out,C_in,Speaker1,Speaker2,Comp1,Comp2,text_out,text_in
0,1960,07Oct,1,Support,T11,T13,NIXON,NIXON,Claim,Claim,For me to have made such a statement would bee...,"As a matter of fact in his book, The Strategy ..."
1,1960,07Oct,1,Support,T13,T12,NIXON,NIXON,Claim,Claim,"As a matter of fact in his book, The Strategy ...","Now I'm very surprised that Senator Kennedy, w..."
2,1960,07Oct,1,Support,T15,T12,NIXON,NIXON,Premise,Claim,Senator Kennedy also indicated with regard to ...,"Now I'm very surprised that Senator Kennedy, w..."
3,1960,07Oct,1,Support,T14,T2,NIXON,NIXON,Claim,Claim,We think that's pretty good progress,"I look at Cuba today, I believe that we are fo..."
4,1960,07Oct,1,Attack,T3,T2,NIXON,NIXON,Premise,Claim,a course which is difficult,"I look at Cuba today, I believe that we are fo..."


In [207]:
# rassembler tous les speeches dans un seul dataframe
speeches_dataframe = pd.concat([train_speeches_data, validation_speeches_data, test_speeches_data])
print(len(speeches_dataframe))
# enelever les doublons sur le Speaker, la SectionID et le SpeechID
speeches_dataframe.drop_duplicates(subset=['Speaker', 'SectionID', 'SpeechID'], inplace=True)
print(len(speeches_dataframe))

7103
6198


In [208]:
# prendre la ligne 1 pour exemple
line1 = relation_graph.iloc[1]
# retrouver le speech contenant le text_out
text_out_speech = speeches_dataframe[speeches_dataframe['Speech'].str.contains(line1['text_out'])]
print(text_out_speech)
#retouver le speech contenant le text_in
text_in_speech = speeches_dataframe[speeches_dataframe['Speech'].str.contains(line1['text_in'])]
print(text_in_speech)

    Year   Date Speaker SectionID  SpeechID  \
42  1960  07Oct   NIXON         1         2   

                                               Speech  Start   End  
42   Well first of all, I don't agree with Senator...   2122  4424  
    Year   Date Speaker SectionID  SpeechID  \
42  1960  07Oct   NIXON         1         2   

                                               Speech  Start   End  
42   Well first of all, I don't agree with Senator...   2122  4424  


In [209]:
validation_speeches_data.head()

Unnamed: 0,Year,Date,Speaker,SectionID,SpeechID,Speech,Start,End
0,1960,13Oct,SHADEL,6,0,Mr. Cater's question is for Vice President Ni...,0,59
1,1960,13Oct,CATER,6,1,"Mr. Vice President, I'd like to return just o...",59,910
2,1960,13Oct,NIXON,6,2,"Well, Mr. Cater, of course it's a criticism t...",910,3441
3,1960,13Oct,SHADEL,6,3,Senator Kennedy? \r\n,3441,3467
4,1960,13Oct,KENNEDY,6,4,"Well number one on Indochina, Mr. Nixon talke...",3467,5388


In [210]:
# faire une colonne context3 dans le dataframe relation_graph
relation_graph['context3'] = ''

In [211]:
# pour chaque ligne de la relation graph
not_found = []
for index, row in relation_graph.iterrows():
    # surrond with try except
    try:
        # retrouver le speech contenant le text_out
        text_out_speech = speeches_dataframe[speeches_dataframe['Speech'].str.contains(row['text_out'], regex=False)]
        # retouver le speech contenant le text_in
        text_in_speech = speeches_dataframe[speeches_dataframe['Speech'].str.contains(row['text_in'], regex=False)]
        # verifier que les deux speeches ont bien été trouvé
        if len(text_out_speech) > 0 and len(text_in_speech) > 0:
            # TODO: retrouver le contexte 3
            pass
        else:
            
            # insert l'index de la ligne dans la liste not_found
            not_found.append(index)
    except Exception as e:
        print(index)
        not_found.append(index)
        continue

In [212]:
# id 1766 et 1767 ??
"The farmers were the victims of the double-digit inflation and the 21\1/2\-percent interest rates of the Carter-Mondale administration and the grain embargo"
# why back slash ? 

'The farmers were the victims of the double-digit inflation and the 21\x01/2\\-percent interest rates of the Carter-Mondale administration and the grain embargo'

In [213]:
print(len(not_found))
# print le premier element de la liste not_found
print(not_found[0])
# 5649

3790
56


In [214]:
# reprenons l'exmple de l'ID 1766
line1766 = relation_graph.iloc[1766]
print(line1766)

year                                                      1984
date                                                     07Oct
sectionID                                                    4
rel_label                                              Support
C_out                                                     T120
C_in                                                      T118
Speaker1                                                REAGAN
Speaker2                                                REAGAN
Comp1                                                  Premise
Comp2                                                    Claim
text_out     The farmers were the victims of the double-dig...
text_in      The farmers are not the victims of anything th...
context3                                                      
Name: 1766, dtype: object


In [215]:
# retrouver le speech contenant le text "The farmers were the victims of the double-digit inflation and the"
text_out_speech = speeches_dataframe[speeches_dataframe['Speech'].str.match("The farmers were the victims of the double-digit inflation and the")]
print(text_out_speech)

# recupérer tous les speeches du speacker "REAGAN" dont la date est le 07Oct et dont le speech contient "farmers"
text_in_speech = speeches_dataframe[(speeches_dataframe['Speaker'] == 'REAGAN') & (speeches_dataframe['Date'] == '07Oct') & (speeches_dataframe['Speech'].str.contains("farmers"))]
print(text_in_speech)

Empty DataFrame
Columns: [Year, Date, Speaker, SectionID, SpeechID, Speech, Start, End]
Index: []
     Year   Date Speaker SectionID  SpeechID  \
415  1984  07Oct  REAGAN         4        12   

                                                Speech  Start    End  
415   Before I campaigned as a Democrat for a Repub...   9306  10333  


In [216]:
# Comme dans les components pour nos context 1 et 2, nous avons certains cas où une partie du text est manquante, nous empechant donc de retrouver l'élément avec un simple str.contians
# ceci dit nous pouvons faire une methode semblable à celle que nous avons utilisé pour les components
# CAD: prendre n premiers mots verifier qu'ils sont présent dans notre speech puis prendre les n+1 mots qui suivent et verifier qu'ils font aussi partie du speech

In [217]:
# technique que nous allons implémenter pour le context 3
# pour chaque élément de la liste not_found
ctx3_not_found = []
for index in not_found:
    # recuperer la ligne correspondante
    line = relation_graph.iloc[index]

    text_out_speech = speeches_dataframe[speeches_dataframe['Speech'].str.contains(row['text_out'], regex=False)]
    text_in_speech = speeches_dataframe[speeches_dataframe['Speech'].str.contains(row['text_in'], regex=False)]
    
    # vefrifier à l'aide de la taille de la liste s'il est nécessaire de faire le traitement
    if len(text_out_speech) == 0 :
        # si le text_out n'est pas trouvé alors nous devons faire un traitement plus poussé
        text_out = line['text_out'].split()
        n = 0
        founded = []
        for n in range(1, len(text_out)):
            # concat les n premiers mots du component
            start_component = " ".join(text_out[0:n])
            end_component = " ".join(text_out[n:])
            # trouver les speeches qui contiennent le start_component
            speeches_intermed = speeches_dataframe[speeches_dataframe['Speech'].str.find(start_component) != -1]
            speeches_intermed = speeches_intermed[speeches_intermed['Speech'].str.find(end_component) != -1]
            if(len(speeches_intermed) > 0):
                # append each speech that contains the component
                founded.append(speeches_intermed)
                
        if len(founded > 1):
            # on a un problème car on a trouvéla relation dans plusieurs speeches
            print("multiple speeches found ", index)
        elif len(founded == 1):
            # on a trouvé le speech qui nous interesse
            pass
        else:
            # on ne l'a toujours pas trouvé, dommage
            ctx3_not_found.append(index)

print(len(ctx3_not_found))


0
