In [33]:
import spacy
import pandas as pd
import multiprocessing
import numpy as np


In [34]:
df_hansard = pd.read_csv('data/hansard_all.csv')

# Group the speeches by the item of business they belong to.
df_group = df_hansard.groupby('subjectOfBusinessId')

# Upper bound on the number of (question, answer) pairs kept.
MAX_PAIRS = 5000

q_a = []
for _, speeches in df_group:
    # Don't bother with groups holding an odd number of rows: they cannot be
    # split cleanly into pairs.
    if len(speeches) % 2 != 0:
        continue

    # Create conversation pairs: each row at an even position within the group
    # is paired with the row that immediately follows it.
    contents = speeches['content'].values
    q_a.extend(zip(contents[::2], contents[1::2]))

q_a = q_a[:MAX_PAIRS]

print('number of q & a', len(q_a))

# Passing the column names up front keeps this working even when no pairs
# were collected.
df_q_a = pd.DataFrame(q_a, columns=['Q', 'A'])
# index=False: the positional index carries no information, and writing it
# makes it come back as an 'Unnamed: 0' column when the file is re-read.
df_q_a.to_csv('data/q_a_all.csv', index=False)
df_q_a.tail()

number of q & a 5000


Unnamed: 0,Q,A
4995,"Mr. Speaker, in all fairness, I have to keep B...","Order, please. I do not think referring to eac..."
4996,"Mr. Speaker, yesterday you thought it was okay...","Mr. Speaker, all I ask is for that hon. member..."
4997,"Mr. Speaker, in a recent survey conducted by T...","Mr. Speaker, I thank the member for all the gr..."
4998,"Mr. Speaker, Conservatives cannot seem to agre...","Mr. Speaker, our engagement with regard to off..."
4999,"Mr. Speaker, we had to introduce Bill C-419 to...","Mr. Speaker, as I just mentioned, our policie..."


In [35]:
# Load English tokenizer, tagger, parser, NER and word vectors.
# NOTE(review): the 'en' shortcut name presumably relies on a spaCy 2.x
# shortcut link; newer spaCy releases expect a full package name such as
# 'en_core_web_sm' -- confirm against the installed version.
nlp = spacy.load('en')

def sentence_tokenizer(raw_text):
    """Split ``raw_text`` into sentences and wrap each one in BOS/EOS markers.

    The text is run through the module-level ``nlp`` pipeline; every detected
    sentence is stripped of surrounding whitespace, prefixed with ``'BOS '``
    and suffixed with ``' EOS'``, and the marked sentences are joined with a
    single space.

    Args:
        raw_text: the text to segment (passed straight to ``nlp``).

    Returns:
        One string containing all marked sentences.
    """
    # ``Span.text`` is used instead of the deprecated ``Span.string``; the two
    # differ only in trailing whitespace, which ``strip()`` removes anyway.
    return u' '.join(
        'BOS ' + sent.text.strip() + ' EOS' for sent in nlp(raw_text).sents
    )

In [36]:
%%time

def _apply_df(args):
    """Apply a function element-wise to the 'Q' and 'A' columns of a frame.

    Takes a single tuple so it can be used directly with ``Pool.map``.

    Args:
        args: a ``(df, func, kwargs)`` tuple where ``df`` is a DataFrame with
            'Q' and 'A' columns, ``func`` is called on each value of those
            columns, and ``kwargs`` is a dict of extra keyword arguments
            forwarded to ``func``.

    Returns:
        A new DataFrame with the transformed 'Q' and 'A' columns; any other
        columns are carried over unchanged.
    """
    df, func, kwargs = args
    # Work on a copy so the caller's frame (possibly a slice of a larger one)
    # is never modified in place.
    df = df.copy()
    for column in ('Q', 'A'):
        df[column] = df[column].apply(func, **kwargs)
    return df

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to the 'Q' and 'A' columns of ``df`` using a process pool.

    The frame is split row-wise into ``workers`` nearly equal chunks, each
    chunk is handed to ``_apply_df`` in a worker process, and the processed
    chunks are concatenated back together in their original order.

    Args:
        df: DataFrame containing 'Q' and 'A' columns.
        func: callable applied to every value of those columns; it has to be
            picklable because it is sent to the worker processes.
        **kwargs: ``workers`` (optional) sets the number of processes and
            chunks, defaulting to ``multiprocessing.cpu_count()``; every other
            keyword argument is forwarded to ``func``.

    Returns:
        A DataFrame holding the concatenated results of all chunks.
    """
    workers = kwargs.pop('workers', None)
    if workers is None:
        workers = multiprocessing.cpu_count()

    # Split row positions rather than the frame itself: this yields the same
    # partition as np.array_split(df, workers) without relying on numpy
    # splitting a DataFrame directly.
    row_chunks = np.array_split(np.arange(len(df)), workers)
    tasks = [(df.iloc[rows], func, kwargs) for rows in row_chunks]

    # The context manager releases the pool even if a worker raises.
    with multiprocessing.Pool(processes=workers) as pool:
        result = pool.map(_apply_df, tasks)
    return pd.concat(result)
    
num_cores = multiprocessing.cpu_count()
print(num_cores)

# NOTE: the tokenized frame is written back to the file it was read from, so
# running this cell a second time tokenizes already-tokenized text.
df_q_a = pd.read_csv('data/q_a_all.csv')
# Discard index columns saved by an earlier to_csv call; pandas reads them
# back as 'Unnamed: 0', 'Unnamed: 0.1', ... and they would otherwise pile up
# on every read/write round trip.
df_q_a = df_q_a.loc[:, ~df_q_a.columns.str.startswith('Unnamed')]

df_q_a = apply_by_multiprocessing(df_q_a, sentence_tokenizer, workers=num_cores)

# index=False keeps the file limited to the real data columns.
df_q_a.to_csv('data/q_a_all.csv', index=False)
df_q_a.tail()


8
CPU times: user 824 ms, sys: 268 ms, total: 1.09 s
Wall time: 5min 59s


In [38]:

# Reload the tokenized pairs from disk and spot-check one question to confirm
# the BOS/EOS sentence markers are present.
df_q_a = pd.read_csv('data/q_a_all.csv')
# .loc addresses the row label and the column in a single lookup instead of
# chained indexing.
print(df_q_a.loc[10, 'Q'])
df_q_a.tail()



BOS Mr. Speaker, France and Italy have recognized the Libyan National Council as that country's legitimate government. EOS BOS Can the Minister of Foreign Affairs clarify Canada's position on this? EOS


Unnamed: 0,Q,A
4995,"BOS Mr. Speaker, in all fairness, I have to ke...","BOS Order, please. EOS BOS I do not think refe..."
4996,"BOS Mr. Speaker, yesterday you thought it was ...","BOS Mr. Speaker, all I ask is for that hon. EO..."
4997,"BOS Mr. Speaker, in a recent survey conducted ...","BOS Mr. Speaker, I thank the member for all th..."
4998,"BOS Mr. Speaker, Conservatives cannot seem to ...","BOS Mr. Speaker, our engagement with regard to..."
4999,"BOS Mr. Speaker, we had to introduce Bill C-41...","BOS Mr. Speaker, as I just mentioned, our poli..."


In [32]:
# Join every question with its answer, separated by a single space.
# Vectorised string concatenation replaces a row-wise apply.
df_q_a['Q_A'] = df_q_a['Q'] + u' ' + df_q_a['A']

print(len(df_q_a['Q_A']))
# Write one pair per line. The encoding is explicit because the text contains
# non-ASCII characters and the platform default encoding may not handle them.
with open('data/Q_A_pairs.txt', 'w', encoding='utf-8') as pairs_file:
    pairs_file.writelines(row.strip() + '\n' for row in df_q_a['Q_A'])


27372
