In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import zipfile
from zipfile import *
from random import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## 0. Load Data

In [2]:
# Unpack the zipped QQP training set next to the archive so it can be read as a plain CSV.
with zipfile.ZipFile("data/QQP/train.csv.zip") as archive:
    archive.extractall(path="data/QQP/")

# Load both corpora. low_memory=False turns off pandas' chunked parsing,
# which avoids mixed-type column inference on large files.
train = pd.read_csv('data/QQP/train.csv', low_memory=False)
who = pd.read_csv('data/WHO/WHO_QA_data.csv', low_memory=False)

In [3]:
# Report the row count of each raw dataset (shape[0] is the number of rows).
print("Size of original QQP Training Data: {}".format(train.shape[0]))
print("Size of original WHO Data: {}".format(who.shape[0]))

Size of original QQP Training Data: 404290
Size of original WHO Data: 1279


## 1. Prepare WHO Data

### 1.1 Filter WHO Data by Covid Topic

In [4]:
# Keywords that mark a WHO Q&A pair as COVID-19 related. They are matched
# case-insensitively as substrings of the combined question + answer text.
covid_words = ['coronavirus', 'covid']

# A pair is only usable when both the question and the answer are present.
who_pairs = who.dropna(subset=['question', 'answer'])

# Join with a space so a keyword cannot be formed across the question/answer boundary.
qa_text = (who_pairs['question'] + " " + who_pairs['answer']).str.lower()

# Keep every pair at most once, even when it mentions several keywords:
# appending once per matching keyword would store such pairs twice.
is_covid = qa_text.map(lambda text: any(w in text for w in covid_words)).astype(bool)
covidQA_df = who_pairs.loc[is_covid, ['question', 'answer']].reset_index(drop=True)

print("COVID-19 Question and Answer Pairs from WHO")
covidQA_df.to_csv('./data/WHO/covid19_QA_data.csv', encoding='utf-8')
covidQA_df

COVID-19 Question and Answer Pairs from WHO


Unnamed: 0,question,answer
0,How are COVID-19 and influenza viruses similar?,"Firstly, COVID-19 and influenza viruses have a..."
1,How are COVID-19 and influenza viruses different?,The speed of transmission is an important poin...
2,What medical interventions are available for C...,While there are a number of therapeutics curre...
3,How do we know that a vaccine is safe?,The most commonly used vaccines we have today ...
4,Are pregnant women at higher risk from COVID-19?,Research is currently underway to understand t...
...,...,...
110,I am a policy maker. What can I do to prevent ...,When making preparedness and response plans fo...
111,Has violence against women increased since the...,"Violence against women is highly prevalent, an..."
112,How does COVID-19 increase risks of violence f...,"Stress, the disruption of social and protectiv..."
113,Who is most vulnerable?,"Women who are displaced, who are migrants or r..."


### 1.2 Get Top Words from WHO

In [5]:
import nltk
from nltk import FreqDist

# Concatenate every question and answer into one corpus string. All pieces are
# separated by a space: without a separator between consecutive pairs, the last
# word of one answer would fuse with the first word of the next question.
text = " ".join(
    piece
    for question, answer in zip(covidQA_df['question'], covidQA_df['answer'])
    for piece in (question, answer)
)


def clean_text(text):
    """Split raw text into whitespace-delimited tokens after light normalisation.

    Non-breaking spaces, newlines, tabs and the punctuation marks
    ``, . ? ( ) –`` are turned into spaces, and an escaped possessive
    (a literal backslash followed by ``'s``) is unescaped to ``'s``.
    Letter case and every other character are left untouched.

    Returns:
        List of string tokens in their original order.
    """
    # Each of these single characters is replaced by a plain space.
    to_space = str.maketrans({ch: " " for ch in "\xa0\n\t,.?()–"})
    normalised = text.translate(to_space)
    # Unescape possessives that arrive as backslash-apostrophe-s.
    normalised = normalised.replace('\\\'s','\'s')
    return normalised.split()

# Tokenise the corpus and count every lower-cased token that is not an English stopword.
tokens = clean_text(text)
stopwords = set(nltk.corpus.stopwords.words('english')) | {"the"}

fdist = FreqDist(
    lowered
    for lowered in map(str.lower, tokens)
    if lowered not in stopwords
)

# One row per distinct word, most frequent first; display the top 200.
freq_df = (
    pd.DataFrame(fdist.items())
    .set_axis(['word', 'freq'], axis=1)
    .sort_values(by=['freq'], ascending=False)
)
freq_df.head(200)

Unnamed: 0,word,freq
0,covid-19,326
24,health,123
73,people,76
139,risk,74
141,women,60
...,...,...
310,recommendations,11
428,caused,11
334,high,11
743,ensuring,11


## 2. Prepare QQP Training Data

In [None]:
def filter_QQP_train(df, kwlist=None):
    """Keep the QQP question pairs that mention at least one keyword.

    Args:
        df: DataFrame with ``question1``, ``question2`` and ``is_duplicate``
            columns.
        kwlist: Optional list of lower-case keywords. A keyword matches when it
            is a substring of the lower-cased ``question1 + " " + question2``.
            Defaults to the list chosen from the top words of the WHO COVID
            context.

    Returns:
        DataFrame with columns ``question1``, ``question2``, ``is_duplicate``
        and ``kw`` (the list of matched keywords, in ``kwlist`` order), indexed
        0..n-1. Pairs in which either question is not a string (e.g. NaN) are
        skipped.

    Raises:
        KeyError: If ``df`` lacks one of the required columns.
    """
    if kwlist is None:
        # set kwlist by top words in WHO Covid Context
        # ('flu ' carries a trailing space, so it does not match inside words
        # such as "influence"; the space is part of the keyword)
        kwlist = ['flu ','influenza','health','virus','disease','hiv','treatment','infect','malaria','patient','protect', 'prevent', 'symptom', 'pandemic', 'medicine', 'illness', 'mask', 'vaccine']

    columns = ['question1', 'question2', 'is_duplicate', 'kw']

    # Collect the matches first and build the frame once; appending with
    # .loc[i] copies the frame on every insert and is quadratic on large inputs.
    records = []
    for q1, q2, label in zip(df['question1'], df['question2'], df['is_duplicate']):
        # Skip incomplete pairs explicitly instead of hiding every error
        # behind a bare except.
        if not (isinstance(q1, str) and isinstance(q2, str)):
            continue
        pair_text = (q1 + " " + q2).lower()
        kw = [w for w in kwlist if w in pair_text]
        if kw:
            records.append({'question1': q1, 'question2': q2, 'is_duplicate': label, 'kw': kw})

    return pd.DataFrame(records, columns=columns)

# Keep only the QQP pairs that mention at least one of the filter keywords.
filtered_train = filter_QQP_train(df=train)
filtered_train

In [None]:
def plot_kw_freq(filtered, usage, label):
    """Count, plot and return keyword frequencies for one ``is_duplicate`` label.

    Args:
        filtered: DataFrame with an ``is_duplicate`` column and a ``kw`` column
            whose cells are iterables of keyword strings.
        usage: Name of the data split; used only in the printed size message.
        label: ``is_duplicate`` value selecting the rows to analyse.

    Returns:
        DataFrame with columns ``kw`` and ``count``, sorted by ``count`` in
        descending order. When no row carries ``label`` the result is empty
        (it still has both columns) and no chart is drawn.
    """
    filtered_label = filtered.loc[filtered['is_duplicate'] == label]
    print("Size of filtered QQP {} Data with label {}: {}".format(usage, label, len(filtered_label)))

    # Tally how many selected rows mention each keyword.
    counts = {}
    for kws in filtered_label['kw']:
        for kw in kws:
            counts[kw] = counts.get(kw, 0) + 1

    # Naming the columns at construction keeps the schema even when there are
    # no keywords; assigning .columns afterwards fails on a zero-column frame.
    kw_freq = pd.DataFrame(list(counts.items()), columns=['kw', 'count'])
    if kw_freq.empty:
        # Nothing to plot.
        return kw_freq

    # barh draws the first row at the bottom, so sort ascending to place the
    # most frequent keyword at the top of the chart.
    kw_freq = kw_freq.sort_values('count', ascending=True)
    kw_freq.plot.barh(x='kw', y='count', figsize=(10,6), fontsize=12)
    return kw_freq.sort_values('count', ascending=False)

In [None]:
# Keyword frequencies among the filtered training pairs with is_duplicate == 0.
print(plot_kw_freq(filtered=filtered_train, usage='Training', label=0))

In [None]:
# Keyword frequencies among the filtered training pairs with is_duplicate == 1.
print(plot_kw_freq(filtered=filtered_train, usage='Training', label=1))

In [None]:
# reduce the size of filtered_train0, make it similar to filtered_train1
filtered_train0 = filtered_train.loc[filtered_train['is_duplicate'] == 0]

# Pairs mentioning any of these keywords are decreased by 70% (kept with probability 0.3).
decrease1 = ['health','patient','prevent','protect','medicine','treatment','hiv']
# Pairs mentioning any of these (and none of the above) are decreased by 50%.
decrease2 = ['virus','symptom','disease','infect']


def _keep_probability(pair_text):
    """Return the probability of keeping a label-0 pair, given its lower-cased text."""
    if any(w in pair_text for w in decrease1):
        return 0.3
    if any(w in pair_text for w in decrease2):
        return 0.5
    return 1.0


pair_texts = (filtered_train0['question1'] + " " + filtered_train0['question2']).str.lower()
keep_prob = pair_texts.map(_keep_probability).astype(float)

# Seeded generator, so the down-sampled set is the same on every
# Restart & Run All; an unseeded random() gives a different sample each run.
rng = np.random.default_rng(40)
draws = rng.random(len(filtered_train0))

# draws lie in [0, 1), so a keep probability of 1.0 always keeps the pair.
new_filtered_train0 = (
    filtered_train0
    .loc[draws < keep_prob.to_numpy(), ['question1', 'question2', 'is_duplicate', 'kw']]
    .reset_index(drop=True)
)
new_filtered_train0

In [None]:
# Recount how many of the down-sampled label-0 pairs mention each keyword.
d = {}
for row_keywords in new_filtered_train0['kw']:
    for keyword in row_keywords:
        d[keyword] = d.get(keyword, 0) + 1

kw_freq = pd.DataFrame(d.items())
kw_freq.columns = ['kw', 'count']
# Ascending order puts the most frequent keyword at the top of the horizontal bar chart.
kw_freq = kw_freq.sort_values('count', ascending=True)
kw_freq.plot.barh(x='kw', y='count', figsize=(10,6), fontsize=12)
# Show the table with the most frequent keyword first.
kw_freq.sort_values('count', ascending=False)

In [None]:
# Pairs with is_duplicate == 1 are kept in full.
filtered_train1 = filtered_train[filtered_train['is_duplicate'] == 1]
# Stack the down-sampled label-0 pairs on top of the label-1 pairs, renumber
# the rows from 0, and persist the combined set.
training_data = pd.concat([new_filtered_train0, filtered_train1], ignore_index=True)
training_data.to_csv('./data/QQP/filteredQQP.csv', encoding='utf-8')
print("Size of filtered QQP Training Data with label 0: {}".format(new_filtered_train0.shape[0]))
print("Size of filtered QQP Training Data with label 1: {}".format(filtered_train1.shape[0]))
training_data

## 3. Split QQP Data to Training and Testing Data

In [None]:
filtered_QQP0 = new_filtered_train0
filtered_QQP1 = filtered_train1

# split portion: 80% training data, 20% testing data
# Each label is split separately, so both labels contribute the same 80/20 share.
# The fixed random_state makes the split reproducible.
filtered_QQP_train0, filtered_QQP_test0 = train_test_split(filtered_QQP0, test_size=0.2, random_state=40)
filtered_QQP_train1, filtered_QQP_test1 = train_test_split(filtered_QQP1, test_size=0.2, random_state=40)

# combine label 0 and label 1
training_data = pd.concat([filtered_QQP_train0, filtered_QQP_train1]).reset_index(drop=True)
testing_data = pd.concat([filtered_QQP_test0, filtered_QQP_test1]).reset_index(drop=True)

# save to csv file
# Each split goes to its own file: the test file must hold testing_data,
# not a second copy of the training split.
training_data.to_csv('./data/training_data.csv', encoding='utf-8')
testing_data.to_csv('./data/testing_data.csv', encoding='utf-8')

In [None]:
# Rows are the splits (the x-axis groups); columns are the labels (one bar
# series each). Each row therefore holds the label-0 and label-1 counts of a
# single split, so the axis labels match the numbers they describe.
name_list = ["Training Data", "Testing Data"]
label_list = ["is_duplicate = 0", "is_duplicate = 1"]
pair_list = np.array([
    [len(filtered_QQP_train0), len(filtered_QQP_train1)],
    [len(filtered_QQP_test0), len(filtered_QQP_test1)],
])
img_df = pd.DataFrame(pair_list, index=name_list, columns=label_list)
img_df.plot(kind="bar", rot=0, figsize=(10, 5), title="Filtered QQP Train-Test Data Size")
plt.savefig("data/FilteredQQP_TrainTestDataSize.jpg")
plt.show()