In [1]:
import pandas as pd
import numpy as np
from preprocessing import *

In [2]:
# Read the three raw CSV inputs in one pass: training question pairs, the
# training labels, and the test question pairs (paths relative to the notebook).
x_train, y_train, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_data.csv',
        './data/train_labels.csv',
        './data/test_data.csv',
    )
)

# 1. Preprocess train and test data and save to file
## Assign an id to each unique question

In [3]:
# Pass both frames through `unique_question_map` (presumably provided by the
# `from preprocessing import *` star import) and rebind `x_train` / `x_test`
# to whatever it returns.
# NOTE(review): presumably this is where the `qid1` / `qid2` columns selected in
# the following cells are filled in - confirm against the helper's source.
# NOTE(review): the inputs are overwritten, so re-running this cell feeds the
# already-processed frames back into the helper; reload the CSVs first if unsure.
x_train, x_test = unique_question_map(x_train, x_test)

## Combine train data and labels into one dataframe, then clean the train and test questions

In [4]:
# Drop the `is_duplicate` column that comes with the train data and join the
# labels frame on `id`, so the `is_duplicate` selected below is the one
# contributed by `y_train`.
# `columns=` is used instead of the positional axis (`drop(label, 1)`): the
# positional form was deprecated in pandas 1.3 and raises TypeError on pandas >= 2.0.
df_train = x_train.drop(columns='is_duplicate')
df_train = df_train.merge(y_train, on="id")

# Keep only the columns used downstream and discard rows with any missing value.
df_train = df_train[['id','question1', 'qid1', 'question2', 'qid2','is_duplicate']].dropna()

# Run the project's `preprocess` helper over every question string.
# NOTE(review): judging by the displayed rows it lower-cases text, strips
# punctuation and spells out digits - confirm against `preprocessing.preprocess`.
df_train['question1'] = df_train['question1'].apply(preprocess)
df_train['question2'] = df_train['question2'].apply(preprocess)

df_train.head()

Unnamed: 0,id,question1,qid1,question2,qid2,is_duplicate
0,0,what is the step by step guide to invest in share market in india,0,what is the step by step guide to invest in share market,241804,0
1,1,what is the story of kohinoor kohinoor diamond,1,what would happen if the indian government stole the kohinoor kohinoor diamond back,13483,0
2,2,how can i increase the speed of my internet connection while using a vpn,2,how can internet speed be increased by hacking through dns,241805,0
3,3,why am i mentally very lonely how can i solve it,3,find the remainder when math two three two four math is divided by two four two three,241806,0
4,4,which one dissolve in water quikly sugar salt methane and carbon di oxide,4,which fish would survive in salt water,241807,0


In [5]:
# Test-side clean-up: keep the id/question columns, drop rows with any missing
# value, then run `preprocess` over both question columns. `assign` overwrites
# the existing columns in place, so the column order is unchanged.
df_test = (
    x_test[['test_id','question1', 'qid1', 'question2', 'qid2']]
    .dropna()
    .assign(
        question1=lambda frame: frame['question1'].apply(preprocess),
        question2=lambda frame: frame['question2'].apply(preprocess),
    )
)

df_test.head()

Unnamed: 0,test_id,question1,qid1,question2,qid2
0,15,what would a trump presidency mean for current international master s students on an f one visa,108613,how will a trump presidency affect the students presently in us or planning to study in us,39392
1,20,why do rockets look white,449444,why are rockets and boosters painted white,314405
2,21,whats causing someone to be jealous,449445,what can i do to avoid being jealous of someone,492291
3,23,how much is three zero kv in hp,449446,where can i find a conversion chart for cc to horsepower,492292
4,34,what is the best travel website in spain,449447,what is the best travel website,275556


### Tokenize

In [6]:
# Tokenize both cleaned question columns of the training frame and keep only the
# ids, the token lists and the label, in that column order.
token_train = (
    df_train
    .assign(
        tokenq1=lambda frame: frame['question1'].apply(tokenize),
        tokenq2=lambda frame: frame['question2'].apply(tokenize),
    )
    .loc[:, ['id','qid1','tokenq1','qid2','tokenq2','is_duplicate']]
)
token_train.head()

Unnamed: 0,id,qid1,tokenq1,qid2,tokenq2,is_duplicate
0,0,0,"[what, is, the, step, by, step, guide, to, invest, in, share, market, in, india]",241804,"[what, is, the, step, by, step, guide, to, invest, in, share, market]",0
1,1,1,"[what, is, the, story, of, kohinoor, kohinoor, diamond]",13483,"[what, would, happen, if, the, indian, government, stole, the, kohinoor, kohinoor, diamond, back]",0
2,2,2,"[how, can, i, increase, the, speed, of, my, internet, connection, while, using, a, vpn]",241805,"[how, can, internet, speed, be, increased, by, hacking, through, dns]",0
3,3,3,"[why, am, i, mentally, very, lonely, how, can, i, solve, it]",241806,"[find, the, remainder, when, math, two, three, two, four, math, is, divided, by, two, four, two, three]",0
4,4,4,"[which, one, dissolve, in, water, quikly, sugar, salt, methane, and, carbon, di, oxide]",241807,"[which, fish, would, survive, in, salt, water]",0


In [7]:
# Tokenize both cleaned question columns of the test frame and keep only the ids
# and the token lists, in that column order.
token_test = (
    df_test
    .assign(
        tokenq1=lambda frame: frame['question1'].apply(tokenize),
        tokenq2=lambda frame: frame['question2'].apply(tokenize),
    )
    .loc[:, ['test_id','qid1','tokenq1','qid2','tokenq2']]
)
token_test.head()

Unnamed: 0,test_id,qid1,tokenq1,qid2,tokenq2
0,15,108613,"[what, would, a, trump, presidency, mean, for, current, international, master, s, students, on, an, f, one, visa]",39392,"[how, will, a, trump, presidency, affect, the, students, presently, in, us, or, planning, to, study, in, us]"
1,20,449444,"[why, do, rockets, look, white]",314405,"[why, are, rockets, and, boosters, painted, white]"
2,21,449445,"[whats, causing, someone, to, be, jealous]",492291,"[what, can, i, do, to, avoid, being, jealous, of, someone]"
3,23,449446,"[how, much, is, three, zero, kv, in, hp]",492292,"[where, can, i, find, a, conversion, chart, for, cc, to, horsepower]"
4,34,449447,"[what, is, the, best, travel, website, in, spain]",275556,"[what, is, the, best, travel, website]"


## Some exploration

In [8]:
# Summary statistics of the cleaned training pairs.
print('Total number of question pairs for training: {}'.format(len(df_train)))

# Share of pairs labelled duplicate, as a percentage with two decimals.
duplicate_pct = round(df_train['is_duplicate'].mean()*100, 2)
# Round the complement too: a bare `100.00 - duplicate_pct` can print
# floating-point noise (e.g. 100.0 - 70.1 -> 29.900000000000006).
non_duplicate_pct = round(100.00 - duplicate_pct, 2)
print('Duplicate pairs: {}%'.format(duplicate_pct))
print('Non-duplicate pairs: {}%'.format(non_duplicate_pct))

# Number of distinct question texts across both question columns.
print('Number of unique questions: {}'.format(len(pd.concat([df_train['question1'], df_train['question2']]).unique())))

Total number of question pairs for training: 323162
Duplicate pairs: 36.88%
Non-duplicate pairs: 63.12%
Number of unique questions: 448325


# 2. Extract unique questions from train data

In [9]:
# Stack both question columns of the cleaned training frame, replace any missing
# entry with an empty string, and keep each distinct question text once.
# The result is wrapped in a single-column DataFrame.
unique_questions = pd.DataFrame(
    pd.concat([df_train['question1'], df_train['question2']])
    .fillna("")
    .unique()
)

In [10]:
# Report how many rows (distinct questions) the extracted frame holds.
print('Number of unique questions: {}'.format(unique_questions.shape[0]))

Number of unique questions: 448325


# 3. Save to file

In [11]:
# Persist the cleaned train and test frames as pickles under ./data.
pd.to_pickle(df_train, "./data/df_train.pkl")
pd.to_pickle(df_test, "./data/df_test.pkl")

In [12]:
# Persist the frame of distinct training questions as a pickle under ./data.
pd.to_pickle(unique_questions, "./data/unique_questions.pkl")

In [14]:
# Persist the tokenized train and test frames as pickles under ./data.
pd.to_pickle(token_train, "./data/token_train.pkl")
pd.to_pickle(token_test, "./data/token_test.pkl")