In [2]:
import pandas as pd
import numpy as np
from preprocessing import *

In [3]:
x_train = pd.read_csv('./data/train_data.csv')
y_train = pd.read_csv('./data/train_labels.csv')
x_test = pd.read_csv('./data/test_data.csv')

# 1. Preprocess train data and save to file
## Assign id to each unique questions

In [4]:
x_train, x_test = unique_question_map(x_train, x_test)

## Combine train data into one dataframe and clean

In [5]:
df_train = x_train.drop('is_duplicate',1)
df_train = df_train.merge(y_train,on="id")

df_train = df_train[['question1', 'qid1', 'question2', 'qid2','is_duplicate']].dropna()
df_train['question1'] = df_train['question1'].apply(preprocess) 
df_train['question2'] = df_train['question2'].apply(preprocess)

df_train.head()

Unnamed: 0,question1,qid1,question2,qid2,is_duplicate
0,what is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,241804,0
1,what is the story of kohinoor kohinoor diamond,1,what would happen if the indian government sto...,13483,0
2,how can i increase the speed of my internet co...,2,how can internet speed be increased by hacking...,241805,0
3,why am i mentally very lonely how can i solve it,3,find the remainder when math two three two fo...,241806,0
4,which one dissolve in water quikly sugar salt ...,4,which fish would survive in salt water,241807,0


## Some exploration

In [7]:
print('Total number of question pairs for training: {}'.format(len(df_train)))
print('Duplicate pairs: {}%'.format(round(df_train['is_duplicate'].mean()*100, 2)))
print('Non-duplicate pairs: {}%'.format(100.00 - round(df_train['is_duplicate'].mean()*100, 2)))

print('Number of unique questions: {}'.format(len(pd.concat([df_train['question1'], df_train['question2']]).unique())))

Total number of question pairs for training: 323162
Duplicate pairs: 36.88%
Non-duplicate pairs: 63.12%
Number of unique questions: 448622


# 2. Extract unique questions from train data

In [8]:
unique_questions = pd.concat([df_train['question1'], df_train['question2']]).fillna("")
unique_questions = pd.DataFrame(unique_questions.unique())

In [9]:
print('Number of unique questions: {}'.format(len(unique_questions)))

Number of unique questions: 448622


# 3. Save to file

In [10]:
df_train.to_pickle("./data/df_train.pkl")

In [11]:
unique_questions.apply(preprocess).head()

0    what is the step by step guide to invest in sh...
dtype: object

In [11]:
unique_questions.to_pickle("./data/unique_questions.pkl")