In [2]:
import pandas as pd
from pandas import DataFrame
import itertools

In [3]:
questions = pd.read_csv("questions.csv")
paragraphs = pd.read_csv("paragraphs.csv")
sample = pd.read_csv("sample.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
# Test data
# It pairs all test question ids with all available paragraphs.
# 
# A question-paragraph id is generated by concatenating the id of the question to the symbol “#”
# and then to the paragraph id.
# 
# You will need to parse it to extract the two identifiers (e.g. question-id and paragraph-id). 
# 
# Format:
# id – identifier for question-paragraph pair 
# qpid – (question-id # para-id) – combined id or question and paragraph

# Pre-process training data

# Split qpid into qid and ParaId
test['qid'], test['ParaId'] = test['qpid'].str.split('#', 1).str

# Cast question id columns and paragraph id columns to int
test['qid'] = test['qid'].astype(int)
test['ParaId'] = test['ParaId'].astype(int)
train['qid'] = train['qid'].astype(int)
train['ParaId'] = train['ParaId'].astype(int)

# Join questions and paragraphs together using train dataset

qpid = pd.merge(train, questions, left_on='qid', right_on='qid', how='left')
qpid = pd.merge(qpid, paragraphs, left_on='ParaId', right_on='ParaId', how='left')
qpid.head()

Unnamed: 0,qid,ParaId,qtext,Chapter,Title,SectionTitle,SubsectionTitle,Source,FeatureType,ParaText
0,580,21,Can I delay paying taxes on my advanced commis...,Chapter 5,"Wages, salaries, and other earnings",Employee Compensation,Miscellaneous Compensation,EY,EXPLANATION,"In some cases, an advance payment of a commiss..."
1,1188,323,I received a dividend from a corporation that ...,Chapter 8,Dividends and other corporate distributions,Ordinary Dividends,Qualified Dividends,TaxCode,TAXCODE,Qualified dividends are the ordinary dividends...
2,1072,226,If I purchased but did not cash a coowned bond...,Chapter 7,Interest income,Taxable Interest,U.S. Savings Bonds,TaxCode,TAXCODE,Co-owners. If a U.S. savings bond is issued in...
3,15,499,I work for a state or local government or a ta...,Chapter 10,"Retirement plans, pensions, and annuities",General Information,,TaxCode,TAXCODE,More than one program. If you receive benefits...
4,589,28,My employer promised a bonusaward Do I need to...,Chapter 5,"Wages, salaries, and other earnings",Employee Compensation,Miscellaneous Compensation,TaxCode,TAXCODE,Bonuses and awards. If you receive a bonus or ...


In [6]:
# Creating training data as a product of all questions and paragraphs;

product = pd.DataFrame([train['qid'], train['ParaId']]).T
product = pd.DataFrame([e for e in itertools.product(product.qid, product.ParaId)], columns=product.columns)

# Identifying if pair question-paragraph is a target ('target' means it was present in provided train dataset)
temp = pd.merge(product, qpid, on=['qid', 'ParaId'], how='left')
temp['target'] = (temp['ParaText'].isnull() == False).astype(int)

# Using 'target' in the final training dataset
train_data = pd.merge(product, questions, on='qid', how='left')
train_data = pd.merge(train_data, paragraphs, on='ParaId', how='left')
train_data['target'] = temp['target']
train_data.head()

Unnamed: 0,qid,ParaId,qtext,Chapter,Title,SectionTitle,SubsectionTitle,Source,FeatureType,ParaText,target
0,580,21,Can I delay paying taxes on my advanced commis...,Chapter 5,"Wages, salaries, and other earnings",Employee Compensation,Miscellaneous Compensation,EY,EXPLANATION,"In some cases, an advance payment of a commiss...",1
1,580,323,Can I delay paying taxes on my advanced commis...,Chapter 8,Dividends and other corporate distributions,Ordinary Dividends,Qualified Dividends,TaxCode,TAXCODE,Qualified dividends are the ordinary dividends...,0
2,580,226,Can I delay paying taxes on my advanced commis...,Chapter 7,Interest income,Taxable Interest,U.S. Savings Bonds,TaxCode,TAXCODE,Co-owners. If a U.S. savings bond is issued in...,0
3,580,499,Can I delay paying taxes on my advanced commis...,Chapter 10,"Retirement plans, pensions, and annuities",General Information,,TaxCode,TAXCODE,More than one program. If you receive benefits...,0
4,580,28,Can I delay paying taxes on my advanced commis...,Chapter 5,"Wages, salaries, and other earnings",Employee Compensation,Miscellaneous Compensation,TaxCode,TAXCODE,Bonuses and awards. If you receive a bonus or ...,0


In [11]:
# Prepare test data in the same fashion

test_data = pd.merge(product, questions, on='qid', how='left')
test_data = pd.merge(test_data, paragraphs, on='ParaId', how='left')
test_data.head()

Unnamed: 0,qid,ParaId
count,456976.0,456976.0
mean,979.181953,367.304734
std,578.592589,218.421759
min,15.0,2.0
25%,471.75,144.0
50%,941.0,419.5
75%,1530.25,565.0
max,1935.0,712.0


In [12]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456976 entries, 0 to 456975
Data columns (total 10 columns):
qid                456976 non-null int64
ParaId             456976 non-null int64
qtext              456976 non-null object
Chapter            456976 non-null object
Title              456976 non-null object
SectionTitle       440076 non-null object
SubsectionTitle    194688 non-null object
Source             456976 non-null object
FeatureType        456976 non-null object
ParaText           456976 non-null object
dtypes: int64(2), object(8)
memory usage: 38.4+ MB
