In [1]:
#@author : https://github.com/nitish11  
#@decription : finding relationship betwen two sentences 
#@date : 31st June, 2017'

## Objective : 

* Text classification for given two columns of data *train_data[[1,2]]*.
* No. of classes = 5 *[' ForwardEntailment', ' Independent', ' Equivalence', ' ReverseEntailment', ' OtherRelated']*

* Other columns contain extra  information about the two segments of sentences

## Approach 

### Using classical method of classification

* Preprcoessing of columns of the given data
    * Extract the numeral information from the data given as extra columns
    * Store only these columns as training data
* Use the extracted information from training data and perform classical methods (Random Forest, XGboost) for classification. 


### Deep Learning Solution 

* Use only starting 2 columns (text_data) and do pre-processing.
* Use word2vec or Glove for representing text as vectors.
* Use the vectors and do classiication using differen models as LSTM.

### Loading the data and preprocessing 

In [2]:
#import modules
import pandas as pd
import numpy as np

In [3]:
#Loading training data
ppdb_train_data = pd.read_csv("ppdb.train.csv", header=None, sep=',')

print("-- keys : ",ppdb_train_data.keys())
print("---text_data:",ppdb_train_data[[1,2]].head(3))
print("--- Results :",ppdb_train_data[17].unique())
print("--- #of rows:",len(ppdb_train_data.index))

('-- keys : ', Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], dtype='int64'))
('---text_data:',                  1                     2
0   used to treat              to treat 
1    education is    education programs 
2    are reviewed      are under review )
('--- Results :', array([' ForwardEntailment', ' Independent', ' Equivalence',
       ' ReverseEntailment', ' OtherRelated'], dtype=object))
('--- #of rows:', 1000)


In [4]:
#Displaying one data-point 
print "-"*80
for key in ppdb_train_data.columns:
    print key,":", ppdb_train_data.iloc[0][key]


--------------------------------------------------------------------------------
0 : [VP/NP] 
1 :  used to treat 
2 :  to treat 
3 :  PPDB2.0Score=4.76476 PPDB1.0Score=12.744950 -logp(LHS
4 : e1)=0.30786 -logp(LHS
5 : e2)=0.89456 -logp(e1
6 : LHS)=14.95995 -logp(e1
7 : e2)=7.90234 -logp(e1
8 : e2,LHS)=6.75647 -logp(e2
9 : LHS)=12.48692 -logp(e2
10 : e1)=4.84261 -logp(e2
11 : e1,LHS)=4.28343 AGigaSim=0.80561 Abstract=0 Adjacent=0 CharCountDiff=-5 CharLogCR=-0.48551 ContainsX=0 Equivalence=0.088485 Exclusion=0.047945 GlueRule=0 GoogleNgramSim=0.33742 Identity=0 Independent=0.133904 Lex(e1
12 : e2)=63.30685 Lex(e2
13 : e1)=63.30685 Lexical=1 LogCount=0 MVLSASim=NA Monotonic=1 OtherRelated=0.018057 PhrasePenalty=1 RarityPenalty=0.01832 ForwardEntailment=0.711609 SourceTerminalsButNoTarget=0 SourceWords=3 TargetComplexity=0.97133 TargetTerminalsButNoSource=0 TargetWords=2 UnalignedSource=0 UnalignedTarget=0 WordCountDiff=-1 WordLenDiff=-0.16667 WordLogCR=-0.40547 
14 : nan
15 : nan
16 :  0-

### Confusing data points 

#### Reference : the above columns 

* Columns 4th, 5th, 6th, 7th, 8th, 9th and 10th are very confusing and not making sense.
* I think there is an issue in making this csv file.

In [5]:
#training_data preparation 
columns = ["text_type","ppdb1_score","ppdb2_score", "logp_lhs_e1", "logp_lhs_e2", "logp_e1_lhs", 
           "logp_e2_lhs", "logp_e1_e2", "logp_e1_e2_lhs", "logp_e2_e1", "logp_e2_e1_lhs",
           "AGigaSim", "GoogleNgramSim", "result"]

#Creating blank dataframe
dummy_data = np.random.randn(len(ppdb_train_data.index),len(columns))
train_data = pd.DataFrame(dummy_data, columns=columns)

In [6]:
train_data.head(2)

Unnamed: 0,text_type,ppdb1_score,ppdb2_score,logp_lhs_e1,logp_lhs_e2,logp_e1_lhs,logp_e2_lhs,logp_e1_e2,logp_e1_e2_lhs,logp_e2_e1,logp_e2_e1_lhs,AGigaSim,GoogleNgramSim,result
0,-0.483986,0.408002,0.767297,0.435509,0.232661,-0.946388,0.228995,0.914758,-0.03176,0.435317,-1.280009,-0.624575,0.894714,-0.004533
1,0.631244,-0.518553,-1.425335,0.53189,0.052497,-0.561156,1.401802,0.440235,0.33699,0.173777,-0.029438,2.251837,-0.127533,-0.582025


In [7]:
# Preprocessing of result column
print "-- Unique Results : ",ppdb_train_data[17].unique()
print pd.value_counts(ppdb_train_data[17])

for index,element in enumerate(ppdb_train_data[17].unique()):
    indices = ppdb_train_data[ppdb_train_data[17] == element].index.tolist()
    train_data.loc[indices, "result"] = index

print "-- Updated Results : ",train_data["result"].unique()
print pd.value_counts(train_data["result"])

-- Unique Results :  [' ForwardEntailment' ' Independent' ' Equivalence' ' ReverseEntailment'
 ' OtherRelated']
 Equivalence          303
 ReverseEntailment    301
 Independent          212
 ForwardEntailment    176
 OtherRelated           8
Name: 17, dtype: int64
-- Updated Results :  [ 0.  1.  2.  3.  4.]
2.0    303
3.0    301
1.0    212
0.0    176
4.0      8
Name: result, dtype: int64


### Processing of each column

In [8]:
#Processing of 0th column
# print "---"*30
# print pd.value_counts(ppdb_train_data[0])
for index,element in enumerate(ppdb_train_data[0].unique()):
    indices = ppdb_train_data[ppdb_train_data[0] == element].index.tolist()
    train_data.loc[indices, "text_type"] = index

# print "---"*30
# print pd.value_counts(train_data["text_type"])

In [9]:
train_data.head(2)

Unnamed: 0,text_type,ppdb1_score,ppdb2_score,logp_lhs_e1,logp_lhs_e2,logp_e1_lhs,logp_e2_lhs,logp_e1_e2,logp_e1_e2_lhs,logp_e2_e1,logp_e2_e1_lhs,AGigaSim,GoogleNgramSim,result
0,0.0,0.408002,0.767297,0.435509,0.232661,-0.946388,0.228995,0.914758,-0.03176,0.435317,-1.280009,-0.624575,0.894714,0.0
1,1.0,-0.518553,-1.425335,0.53189,0.052497,-0.561156,1.401802,0.440235,0.33699,0.173777,-0.029438,2.251837,-0.127533,1.0


In [10]:
for index,element in enumerate(ppdb_train_data[3]):
    ppdb1 = element.split('=')[2].split(' ')[0]
    ppdb2 = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "ppdb1_score"] = ppdb1
    train_data.loc[index, "ppdb2_score"] = ppdb2

In [11]:
train_data.head(2)

Unnamed: 0,text_type,ppdb1_score,ppdb2_score,logp_lhs_e1,logp_lhs_e2,logp_e1_lhs,logp_e2_lhs,logp_e1_e2,logp_e1_e2_lhs,logp_e2_e1,logp_e2_e1_lhs,AGigaSim,GoogleNgramSim,result
0,0.0,12.74495,4.76476,0.435509,0.232661,-0.946388,0.228995,0.914758,-0.03176,0.435317,-1.280009,-0.624575,0.894714,0.0
1,1.0,24.04632,4.68897,0.53189,0.052497,-0.561156,1.401802,0.440235,0.33699,0.173777,-0.029438,2.251837,-0.127533,1.0


In [12]:
#Extracting features from the csv

for index,element in enumerate(ppdb_train_data[4]):
    logp_lhs_e1 = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_lhs_e1"] = logp_lhs_e1

for index,element in enumerate(ppdb_train_data[5]):
    logp_lhs_e2 = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_lhs_e2"] = logp_lhs_e2
    
for index,element in enumerate(ppdb_train_data[6]):
    logp_e1_lhs = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_e1_lhs"] = logp_e1_lhs
    
for index,element in enumerate(ppdb_train_data[7]):
    logp_e1_e2 = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_e1_e2"] = logp_e1_e2

for index,element in enumerate(ppdb_train_data[8]):
    logp_e1_e2_lhs = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_e1_e2_lhs"] = logp_e1_e2_lhs
    
for index,element in enumerate(ppdb_train_data[9]):
    logp_e2_lhs = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_e2_lhs"] = logp_e2_lhs
    
for index,element in enumerate(ppdb_train_data[10]):
    logp_e2_e1 = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_e2_e1"] = logp_e2_e1

for index,element in enumerate(ppdb_train_data[11]):
    logp_e2_e1_lhs = element.split('=')[1].split(' ')[0]
    train_data.loc[index, "logp_e2_e1_lhs"] = logp_e2_e1_lhs

for index,element in enumerate(ppdb_train_data[11]):
    AGigaSim = element.split('=')[2].split(' ')[0]
    train_data.loc[index, "AGigaSim"] = AGigaSim

for index,element in enumerate(ppdb_train_data[11]):
    GoogleNgramSim = element.split('=')[11].split(' ')[0]
    train_data.loc[index, "GoogleNgramSim"] = GoogleNgramSim



In [13]:
train_data.head(2)

Unnamed: 0,text_type,ppdb1_score,ppdb2_score,logp_lhs_e1,logp_lhs_e2,logp_e1_lhs,logp_e2_lhs,logp_e1_e2,logp_e1_e2_lhs,logp_e2_e1,logp_e2_e1_lhs,AGigaSim,GoogleNgramSim,result
0,0.0,12.74495,4.76476,0.30786,0.89456,14.95995,12.48692,7.90234,6.75647,4.84261,4.28343,0.80561,0.33742,0.0
1,1.0,24.04632,4.68897,0.15099,1.28093,11.88724,14.59836,11.23257,7.19074,12.81375,9.90186,0.535,0.0,1.0


In [15]:
## Implementing a random forest ##

from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

#using only the features to be used for training
features = [column for column in columns if column is not "result"]

# Initialize our algorithm with the default paramters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
scores = cross_validation.cross_val_score(alg, train_data[features],train_data["result"],cv=3).mean()
print(scores)

0.424995227564


### Next Steps

* Try out different classical methods of classification : https://github.com/nitish11/Kaggle-submissions/blob/master/red-hat-business-value_train.py
* Hyper parameter tuning for different algorithm 
* Ensemble method : to use combination of algorithms for classification 
* Adding more features from CSV
* Selecting best features from training data "Feature Selection"