In [1]:
#@author : https://github.com/nitish11  
#@decription : finding relationship betwen two sentences 
#@date : 31st June, 2017'

## Objective : 

* Text classification for given two columns of data *train_data[[1,2]]*.
* No. of classes = 5 *[' ForwardEntailment', ' Independent', ' Equivalence', ' ReverseEntailment', ' OtherRelated']*

* Other columns contain extra  information about the two segments of sentences

## Approach 

### Using classical method of classification

* Preprcoessing of columns of the given data
    * Extract the numeral information from the data given as extra columns
    * Store only these columns as training data
* Use the extracted information from training data and perform classical methods (Random Forest, XGboost) for classification. 


### Deep Learning Solution 

* Use only starting 2 columns (text_data) and do pre-processing.
* Use word2vec or Glove for representing text as vectors.
* Use the vectors and do classiication using differen models as LSTM.

### Loading the data and preprocessing 

In [2]:
#import modules
import pandas as pd
import numpy as np

In [3]:
#Loading training data
ppdb_train_data = pd.read_csv("ppdb.train.csv", header=None, sep=',')

print("-- keys : ",ppdb_train_data.keys())
print("---text_data:",ppdb_train_data[[1,2]].head(3))
print("--- Results :",ppdb_train_data[17].unique())
print("--- #of rows:",len(ppdb_train_data.index))

('-- keys : ', Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], dtype='int64'))
('---text_data:',                  1                     2
0   used to treat              to treat 
1    education is    education programs 
2    are reviewed      are under review )
('--- Results :', array([' ForwardEntailment', ' Independent', ' Equivalence',
       ' ReverseEntailment', ' OtherRelated'], dtype=object))
('--- #of rows:', 1000)


In [4]:
#Displaying one data-point 
print "-"*80
for key in ppdb_train_data.columns:
    print key,":", ppdb_train_data.iloc[0][key]


--------------------------------------------------------------------------------
0 : [VP/NP] 
1 :  used to treat 
2 :  to treat 
3 :  PPDB2.0Score=4.76476 PPDB1.0Score=12.744950 -logp(LHS
4 : e1)=0.30786 -logp(LHS
5 : e2)=0.89456 -logp(e1
6 : LHS)=14.95995 -logp(e1
7 : e2)=7.90234 -logp(e1
8 : e2,LHS)=6.75647 -logp(e2
9 : LHS)=12.48692 -logp(e2
10 : e1)=4.84261 -logp(e2
11 : e1,LHS)=4.28343 AGigaSim=0.80561 Abstract=0 Adjacent=0 CharCountDiff=-5 CharLogCR=-0.48551 ContainsX=0 Equivalence=0.088485 Exclusion=0.047945 GlueRule=0 GoogleNgramSim=0.33742 Identity=0 Independent=0.133904 Lex(e1
12 : e2)=63.30685 Lex(e2
13 : e1)=63.30685 Lexical=1 LogCount=0 MVLSASim=NA Monotonic=1 OtherRelated=0.018057 PhrasePenalty=1 RarityPenalty=0.01832 ForwardEntailment=0.711609 SourceTerminalsButNoTarget=0 SourceWords=3 TargetComplexity=0.97133 TargetTerminalsButNoSource=0 TargetWords=2 UnalignedSource=0 UnalignedTarget=0 WordCountDiff=-1 WordLenDiff=-0.16667 WordLogCR=-0.40547 
14 : nan
15 : nan
16 :  0-

### Confusing data points 

#### Reference : the above columns 

* Columns 4th, 5th, 6th, 7th, 8th, 9th and 10th are very confusing and not making sense.
* I think there is an issue in making this csv file.

In [5]:
#training_data preparation 
columns = ["text1", "text2", "result"]

#Creating blank dataframe
dummy_data = np.random.randn(len(ppdb_train_data.index),len(columns))
train_data = pd.DataFrame(dummy_data, columns=columns)

In [6]:
train_data.head(2)

Unnamed: 0,text1,text2,result
0,1.871196,0.752788,-0.172124
1,-0.550445,-0.11368,0.701426


In [7]:
# Preprocessing of result column
print "-- Unique Results : ",ppdb_train_data[17].unique()
print pd.value_counts(ppdb_train_data[17])

for index,element in enumerate(ppdb_train_data[17].unique()):
    indices = ppdb_train_data[ppdb_train_data[17] == element].index.tolist()
    train_data.loc[indices, "result"] = index

print "-- Updated Results : ",train_data["result"].unique()
print pd.value_counts(train_data["result"])

-- Unique Results :  [' ForwardEntailment' ' Independent' ' Equivalence' ' ReverseEntailment'
 ' OtherRelated']
 Equivalence          303
 ReverseEntailment    301
 Independent          212
 ForwardEntailment    176
 OtherRelated           8
Name: 17, dtype: int64
-- Updated Results :  [ 0.  1.  2.  3.  4.]
2.0    303
3.0    301
1.0    212
0.0    176
4.0      8
Name: result, dtype: int64


### Processing of each column

In [8]:
#Processing of text column
train_data["text1"] = ppdb_train_data[1]
train_data["text2"] = ppdb_train_data[2]

#Adding the two columns of text
train_data["text"] = train_data["text1"]+' '+train_data["text2"]  
train_data["result"] = train_data["result"].astype('int')

In [9]:
train_data.head(2)

Unnamed: 0,text1,text2,result,text
0,used to treat,to treat,0,used to treat to treat
1,education is,education programs,1,education is education programs


In [15]:
"""Example of Estimator for DNN-based text classification with DBpedia data."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import numpy as np
import pandas
from sklearn import metrics
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import encoders

learn = tf.contrib.learn

MAX_DOCUMENT_LENGTH = 10
EMBEDDING_SIZE = 50
n_words = 0
num_classes = 5

In [16]:
def rnn_model(features, target):
    global num_classes
    """RNN model to predict from sequence of words to a class."""
    # Convert indexes of words into embeddings.
    # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
    # maps word indexes of the sequence into [batch_size, sequence_length,
    # EMBEDDING_SIZE].
    word_vectors = tf.contrib.layers.embed_sequence(
      features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE, scope='words')

    # Split into list of embedding per word, while removing doc length dim.
    # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
    word_list = tf.unstack(word_vectors, axis=1)

    # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
    cell = tf.contrib.rnn.GRUCell(EMBEDDING_SIZE)

    # Create an unrolled Recurrent Neural Networks to length of
    # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit.
    _, encoding = tf.contrib.rnn.static_rnn(cell, word_list, dtype=tf.float32)

    # Given encoding of RNN, take encoding of last step (e.g hidden size of the
    # neural network of last step) and pass it as features for logistic
    # regression over output classes.
    target = tf.one_hot(target, num_classes, 1, 0)
    logits = tf.contrib.layers.fully_connected(encoding, num_classes, activation_fn=None)
    loss = tf.contrib.losses.softmax_cross_entropy(logits, target)

    # Create a training op.
    train_op = tf.contrib.layers.optimize_loss(
      loss,
      tf.contrib.framework.get_global_step(),
      optimizer='Adam',
      learning_rate=0.01)

    return ({
      'class': tf.argmax(logits, 1),
      'prob': tf.nn.softmax(logits)
    }, loss, train_op)



In [17]:
pd.DataFrame(train_data["text"])

Unnamed: 0,text
0,used to treat to treat
1,education is education programs
2,are reviewed are under review
3,"yeah , it 's yes , it is"
4,is no doubt is no dispute
5,it is essential for it is necessary for
6,approved for sale approved for use
7,rights and liberties of rights and freedoms...
8,s. alber delivered his opinion alber delive...
9,these exist they are available


In [18]:
def classify(unused_argv):
    global n_words, train_data
    # Prepare training and testing data
    # x_train =  pd.DataFrame(train_data[["text1","text2"]])
    x_train = pd.DataFrame(train_data["text"])
    y_train = pd.Series(train_data["result"].values)
    x_test =  pd.DataFrame(train_data["text"].head(10))
    y_test =  pd.Series(train_data["result"].values[:10])
    
    
    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)

    x_transform_train = vocab_processor.fit_transform(x_train)
    x_transform_test = vocab_processor.transform(x_test)

    x_train = np.array(list(x_transform_train))
    x_test = np.array(list(x_transform_test))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Build model
    model_fn = rnn_model
    classifier = learn.Estimator(model_fn=model_fn)

    # Train and predict
    classifier.fit(x_train, y_train, steps=100)
    print("----------------classifier.predict(x_test)\n")
    for p in classifier.predict(x_test):
        print("---------------------",p)
    
    y_predicted = [ p['class'] for p in classifier.predict(x_test) ]
    
    print("-- x_train     ",np.shape(x_train))
    print("-- x_test     ",np.shape(x_test))
    
    print("-- y_train     ",np.shape(y_train))
    print("-- y_test     ",np.shape(y_test))
    print("-- y_predicted",np.shape(y_predicted))
    
    print("-- y_test     ",y_test)
    print("-- y_predicted",y_predicted)
    
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))

In [None]:
tf.app.run(main=classify, argv=[])

Total words: 2
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f716032e290>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Ar