In [90]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [23]:
train = pd.read_csv('train.csv')

In [27]:
train = train[~train['question2'].isnull()]

In [155]:
##############################
#Edit Distance Function
##############################
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from pylev import levenshtein

def homogenized(strings):
    lengths = [len(s) for s in strings]
    n = max(lengths)
    for s in strings:
        k = len(s)
        yield [k] + [ord(c) for c in s] + [0] * (n - k)

def dehomogenized(points):
    for p in points:
        k = int(p[0])
        yield ''.join(chr(int(x)) for x in p[1:k+1])

def mylev(u, v):
    return levenshtein(*list(dehomogenized((u, v))))

strings = ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']
points = np.array(list(homogenized(strings)))

print 'homogenized:\n%s\n' % points
print 'dehomogenized:\n%s\n' % list(dehomogenized(points))
print 'distances:\n%s\n' % pairwise_distances(points, metric=mylev)

homogenized:
[[  9  99 105 116 121  98 108 111  99 107]
 [  6  99 111 115 105 110 101   0   0   0]
 [  9 101 117  99 108 105 100 101  97 110]
 [  2 108  49   0   0   0   0   0   0   0]
 [  2 108  50   0   0   0   0   0   0   0]
 [  9 109  97 110 104  97 116 116  97 110]]

dehomogenized:
['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']

distances:
[[ 0.  8.  9.  8.  8.  9.]
 [ 8.  0.  7.  6.  6.  9.]
 [ 9.  7.  0.  8.  8.  7.]
 [ 8.  6.  8.  0.  1.  9.]
 [ 8.  6.  8.  1.  0.  9.]
 [ 9.  9.  7.  9.  9.  0.]]



Try the logit regression model

In [162]:
lev_distance_strings = [([a,b], c) for a,b,c in zip(train['question1'], train['question2'], train['is_duplicate'])]
lev_distance = [
    (float(levenshtein(pair[0][0], pair[0][1])) /
    (float(sum([x.count('') for x in pair[0][0]])) + 
    float(sum([x.count('') for x in pair[0][1]]))), 
     pair[1]) for pair in lev_distance_strings 
]

In [189]:
lev_df = pd.DataFrame({'lev_distance': [tup[0] for tup in lev_distance], 
                       'is_duplicate': [tup[1] for tup in lev_distance]})
lev_df.to_csv('lev_scores.csv')

In [184]:
X_train, X_test, y_train, y_test = tts(lev_df['lev_distance'], lev_df['is_duplicate'], test_size=0.2)
log_clf = RandomForestClassifier()
X_train = X_train.reshape((len(X_train),1)) 
y_train = y_train.reshape((len(y_train),1)) 
log_clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [185]:
test_results = log_clf.predict(X_test.reshape((len(X_test),1)) )
print(classification_report(y_test,test_results))

             precision    recall  f1-score   support

          0       0.73      0.75      0.74     51032
          1       0.55      0.53      0.54     29826

avg / total       0.67      0.67      0.67     80858



In [188]:
print lev_df[lev_df['lev_distance'] <= 0.15][0:10]
print train.loc[1316,]['question1']
print train.loc[1316,]['question2']

    is_duplicate  lev_distance
0              0      0.036585
5              1      0.127841
7              1      0.140845
8              0      0.055556
11             1      0.086420
12             1      0.060000
13             1      0.030864
14             0      0.010676
16             1      0.008772
19             0      0.039474
Which is the best destination for honeymoon in December within a budget of INR 2 lakhs for a couple?
Which is the best destination for honeymoon in december within a budget of INR 4 lakhs for a couple?
