In [218]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

import pandas as pd

In [320]:


questions = {}
pairs_train = []
y_train = []
with open('train.csv','r',encoding='utf8') as f:
    for line in f:
        l = line.split(',')
        if l[1] not in questions:
            questions[l[1]] = l[3]
        if l[2] not in questions:
            questions[l[2]] = l[4]

        pairs_train.append([l[1], l[2]])
        y_train.append(int(l[5][:-1]))

pairs_test = []
with open('test.csv','r',encoding='utf8') as f:
    for line in f:
        l = line.split(',')
        if l[1] not in questions:
            questions[l[1]] = l[3]
        if l[2] not in questions:
            questions[l[2]] = l[4][:-1]

        pairs_test.append([l[1], l[2]])


ids2ind = {}
for qid in questions:
    ids2ind[qid] = len(ids2ind) 

vec = TfidfVectorizer()
A = vec.fit_transform(questions.values())


pairs_train_everything = pairs_train
pairs_test_submission = pairs_test


In [194]:
questions

{'199954': '"What are the some of the best novels?"',
 '384085': '"What are some of the greatest novels of all time? Why are they great?"',
 '128681': '"What are the pictures that made you look twice?"',
 '237407': '"What are some amazing pictures one has to see twice to understand?"',
 '170846': '"Have the ellectoral college members ever voted differently then the popular vote suggested they should vote?"',
 '240621': '"When has the electoral college voted against the popular vote?"',
 '55110': '"Did Ravana really have 10 heads?"',
 '177468': '"Why did Ravana have 10 heads?"',
 '425513': '"What\'s a book that you feel helped you to improve intellectually?"',
 '400256': '"What books or magazines should I read to improve my English?"',
 '105990': '"Is astrology true? Should we believe it or not?"',
 '256943': '"Should you believe in astrology and astrologers (pandits)?"',
 '366314': '"What are some of the biggest lies that you ever told?"',
 '224793': '"What is the biggest lie ever told

In [321]:
len(list(questions.values()))

58940

In [173]:
corpus = [questions[k] for k in questions]

In [174]:
corpus

['"What are the some of the best novels?"',
 '"What are some of the greatest novels of all time? Why are they great?"',
 '"What are the pictures that made you look twice?"',
 '"What are some amazing pictures one has to see twice to understand?"',
 '"Have the ellectoral college members ever voted differently then the popular vote suggested they should vote?"',
 '"When has the electoral college voted against the popular vote?"',
 '"Did Ravana really have 10 heads?"',
 '"Why did Ravana have 10 heads?"',
 '"What\'s a book that you feel helped you to improve intellectually?"',
 '"What books or magazines should I read to improve my English?"',
 '"Is astrology true? Should we believe it or not?"',
 '"Should you believe in astrology and astrologers (pandits)?"',
 '"What are some of the biggest lies that you ever told?"',
 '"What is the biggest lie ever told by any government?"',
 '"How can I advertise my YouTube Channel to get more views?"',
 '"How can I get more traffic to my YouTube videos?"

In [322]:
Pourcentage_test = 0.05

pairs_split_train, pairs_split_test = train_test_split(np.c_[ np.array(pairs_train), np.array(y_train) ], test_size=Pourcentage_test, random_state=42)

print(pairs_split_train.shape)
print(pairs_split_test.shape)

(76095, 3)
(4005, 3)


In [323]:
pairs_train = pairs_split_train[:,0:2]
y_train = pairs_split_train[:,2]
pairs_test = pairs_split_test[:,0:2]
y_test = pairs_split_test[:,2]

# cleaning

https://www.kaggle.com/currie32/the-importance-of-cleaning-text
https://towardsdatascience.com/finding-similar-quora-questions-with-bow-tfidf-and-random-forest-c54ad88d1370

In [215]:
SPECIAL_TOKENS = {
    'quoted': 'quoted_item',
    'non-ascii': 'non_ascii_word',
    'undefined': 'something'
}

def clean(text, stem_words=True):
    import re
    from string import punctuation
    from nltk.stem import SnowballStemmer
    from nltk.corpus import stopwords
    
    def pad_str(s):
        return ' '+s+' '
    
    if pd.isnull(text):
        return ''

#    stops = set(stopwords.words("english"))
    # Clean the text, with the option to stem words.
    
    # Empty question
    
    if type(text) != str or text=='':
        return ''

    # Clean the text
    text = re.sub("\'s", " ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub("(\d+)(kK)", " \g<1>000 ", text)
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)
    
    # remove comma between numbers, i.e. 15,000 -> 15000
    
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    
#     # all numbers should separate from words, this is too aggressive
    
#     def pad_number(pattern):
#         matched_string = pattern.group(0)
#         return pad_str(matched_string)
#     text = re.sub('[0-9]+', pad_number, text)
    
    # add padding to punctuations and special chars, we still need them later
    
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)
    
#    def pad_pattern(pattern):
#        matched_string = pattern.group(0)
#       return pad_str(matched_string)
#    text = re.sub('[\!\?\@\^\+\*\/\,\~\|\`\=\:\;\.\#\\\]', pad_pattern, text) 
        
    text = re.sub('[^\x00-\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) # replace non-ascii word with special word
    
    # indian dollar
    
    text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)
    
    # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text
    text = re.sub(r" (the[\s]+|The[\s]+)?US(A)? ", " America ", text)
    text = re.sub(r" UK ", " England ", text, flags=re.IGNORECASE)
    text = re.sub(r" india ", " India ", text)
    text = re.sub(r" switzerland ", " Switzerland ", text)
    text = re.sub(r" china ", " China ", text)
    text = re.sub(r" chinese ", " Chinese ", text) 
    text = re.sub(r" imrovement ", " improvement ", text, flags=re.IGNORECASE)
    text = re.sub(r" intially ", " initially ", text, flags=re.IGNORECASE)
    text = re.sub(r" quora ", " Quora ", text, flags=re.IGNORECASE)
    text = re.sub(r" dms ", " direct messages ", text, flags=re.IGNORECASE)  
    text = re.sub(r" demonitization ", " demonetization ", text, flags=re.IGNORECASE) 
    text = re.sub(r" actived ", " active ", text, flags=re.IGNORECASE)
    text = re.sub(r" kms ", " kilometers ", text, flags=re.IGNORECASE)
    text = re.sub(r" cs ", " computer science ", text, flags=re.IGNORECASE) 
    text = re.sub(r" upvote", " up vote", text, flags=re.IGNORECASE)
    text = re.sub(r" iPhone ", " phone ", text, flags=re.IGNORECASE)
    text = re.sub(r" \0rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(r" calender ", " calendar ", text, flags=re.IGNORECASE)
    text = re.sub(r" ios ", " operating system ", text, flags=re.IGNORECASE)
    text = re.sub(r" gps ", " GPS ", text, flags=re.IGNORECASE)
    text = re.sub(r" gst ", " GST ", text, flags=re.IGNORECASE)
    text = re.sub(r" programing ", " programming ", text, flags=re.IGNORECASE)
    text = re.sub(r" bestfriend ", " best friend ", text, flags=re.IGNORECASE)
    text = re.sub(r" dna ", " DNA ", text, flags=re.IGNORECASE)
    text = re.sub(r" III ", " 3 ", text)
    text = re.sub(r" banglore ", " Banglore ", text, flags=re.IGNORECASE)
    text = re.sub(r" J K ", " JK ", text, flags=re.IGNORECASE)
    text = re.sub(r" J\.K\. ", " JK ", text, flags=re.IGNORECASE)
    
    # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word "number"
    
    text = re.sub('[0-9]+\.[0-9]+', " 87 ", text)
  
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation]).lower()
       # Return a list of words
    return text
    
#df['question1'] = df['question1'].apply(clean)
#df['question2'] = df['question2'].apply(clean)

In [324]:
questions_cleaned = {}
for qid in questions:
    questions_cleaned[qid] = clean(questions[qid])

In [325]:
 
vec_cleaned = TfidfVectorizer()
A_cleaned = vec_cleaned.fit_transform(questions_cleaned.values())

'what are the some of the best novels'

In [None]:
from sklearn.metrics import f1_score, classification_report, accuracy_score
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(pd.concat((df['question1'],df['question2'])).unique())
trainq1_trans = tfidf_vect_ngram_chars.transform(df['question1'].values)
trainq2_trans = tfidf_vect_ngram_chars.transform(df['question2'].values)
labels = df['is_duplicate'].values
X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))
y = labels
X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) 
xgb_prediction = xgb_model.predict(X_valid)
print('character level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))
print('character level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))
print(classification_report(y_valid, xgb_prediction))

In [None]:
list(questions.values())

# Features

In [326]:
N_train = len(pairs_train)
X_train = np.zeros((N_train, 3))

counter = 0
for i in range(len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    X_train[i,0] = cosine_similarity(A_cleaned[ids2ind[q1],:], A_cleaned[ids2ind[q2],:])
    X_train[i,1] = len(questions_cleaned[q1].split()) + len(questions_cleaned[q2].split())
    X_train[i,2] = abs(len(questions_cleaned[q1].split()) - len(questions_cleaned[q2].split()))
    counter += 1
    if counter % 1000 == True:
        print (counter, "training examples processsed")

N_test = len(pairs_test)
X_test = np.zeros((N_test, 3))

counter = 0
for i in range(len(pairs_test)):
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    X_test[i,0] = cosine_similarity(A_cleaned[ids2ind[q1],:], A_cleaned[ids2ind[q2],:])
    X_test[i,1] = len(questions_cleaned[q1].split()) + len(questions_cleaned[q2].split())
    X_test[i,2] = abs(len(questions_cleaned[q1].split()) - len(questions_cleaned[q2].split()))
    counter += 1
    if counter % 1000 == True:
        print (counter, "testing examples processsed")

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

In [118]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(corpus)]

max_epochs = 20
vec_size = 100
alpha = 0.025

modeldoc = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
modeldoc.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    modeldoc.train(tagged_data,
                total_examples=modeldoc.corpus_count,
                epochs=modeldoc.iter)
    # decrease the learning rate
    modeldoc.alpha -= 0.0002
    # fix the learning rate, no decay
    modeldoc.min_alpha = modeldoc.alpha



iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


In [137]:

#X_train = np.c_[ X_train, np.zeros([N_train,2]) ]

doc2vec_train1 = []
doc2vec_train2 = []
X_train
counter = 0
for i in range(len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    doc2vec_train1.append(modeldoc.infer_vector(questions[q1]))
    doc2vec_train2.append(modeldoc.infer_vector(questions[q2]))
    counter += 1
    if counter % 1000 == True:
        print (counter, "training examples processsed")
        
doc2vec_test1 = []
doc2vec_test2 = []
X_test = np.c_[ X_test, np.zeros([N_test,2]) ]
counter = 0
for i in range(len(pairs_test)):
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    doc2vec_test1.append(modeldoc.infer_vector(questions[q1]))
    doc2vec_test2.append(modeldoc.infer_vector(questions[q2]))
    counter += 1
    if counter % 1000 == True:
        print (counter, "training examples processsed")
        
        
X_train = np.c_[ X_train, np.array(doc2vec_train1),np.array(doc2vec_train2) ]
X_test = np.c_[ X_test, np.array(doc2vec_test1),np.array(doc2vec_test2) ]

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

In [275]:
import gensim
 
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [279]:
w = filter(lambda x: x in model.vocab, questions_cleaned[pairs_train[0][0]])

In [284]:
'ok' in model.vocab

True

In [296]:
def filter(tokens):
    tokens_filtered = []
    for x in tokens:
        if x in model.vocab:
            tokens_filtered.append(x)
    if not tokens_filtered:
        return(['empty'])
    return(tokens_filtered)

In [295]:
not [1]

False

In [291]:
filter(questions_cleaned[q1].split())

['how', 'can', 'i', 'write', 'blog', 'post', 'on']

In [327]:
text_sim_training = []
counter = 0
for i in range(len(pairs_train)): 
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    text_sim_training.append(model.n_similarity(filter(questions_cleaned[q1].split()),filter(questions_cleaned[q2].split())))
    counter += 1
    if counter % 1000 == True:
        print (counter, "training examples processsed")
text_sim_training

text_sim_testing = []
counter = 0
for i in range(len(pairs_test)): 
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    text_sim_testing.append(model.n_similarity(filter(questions_cleaned[q1].split()),filter(questions_cleaned[q2].split())))

    if counter % 1000 == True:
        print (counter, "testing examples processsed")
text_sim_testing

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

[0.9326621049345598,
 0.819878981154933,
 1.0000000000000002,
 0.8923301676893552,
 0.8519041061512101,
 0.6486055106799459,
 0.964390222992645,
 0.6467596784609834,
 0.9118286571949542,
 0.929760214647138,
 0.7794132384243071,
 0.9580420498469603,
 0.9660224845690211,
 0.8984868324809153,
 0.936637898486928,
 0.7238578566229555,
 0.9595545451922722,
 0.9457049500747813,
 0.6539569031580856,
 0.8594263826214501,
 0.7888270050638193,
 0.9464436404480508,
 0.9503720823662182,
 0.843460093613612,
 0.8990986502550224,
 0.8053488055766934,
 0.6807434403734584,
 0.7515080550258024,
 0.9168963030695235,
 0.8161923347001173,
 0.7515433961305503,
 0.7770844286999551,
 0.8258575321411353,
 0.8004212297147018,
 0.9757227284857248,
 0.9031849540888321,
 0.8904504896090162,
 0.878177804552345,
 0.8552721629043952,
 0.666873064072227,
 0.8869279191366575,
 0.7813014087345419,
 0.7357455402971971,
 0.9711790863044515,
 0.9711970741611472,
 0.8680279394795778,
 0.7350489022730828,
 0.9077346530141847,

In [328]:
np.array(text_sim_training).shape

(76095,)

In [332]:
X_train = np.c_[ X_train, np.array(text_sim_training) ]
X_test = np.c_[ X_test, np.array(text_sim_testing) ]

In [None]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate([questions_cleaned[k] for k in questions_cleaned])]

max_epochs = 20
vec_size = 100
alpha = 0.025

  
modeldoc.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    modeldoc.train(tagged_data,
                total_examples=modeldoc.corpus_count,
                epochs=modeldoc.iter)
    # decrease the learning rate
    modeldoc.alpha -= 0.0002
    # fix the learning rate, no decay
    modeldoc.min_alpha = modeldoc.alpha

In [145]:
#X_test = np.delete(X_test, [3,4], 1)

In [9]:
with open('features_training.csv', 'w') as f:
    f.write('Id,f1,f2,f3\n')
    for i in range(len(pairs_train)):
        f.write(str(i)+','+str(X_train[i,0])+','+str(X_train[i,1])+','+str(X_train[i,2])+'\n')
        
with open('features_testing.csv', 'w') as f:
    f.write('Id,f1,f2,f3\n')
    for i in range(len(pairs_test)):
        f.write(str(i)+','+str(X_test[i,0])+','+str(X_test[i,1])+','+str(X_test[i,2])+'\n')

# LDA

In [182]:
questions

{'199954': '"What are the some of the best novels?"',
 '384085': '"What are some of the greatest novels of all time? Why are they great?"',
 '128681': '"What are the pictures that made you look twice?"',
 '237407': '"What are some amazing pictures one has to see twice to understand?"',
 '170846': '"Have the ellectoral college members ever voted differently then the popular vote suggested they should vote?"',
 '240621': '"When has the electoral college voted against the popular vote?"',
 '55110': '"Did Ravana really have 10 heads?"',
 '177468': '"Why did Ravana have 10 heads?"',
 '425513': '"What\'s a book that you feel helped you to improve intellectually?"',
 '400256': '"What books or magazines should I read to improve my English?"',
 '105990': '"Is astrology true? Should we believe it or not?"',
 '256943': '"Should you believe in astrology and astrologers (pandits)?"',
 '366314': '"What are some of the biggest lies that you ever told?"',
 '224793': '"What is the biggest lie ever told

In [226]:
import nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import nltk.internals
from gensim import corpora, models

corpus_cleaned = [questions_cleaned[k] for k in questions_cleaned]


list_of_list_of_tokens =[]
for i in range(len(list(questions_cleaned.values()))):
    text=corpus_cleaned[i].lower()
    nltk_tokens = nltk.word_tokenize(text)
    stemmed_tokens=[]
    for w in nltk_tokens:
        if (len(w)>3):
            lemmatized_token=wordnet_lemmatizer.lemmatize(w)
            stemmed_tokens.append(porter_stemmer.stem(lemmatized_token))
    list_of_list_of_tokens.append(stemmed_tokens)

dictionary_LDA = corpora.Dictionary(list_of_list_of_tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpusLDA = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in list_of_list_of_tokens]

num_topics = 20
lda_model = models.LdaModel(corpusLDA, num_topics=num_topics, 
                                  id2word=dictionary_LDA, 
                                  passes=4, alpha=[0.01]*num_topics,
                                  eta=[0.01]*len(dictionary_LDA.keys()))
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(str(i)+": "+ topic)
    print()




  diff = np.log(self.expElogbeta)


0: 0.094*"that" + 0.088*"peopl" + 0.072*"there" + 0.051*"know" + 0.051*"someon" + 0.044*"what" + 0.041*"love" + 0.037*"some" + 0.026*"about" + 0.020*"with"

1: 0.117*"learn" + 0.080*"what" + 0.059*"best" + 0.044*"program" + 0.036*"languag" + 0.032*"which" + 0.032*"studi" + 0.030*"develop" + 0.022*"while" + 0.021*"drug"

2: 0.151*"your" + 0.110*"what" + 0.077*"life" + 0.032*"after" + 0.029*"chang" + 0.021*"day" + 0.018*"biggest" + 0.018*"from" + 0.018*"posit" + 0.018*"made"

3: 0.115*"what" + 0.050*"other" + 0.043*"major" + 0.039*"compani" + 0.035*"cultur" + 0.031*"comput" + 0.028*"hillari" + 0.027*"data" + 0.027*"clinton" + 0.025*"scienc"

4: 0.136*"what" + 0.087*"differ" + 0.078*"between" + 0.041*"lose" + 0.041*"engin" + 0.037*"weight" + 0.035*"possibl" + 0.028*"month" + 0.024*"servic" + 0.020*"them"

5: 0.083*"what" + 0.083*"about" + 0.080*"think" + 0.050*"stop" + 0.043*"compar" + 0.042*"doe" + 0.038*"video" + 0.030*"just" + 0.029*"still" + 0.022*"view"

6: 0.165*"doe" + 0.064*"phone

In [227]:
from numpy import dot
from numpy.linalg import norm

counter=0
lda_similarity_training=[]
for i in range (len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    all_topics_source=[0]*num_topics
    all_topics_target=[0]*num_topics
    topics_source= lda_model[corpusLDA[ids2ind[q1]]]
    topics_target=lda_model[corpusLDA[ids2ind[q2]]]
    for k in topics_source:
        all_topics_source[k[0]]=k[1]
    for k in topics_target:
        all_topics_target[k[0]]=k[1]
    cos_sim = dot(all_topics_source, all_topics_target)/(norm(all_topics_source)*norm(all_topics_target))
    lda_similarity_training.append(cos_sim)
    
    counter += 1
    if counter % 1000 == True:
        print(counter, "training examples processsed")
        
lda_similarity_testing=[]
for i in range (len(pairs_test)):
    q1 = pairs_test[i][0]
    q2 = pairs_test[i][1]
    
    all_topics_source=[0]*num_topics
    all_topics_target=[0]*num_topics
    topics_source= lda_model[corpusLDA[ids2ind[q1]]]
    topics_target=lda_model[corpusLDA[ids2ind[q2]]]
    for k in topics_source:
        all_topics_source[k[0]]=k[1]
    for k in topics_target:
        all_topics_target[k[0]]=k[1]
    cos_sim = dot(all_topics_source, all_topics_target)/(norm(all_topics_source)*norm(all_topics_target))
    lda_similarity_testing.append(cos_sim)
    
    counter += 1
    if counter % 1000 == True:
        print(counter, "training examples processsed")

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

In [228]:
X_train.shape

(56070, 3)

In [206]:
np.array(lda_similarity_training).shape
np.array(lda_similarity_testing).shape

(24030,)

In [229]:
X_train = np.c_[ X_train, np.array(lda_similarity_training) ]
X_test = np.c_[ X_test, np.array(lda_similarity_testing) ]

In [None]:
counter = 0
for i in range(len(pairs_train)):
    q1 = pairs_train[i][0]
    q2 = pairs_train[i][1]
    X_train[i,0] = cosine_similarity(A[ids2ind[q1],:], A[ids2ind[q2],:])
    X_train[i,1] = len(questions[q1].split()) + len(questions[q2].split())
    X_train[i,2] = abs(len(questions[q1].split()) - len(questions[q2].split()))
    counter += 1
    if counter % 1000 == True:
        print (counter, "training examples processsed")

In [168]:
lda_model[questions[pairs_train[0][0]]] 

ValueError: not enough values to unpack (expected 2, got 1)

# Prediction

In [109]:
import math
from sklearn.metrics import log_loss

In [256]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score


In [103]:
[int(x) for x in y_test.tolist()]

[0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,


In [107]:
y_pred[:,1].tolist()

[0.7058812860758572,
 0.679787579453902,
 0.670747400418775,
 0.6420274415709281,
 0.709794864204055,
 0.7015520398717969,
 0.7074093146268341,
 0.6568795138514849,
 0.7012992209783039,
 0.6441183381756939,
 0.24849749485336026,
 0.7079815939154763,
 0.670747400418775,
 0.6082631742626415,
 0.7074093146268341,
 0.6999685697859422,
 0.6186743463975011,
 0.7064698908450634,
 0.2769425859826244,
 0.70582807616984,
 0.6800291169464941,
 0.7102471830663575,
 0.411710167595385,
 0.7102471830663575,
 0.6282425939869447,
 0.7102471830663575,
 0.6527238429010536,
 0.6962469514479999,
 0.6909207557520426,
 0.7041156650186049,
 0.7043465736037355,
 0.6950750247938451,
 0.7102471830663575,
 0.7102471830663575,
 0.70582807616984,
 0.7074093146268341,
 0.6441183381756939,
 0.7015520398717969,
 0.6538913570188037,
 0.29075815805821037,
 0.708817553612967,
 0.7056113846062196,
 0.7010296561303062,
 0.4604772600164111,
 0.6441183381756939,
 0.6446546253987911,
 0.670747400418775,
 0.70582807616984,
 0.

In [313]:
log_loss(np.array([int(x) for x in y_test.tolist()]), y_pred[:,1])

0.5578217384180721

In [242]:
print(y_pred[:,1])

[0.71142923 0.47627297 0.59650062 ... 0.74526595 0.6494905  0.75188475]


In [311]:
clf = RandomForestClassifier(n_estimators=1000, max_depth=4, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)




In [312]:
y_pred

array([[0.32926748, 0.67073252],
       [0.25921133, 0.74078867],
       [0.25819912, 0.74180088],
       ...,
       [0.26027501, 0.73972499],
       [0.63699632, 0.36300368],
       [0.30119714, 0.69880286]])

In [253]:
print(f1_score(np.array([int(x) for x in y_test.tolist()]), y_pred[:,1]))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [238]:
import xgboost as xgb

In [333]:
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='reg:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) 
xgb_prediction = xgb_model.predict_proba(X_test)

In [334]:
xgb_prediction

array([[0.3198529 , 0.6801471 ],
       [0.46754056, 0.53245944],
       [0.95954734, 0.04045268],
       ...,
       [0.4798389 , 0.5201611 ],
       [0.25807273, 0.74192727],
       [0.37058812, 0.6294119 ]], dtype=float32)

In [245]:
log_loss(np.array([int(x) for x in y_test.tolist()]), np.array([int(x) for x in xgb_prediction.tolist()]))

8.592460726668211

In [335]:
log_loss(np.array([int(x) for x in y_test.tolist()]), xgb_prediction[:,1])

0.5264824890986215

In [246]:
list(y_test)

['0',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0'

In [247]:
list(xgb_prediction)

['0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1'

In [250]:
from sklearn.metrics import f1_score

In [262]:
print(f1_score(np.array([int(x) for x in y_test.tolist()]), np.array([int(x) for x in xgb_prediction.tolist()]) ))

0.8303439663980021


In [35]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

Using TensorFlow backend.


In [210]:
model = keras.Sequential([
    keras.layers.Dense(100, kernel_regularizer=keras.regularizers.l2(0.001),activation='linear'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(100, kernel_regularizer=keras.regularizers.l2(0.001),activation='linear'),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(2, activation=tf.nn.softmax)
    
])
    
model.compile(optimizer=tf.train.AdamOptimizer(), 
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])


history=model.fit(X_train, np.array(y_train), epochs=30, batch_size=1000, verbose=1)
#predictions
predictions = model.predict(X_test)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [66]:
predictions

array([[0.5169747 , 0.48302525],
       [0.28670162, 0.7132984 ],
       [0.3464067 , 0.65359336],
       ...,
       [0.37640372, 0.6235963 ],
       [0.7687476 , 0.23125242],
       [0.42432055, 0.5756794 ]], dtype=float32)

In [211]:
log_loss(np.array([int(x) for x in y_test.tolist()]), predictions[:,1])

0.6301935509182094

On enregistre le résultat

In [67]:
with open('submission_file_2.csv', 'w') as f:
	f.write('Id,Score\n')
	for i in range(predictions.shape[0]):
		f.write(str(i)+','+str(predictions[i][1])+'\n')

24029
