In [1]:
import numpy as np
import pandas as pd

In [2]:
param = {}
param['learning_rate'] = 0.14
param['boosting)type'] = 'dart'
param['objective'] = 'binary'
param['metric'] = 'binary_logloss'
param['sub_feature'] = 0.5
param['num_leaves'] = 512
param['min_data'] = 50
param['min_hessian'] = 1

In [3]:
df_train = pd.read_csv('./data/kaggle/train.csv')
df_test = pd.read_csv('./data/kaggle/test.csv')

In [4]:
print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))

Original data: X_train: (404290, 6), X_test: (2345796, 3)


In [5]:
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
df_test.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [7]:
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)

In [8]:
words = (" ".join(train_qs)).lower().split()

In [9]:

from collections import Counter
counts = Counter(words)

In [10]:
def get_weight(count, eps = 10000, min_count=2):
    return 0 if count < min_count else 1/(count+eps)
weights = {word: get_weight(count) for word, count in counts.items()}

In [11]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [12]:
df = pd.concat([df_train, df_test])

In [13]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,test_id
0,0.0,1.0,2.0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0,
1,1.0,3.0,4.0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0,
2,2.0,5.0,6.0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0,
3,3.0,7.0,8.0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0,
4,4.0,9.0,10.0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0,


# Tạo các đặc trưng 

In [14]:
def word_shares(row):
    # Lấy danh sách các từ trong q1 khác stop word
    q1_list = str(row['question1']).lower().split()
    q1 = set(q1_list)
    q1words = q1.difference(stops)
    if len(q1words) == 0:
        return '0:0:0:0:0:0:0:0'
    
    # Lấy danh sách các từ trong q2 khác stop word
    q2_list = str(row['question2']).lower().split()
    q2 = set(q2_list)
    q2words = q2.difference(stops)
    if len(q2words) == 0:
        return '0:0:0:0:0:0:0:0'

    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))

    # Trả về các từ chung giữa qi và stops
    q1stops = q1.intersection(stops)
    q2stops = q2.intersection(stops)

    # Trả về các cặp 2gram của câu hỏi
    q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])
    q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])

    # Các cặp 2gram chung giữa 2 cây hỏi
    shared_2gram = q1_2gram.intersection(q2_2gram)

    # Các từ chung giữa 2 câu hỏi (Đã loại bỏ stop word)
    shared_words = q1words.intersection(q2words)

    # weight là dict chứa từ và trọng số của từ
    # Lấy vector chứa các trọng số của từ chung giữa 2 câu
    shared_weights = [weights.get(w, 0) for w in shared_words]
    # Lấy vector chứa các trọng số của từng câu
    q1_weights = [weights.get(w, 0) for w in q1words]
    q2_weights = [weights.get(w, 0) for w in q2words]
    # Nối 2 list lại
    total_weights = q1_weights + q2_weights

    # Tính toán các đặc trưng
    R1 = np.sum(shared_weights)/np.sum(total_weights) # TF-IDF share
    R2 = len(shared_words)/(len(q1words)+len(q2words)-len(shared_words)) # count share
    R31 = len(q1stops) / len(q1words) #stops in q1
    R32 = len(q2stops) / len(q2words) #stops in q2

    Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))
    Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator # Khoảng cách consine

    if len(q1_2gram) + len(q2_2gram) == 0:
        R2gram = 0
    else:
        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)


In [15]:
df['word_shares'] = df.apply(word_shares, axis=1)



In [16]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,test_id,word_shares
0,0.0,1.0,2.0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.0,,0.386082181176695:0.5714285714285714:4:1.0:1.2...
1,1.0,3.0,4.0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.0,,0.1808787650162546:0.18181818181818182:2:1.0:0...
2,2.0,5.0,6.0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.0,,0.17759555605674818:0.2222222222222222:2:1.333...
3,3.0,7.0,8.0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.0,,0.0:0.0:0:1.5:0.8:0.0:0.0:0.0
4,4.0,9.0,10.0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.0,,0.0:0.0:0:0.3:0.4:0.0:0.0:0.07692307692307693


# Chuẩn bị đặc trưng

In [17]:
x = pd.DataFrame()

x['word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
x['word_match_2root'] = np.sqrt(x['word_match'])
x['tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
x['shared_count'] = df['word_shares'].apply(lambda x: float(x.split(':')[2]))

x['stops1_ratio'] = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
x['stops2_ratio'] = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
x['shared_2gram'] = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
x['cosine'] = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
x['words_hamming'] = df['word_shares'].apply(lambda x: float(x.split(':')[7]))

x['diff_stops_r'] = x['stops1_ratio'] - x['stops2_ratio']

In [18]:
x.head()

Unnamed: 0,word_match,word_match_2root,tfidf_word_match,shared_count,stops1_ratio,stops2_ratio,shared_2gram,cosine,words_hamming,diff_stops_r
0,0.386082,0.621355,0.571429,4.0,1.0,1.2,0.416667,0.795192,0.785714,-0.2
1,0.180879,0.425298,0.181818,2.0,1.0,0.333333,0.052632,0.410927,0.076923,0.666667
2,0.177596,0.421421,0.222222,2.0,1.333333,1.0,0.045455,0.340883,0.142857,0.333333
3,0.0,0.0,0.0,0.0,1.5,0.8,0.0,0.0,0.0,0.7
4,0.0,0.0,0.0,0.0,0.3,0.4,0.0,0.0,0.076923,-0.1


In [19]:
x['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
x['len_q2'] = df['question2'].apply(lambda x: len(str(x)))
x['diff_len'] = x['len_q1'] - x['len_q2']

In [20]:
x['caps_count_q1'] = df['question1'].apply(lambda x: sum(1 for i in str(x) if i.isupper()))
x['caps_count_q2'] = df['question2'].apply(lambda x: sum(1 for i in str(x) if i.isupper()))
x['diff_caps'] = x['caps_count_q1'] - x['caps_count_q2']

In [21]:
# Đếm xem mỗi câu có bao nhiêu ký tự khác trắng
x['len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ','')))
x['len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ','')))
x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

In [22]:
# Đếm số từ ở mỗi câu
x['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
x['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

In [23]:
# Tỉ lệ ký tự trên từ
x['avg_world_len1'] = x['len_char_q1']/x['len_word_q1']
x['avg_world_len2'] = x['len_char_q2']/x['len_word_q2']
x['diff_avg_word'] = x['avg_world_len1'] = x['avg_world_len2']

In [24]:
x['exactly_same'] = (df['question1'] == df['question2']).astype(int) # giống nhau hoàn toàn hay không
x['duplicated'] = df.duplicated(['question1', 'question2']).astype(int)

In [25]:
x.head()

Unnamed: 0,word_match,word_match_2root,tfidf_word_match,shared_count,stops1_ratio,stops2_ratio,shared_2gram,cosine,words_hamming,diff_stops_r,...,len_char_q2,diff_len_char,len_word_q1,len_word_q2,diff_len_word,avg_world_len1,avg_world_len2,diff_avg_word,exactly_same,duplicated
0,0.386082,0.621355,0.571429,4.0,1.0,1.2,0.416667,0.795192,0.785714,-0.2,...,46,7,14,12,2,3.833333,3.833333,3.833333,0,0
1,0.180879,0.425298,0.181818,2.0,1.0,0.333333,0.052632,0.410927,0.076923,0.666667,...,76,-32,8,13,-5,5.846154,5.846154,5.846154,0,0
2,0.177596,0.421421,0.222222,2.0,1.333333,1.0,0.045455,0.340883,0.142857,0.333333,...,50,10,14,10,4,5.0,5.0,5.0,0,0
3,0.0,0.0,0.0,0.0,1.5,0.8,0.0,0.0,0.0,0.7,...,57,-17,11,9,2,6.333333,6.333333,6.333333,0,0
4,0.0,0.0,0.0,0.0,0.3,0.4,0.0,0.0,0.076923,-0.1,...,33,31,13,7,6,4.714286,4.714286,4.714286,0,0


In [26]:
# Đếm số lần xuất hiện của từ word trong câu
def add_word_count(x, df, word):
    x['q1_' + word] = df['question1'].apply(lambda x: (word in str(x).lower())*1)
    x['q2_' + word] = df['question2'].apply(lambda x: (word in str(x).lower())*1)
    x[word + '_both'] = x['q1_' + word] + x['q2_' + word]

In [27]:
add_word_count(x, df,'how')
add_word_count(x, df,'what')
add_word_count(x, df,'which')
add_word_count(x, df,'who')
add_word_count(x, df,'where')
add_word_count(x, df,'when')
add_word_count(x, df,'why')

In [28]:
x.columns

Index(['word_match', 'word_match_2root', 'tfidf_word_match', 'shared_count',
       'stops1_ratio', 'stops2_ratio', 'shared_2gram', 'cosine',
       'words_hamming', 'diff_stops_r', 'len_q1', 'len_q2', 'diff_len',
       'caps_count_q1', 'caps_count_q2', 'diff_caps', 'len_char_q1',
       'len_char_q2', 'diff_len_char', 'len_word_q1', 'len_word_q2',
       'diff_len_word', 'avg_world_len1', 'avg_world_len2', 'diff_avg_word',
       'exactly_same', 'duplicated', 'q1_how', 'q2_how', 'how_both', 'q1_what',
       'q2_what', 'what_both', 'q1_which', 'q2_which', 'which_both', 'q1_who',
       'q2_who', 'who_both', 'q1_where', 'q2_where', 'where_both', 'q1_when',
       'q2_when', 'when_both', 'q1_why', 'q2_why', 'why_both'],
      dtype='object')

In [29]:
x.describe()

Unnamed: 0,word_match,word_match_2root,tfidf_word_match,shared_count,stops1_ratio,stops2_ratio,shared_2gram,cosine,words_hamming,diff_stops_r,...,who_both,q1_where,q2_where,where_both,q1_when,q2_when,when_both,q1_why,q2_why,why_both
count,2749809.0,2749809.0,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0,2739171.0,2750086.0,2750086.0,...,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0,2750086.0
mean,0.1473579,0.316405,0.2019946,1.600848,0.9544433,0.9587843,0.07278536,0.2970622,0.1303406,-0.004340919,...,0.06825677,0.02500795,0.02491122,0.04991917,0.0327357,0.03223208,0.06496779,0.09631372,0.09600136,0.1923151
std,0.1282001,0.2173611,0.2017327,1.54397,0.5095573,0.5118105,0.09986519,0.2630138,0.1996098,0.6139485,...,0.2914741,0.1561491,0.1558546,0.2489788,0.1779441,0.1766159,0.2759281,0.295021,0.2945932,0.4697682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.625,0.6363636,0.0,0.0,0.0,-0.325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.1340544,0.3661345,0.1666667,1.0,0.8571429,0.8571429,0.03333333,0.2676487,0.02702703,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.2319422,0.4816038,0.3,2.0,1.2,1.2,0.1111111,0.4735937,0.1818182,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.5,0.7071068,1.0,32.0,10.0,9.0,0.5,1.0,1.0,8.4,...,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0


In [30]:
feature_names = list(x.columns.values)
print("Features: {}".format(feature_names))

Features: ['word_match', 'word_match_2root', 'tfidf_word_match', 'shared_count', 'stops1_ratio', 'stops2_ratio', 'shared_2gram', 'cosine', 'words_hamming', 'diff_stops_r', 'len_q1', 'len_q2', 'diff_len', 'caps_count_q1', 'caps_count_q2', 'diff_caps', 'len_char_q1', 'len_char_q2', 'diff_len_char', 'len_word_q1', 'len_word_q2', 'diff_len_word', 'avg_world_len1', 'avg_world_len2', 'diff_avg_word', 'exactly_same', 'duplicated', 'q1_how', 'q2_how', 'how_both', 'q1_what', 'q2_what', 'what_both', 'q1_which', 'q2_which', 'which_both', 'q1_who', 'q2_who', 'who_both', 'q1_where', 'q2_where', 'where_both', 'q1_when', 'q2_when', 'when_both', 'q1_why', 'q2_why', 'why_both']


In [31]:
x_train = x[:df_train.shape[0]]
x_test  = x[df_train.shape[0]:]
y_train = df_train['is_duplicate'].values

In [32]:
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

In [33]:
print("pos: {}, neg: {}".format(len(pos_train), len(neg_train)))

pos: 149263, neg: 255027


### Sử dụng oversampling cho negative class có vẻ tốt hơn

In [34]:
# oversample the negative class
# print("Oversampling started for proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))
# p = 0.165
# scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
# while scale > 1:
#     neg_train = pd.concat([neg_train, neg_train])
#     scale -=1
# neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
# print("Oversampling done, new proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))

# x_train = pd.concat([pos_train, neg_train])
# y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()

In [35]:
# print("pos: {}, neg: {}".format(len(pos_train), len(neg_train)))

In [36]:
 print("Training data: X_train: {}, Y_train: {}, X_test: {}".format(x_train.shape, len(y_train), x_test.shape))

Training data: X_train: (404290, 48), Y_train: 404290, X_test: (2345796, 48)


# Train model

In [37]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
RS = 123457
ROUNDS = 190
def train_xgb(X, y, params):
    print("Will train XGB for {} rounds, RandomSeed: {}".format(ROUNDS, RS))
    x, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RS)
    xg_train = lgb.Dataset(x, label=y_train)
    xg_val = lgb.Dataset(X_val, label=y_val)
    watchlist  = [xg_val]
    clf = lgb.train(params, xg_train, ROUNDS, watchlist)
    return clf

def predict_xgb(clr, X_test):
    return clr.predict(X_test)

clr = train_xgb(x_train.fillna(0), y_train, param)
preds = predict_xgb(clr, x_test.fillna(0))

Will train XGB for 190 rounds, RandomSeed: 123457
[LightGBM] [Info] Number of positive: 119164, number of negative: 204268
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4639
[LightGBM] [Info] Number of data points in the train set: 323432, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.368436 -> initscore=-0.538932
[LightGBM] [Info] Start training from score -0.538932
[1]	valid_0's binary_logloss: 0.61244
[2]	valid_0's binary_logloss: 0.580625
[3]	valid_0's binary_logloss: 0.551635
[4]	valid_0's binary_logloss: 0.52903
[5]	valid_0's binary_logloss: 0.509478
[6]	valid_0's binary_logloss: 0.495775
[7]	valid_0's binary_logloss: 0.482306
[8]	valid_0's binary_logloss: 0.470577
[9]	valid_0's binary_logloss: 0.462051
[10]	valid_0's binary_logloss: 0.453985
[11]	valid_0's binary_logloss: 0.447038
[12]	valid_0's binary_logloss: 0.441683
[13]	valid_0's binary_lo

In [38]:
# print("Writing output...")
# sub = pd.DataFrame()
# sub['test_id'] = df_test['test_id']
# sub['is_duplicate'] = preds *.75
# sub.to_csv("lgb_seed{}_n{}.csv".format(RS, ROUNDS), index=False)