In [4]:
# %load model_weights_cv.py
#!/usr/bin/env python3
"""
Created on Sun May 14 17:08:27 2017

@author: meiyi
"""

import platform
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


def setPath():
    if platform.system() == 'Darwin':
        path_w2v = '/Volumes/MyPassport/kaggle_quora/w2v_pretrained/'
        path_data= '/Volumes/MyPassport/kaggle_quora/data/'
        path_feature = '/Volumes/MyPassport/kaggle_quora/features/'
 
        return path_w2v,path_data,path_feature 
    elif platform.system() == 'Windows':
        path_w2v = 'D:\\kaggle_quora\\w2v_pretrained\\'
        path_data= 'D:\\kaggle_quora\\data\\'
        path_feature = 'D:\\kaggle_quora\\features\\'
        return path_w2v,path_data,path_feature 
        
path_w2v,path_data,path_feature  = setPath()


# basic features ---- features engineering

test_data = pd.DataFrame()

for i in range(0,10):
    filename = 'test_'+str(i)+'_quora_features.pkl'
    data = pd.read_pickle(path_feature+filename)
    test_data = test_data.append(data)
    
    
train_data = pd.read_pickle(path_feature + 'train_quora_features.pkl')



train_porter_intersec = pd.DataFrame(pd.read_pickle(path_feature+'train_porter_interaction.pkl'),
                                     columns = ['porter_intersec'])
test_porter_intersec = pd.DataFrame(pd.read_pickle(path_feature+'test_porter_interaction.pkl'),
                                     columns = ['porter_intersec'])



train_intersec = pd.read_pickle(path_feature + 'train_intersect.pkl')
test_intersec = pd.read_pickle(path_feature + 'test_intersect.pkl')


train_angle = pd.DataFrame(pd.read_pickle(path_feature+'train_q1_q2_angle.pickle'),
                                     columns = ['angle'])

test_angle = pd.DataFrame(pd.read_pickle(path_feature+'test_q1_q2_angle.pickle'),columns = ['angle'])


# magic features 

train_comb = pd.read_pickle(path_feature+'magic_feature_train.pkl')
test_comb = pd.read_pickle(path_feature+'magic_feature_test.pkl')


# features stacking
 

train_data['weights']= [ np.random.uniform(0.2,0.21) if x == 1 else
                         np.random.uniform(0.8,0.81) for x in train_data['is_duplicate']]


train_features = pd.concat([train_data[train_data.columns.difference(['question1', 'question2'])],
                                       train_porter_intersec,
                                       train_intersec,
                                       train_angle,
                             train_comb[train_comb.columns.difference(['id','is_duplicate','q1_hash', 'q2_hash'])]], axis=1)
  
    

test_features = pd.concat([test_data[test_data.columns.difference(['question1', 'question2'])],
                                     test_porter_intersec,
                                     test_intersec,
                                     test_angle,
                            test_comb[test_comb.columns.difference(['q1_hash', 'q2_hash','id'])]],axis=1)


In [8]:

import xgboost as xgb
import pickle
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss


gamma_0 = 1.30905513329
gamma_1 = 0.472008228977

def link_function(x):
    return gamma_1*x/(gamma_1*x + gamma_0*(1 - x))

def link_function_rev(y):
    return -((gamma_0 * y) / (gamma_1 * y - gamma_1 - gamma_0 * y))

def get_mean_val(models):
    return np.mean([model.booster().best_score for model in models])

xgb_models = []
y_test = []
y_train = []

def run_classifier(test_full, train_full, K = 5, run_test_set = True):
    xgb_models = []
    predictors = test_full.columns[:]
    print(predictors)
    y_test = []
    y_train = []
    folder = StratifiedKFold(n_splits=K, shuffle=True)
    print('..1..\n')
    splits = folder.split(np.zeros(train_full.shape[0]), train_full['is_duplicate'])
    for ix_first, ix_second in tqdm_notebook(splits, total=K):
        #print (len(ix_first), len(ix_second), len(train_full))
        model = xgb.XGBClassifier(silent=True).set_params(**xgb_params)
        model = model.fit(train_full.loc[ix_first, predictors], train_full.loc[ix_first, 'is_duplicate'], 
                              eval_set=[(train_full.loc[ix_second, predictors], train_full.loc[ix_second, 'is_duplicate'])], 
                              eval_metric='logloss',
                              early_stopping_rounds=100, 
                              verbose=False)
        print('..2..\n')
        if run_test_set: 
            y_test.append(model.predict_proba(test_full[predictors])[:, 1])
        y_train.append(model.predict_proba(train_full[predictors])[:, 1])
        xgb_models.append(model)

    if run_test_set: 
        y_test_pred = np.array(y_test).T.mean(axis=1)
        y_test_pred_fixed = link_function(y_test_pred)
    y_train_pred = np.array(y_train).T.mean(axis=1)
    train_full['prediction'] = y_train_pred

    #make some keys for file save 
    print('..3..\n')
    lr = str(int(xgb_params['learning_rate'] * 100))
    val_score = str(int(get_mean_val(xgb_models) * 100000))
    sub_sample = str(int(xgb_params['subsample'] * 100))
    predictor_len = str(len(predictors))
    train_score = str(int(log_loss(train_full['is_duplicate'].values, train_full['prediction'].values) * 100000))

    print(predictor_len, K, sub_sample, train_score, val_score)
    key = predictor_len + '_cv_' + str(K) + '_lr_' + lr + '_sub_' + sub_sample + '_' + train_score + '_' + val_score

    if run_test_set:
        pred = pd.DataFrame()
        pred['test_id'] = test_comb['id']
        pred['is_duplicate'] = y_test_pred_fixed
        pred.to_csv(path_data + 'submits/submission_' + key + '.csv', index=False)
    
    print('..4..\n')
    train_full[['prediction','is_duplicate']].to_csv(path_data + 'submits/train_prediction_' + key + '.tsv', sep='\t', index=False)
    pickle.dump(xgb_models, open(path_data + 'submits/models_' + key + '.pkl', 'wb'))

    return xgb_models



xgb_params = {
    'max_depth':9, 
    'learning_rate': 0.05,
    'n_estimators': 2500, 
    'objective': 'binary:logistic',
    'nthread': 16, 
    'gamma': 0, 
    'subsample': 0.6, 
    'colsample_bytree': 0.6, 
    'colsample_bylevel': 1,
    'reg_alpha': 0, 
    'reg_lambda': 1, 
    'scale_pos_weight': 1
}

xgb_models = run_classifier(test_features, train_features, 5, True)


Index(['braycurtis_distance', 'canberra_distance', 'cityblock_distance',
       'common_words', 'cosine_distance', 'diff_len', 'euclidean_distance',
       'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
       'fuzz_partial_token_sort_ratio', 'fuzz_qratio', 'fuzz_token_set_ratio',
       'fuzz_token_sort_ratio', 'jaccard_distance', 'kur_q1vec', 'kur_q2vec',
       'len_char_q1', 'len_char_q2', 'len_q1', 'len_q2', 'len_word_q1',
       'len_word_q2', 'minkowski_distance', 'norm_wmd', 'skew_q1vec',
       'skew_q2vec', 'wmd', 'porter_intersec', 'q1_q2_intersect', 'angle',
       'q1_freq', 'q2_freq'],
      dtype='object')
..1..



..2..

..2..

..2..

..2..

..2..


..3..

33 5 60 18089 26009


FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/MyPassport/kaggle_quora/data/submits/submission_33_cv_5_lr_5_sub_60_18089_26009.csv'

In [7]:
test_comb['id']

0                0
1                1
2                2
3                3
4                4
5                5
6                6
7                7
8                8
9                9
10              10
11              11
12              12
13              13
14              14
15              15
16              16
17              17
18              18
19              19
20              20
21              21
22              22
23              23
24              24
25              25
26              26
27              27
28              28
29              29
            ...   
2345766    2345766
2345767    2345767
2345768    2345768
2345769    2345769
2345770    2345770
2345771    2345771
2345772    2345772
2345773    2345773
2345774    2345774
2345775    2345775
2345776    2345776
2345777    2345777
2345778    2345778
2345779    2345779
2345780    2345780
2345781    2345781
2345782    2345782
2345783    2345783
2345784    2345784
2345785    2345785
2345786    2345786
2345787    2