In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk import word_tokenize, ngrams
from sklearn import ensemble
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import xgboost as xgb
from collections import Counter
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

eng_stopwords = set(stopwords.words('english'))
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'

  from numpy.core.umath_tests import inner1d


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alchemist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alchemist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Use XGBoost to test of the above features do increase accuracy

In [2]:
# Parameters:
# gpu_boost: If true, use GPU boost. Note: Using GPU boost may result in slight worse accuracy than using only CPU for calculation
# data_oversample: If true, make positive rate decrease from 0.369 to 0.17 (original test data has 0.17 positive rate), this should NOT make sense since we have used different test data and the positive rate in test data is the same as the training data
# objective_hinge: If true, make "accuracy" as the objective (achieve highest accuracy) and the output will be boolean values (0 or 1). If false, make "logloss" as the objective (achieve lowest logloss) and the output will be float values (between 0 and 1) and needs to be converted to boolean values (< 0.5 means 0 and 1 otherwise)
# output_csv: If true, output corresponding csv file. Note: May overwrite previous file, please double check before running

def xgb_model(train_df, test_df, data_clean_type, gpu_boost = True, data_oversample = False, objective_hinge = True, output_csv = True):
    if data_oversample: 
        # Rebalance the data to make it closer to the test set
        pos_boostrap_sample = train_df[train_df["is_duplicate"] == 0].sample(n = 500000, replace = True)
        rebalanced_df = pd.concat((pos_boostrap_sample, train_df))
        print("Positive rate: {}%".format(round(rebalanced_df['is_duplicate'].value_counts()[1] / (rebalanced_df['is_duplicate'].value_counts()[0] + rebalanced_df['is_duplicate'].value_counts()[1]), 3)))
        
        x_train = pd.DataFrame()
        x_test = pd.DataFrame()
        y_train = pd.DataFrame()
        y_test = pd.DataFrame()

        x_train['unigrams_common_count'] = rebalanced_df['unigrams_common_count']
        x_train['unigrams_common_ratio'] = rebalanced_df['unigrams_common_ratio']
        x_train['q1_q2_intersect'] = rebalanced_df['q1_q2_intersect']
        x_train['q1_freq'] = rebalanced_df['q1_freq']
        x_train['q2_freq'] = rebalanced_df['q2_freq']
        x_train['word_match'] = rebalanced_df['word_match']
        x_train['tfidf_word_match'] = rebalanced_df['tfidf_word_match']

        x_test['unigrams_common_count'] = test_df['unigrams_common_count']
        x_test['unigrams_common_ratio'] = test_df['unigrams_common_ratio']
        x_test['q1_q2_intersect'] = test_df['q1_q2_intersect']
        x_test['q1_freq'] = test_df['q1_freq']
        x_test['q2_freq'] = test_df['q2_freq']
        x_test['word_match'] = test_df['word_match']
        x_test['tfidf_word_match'] = test_df['tfidf_word_match']

        y_train['is_duplicate'] = rebalanced_df['is_duplicate']
        y_test['is_duplicate (Ture Value)'] = test_df['is_duplicate (Ture Value)']
        
    else:
        print("Positive rate: {}%".format(round(train_df['is_duplicate'].value_counts()[1] / (train_df['is_duplicate'].value_counts()[0] + train_df['is_duplicate'].value_counts()[1]), 3)))
        x_train = pd.DataFrame()
        x_test = pd.DataFrame()
        y_train = pd.DataFrame()
        y_test = pd.DataFrame()

        x_train['unigrams_common_count'] = train_df['unigrams_common_count']
        x_train['unigrams_common_ratio'] = train_df['unigrams_common_ratio']
        x_train['q1_q2_intersect'] = train_df['q1_q2_intersect']
        x_train['q1_freq'] = train_df['q1_freq']
        x_train['q2_freq'] = train_df['q2_freq']
        x_train['word_match'] = train_df['word_match']
        x_train['tfidf_word_match'] = train_df['tfidf_word_match']

        x_test['unigrams_common_count'] = test_df['unigrams_common_count']
        x_test['unigrams_common_ratio'] = test_df['unigrams_common_ratio']
        x_test['q1_q2_intersect'] = test_df['q1_q2_intersect']
        x_test['q1_freq'] = test_df['q1_freq']
        x_test['q2_freq'] = test_df['q2_freq']
        x_test['word_match'] = test_df['word_match']
        x_test['tfidf_word_match'] = test_df['tfidf_word_match']

        y_train['is_duplicate'] = train_df['is_duplicate']
        y_test['is_duplicate (Ture Value)'] = test_df['is_duplicate (Ture Value)']
    
    # Finally, we split some of the data off for validation
    x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_valid, label=y_valid)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    
    params = {}
    
    # The following parameters may be fine-tuned in the future
    #params["eta"] = 0.02
    #params["gamma"] = 0
    #params["max_depth"] = 6
    #params["min_child_weight"] = 4
    #params["max_bin"] = 256
    #params["subsample"] = 0.8
    #params["colsample_bytree"] = 0.9

    if objective_hinge:
        params['objective'] = 'binary:hinge'
        params['eval_metric'] = 'error'
    else:
        params["objective"] = "binary:logistic"
        params["eval_metric"] = "logloss"

    if gpu_boost:
        params["tree_method"] = "gpu_hist"
        bst = xgb.train(params, d_train, 100000, watchlist, early_stopping_rounds=2000, verbose_eval=10)
    else:
        bst = xgb.train(params, d_train, 100000, watchlist, early_stopping_rounds=200, verbose_eval=10)
        
    d_test = xgb.DMatrix(x_test)
    p_test = bst.predict(d_test)
    
    if not objective_hinge:
        # Convert percentage to binary predictions
        result = []
        for i in p_test:
            if i < 0.5:
                result.append(0)
            else:
                result.append(1)
        result = np.array(result)
        sub = pd.DataFrame()
        sub['test_id'] = test_df['test_id']
        sub['is_duplicate'] = result
         
        # Get the accuracy on the test data
        true_values = test_df["is_duplicate (Ture Value)"]

        score = 0
        for i in range(0, len(result)):
            if result[i] == true_values.tolist()[i]:
                score = score + 1
        accuracy = score / len(result)
        print("Accuracy on test data: {}%".format(round(accuracy*100, 3)))
    else:
        sub = pd.DataFrame()
        sub['test_id'] = test_df['test_id']
        sub['is_duplicate'] = p_test
        
        # Get the accuracy on the test data
        true_values = test_df["is_duplicate (Ture Value)"]

        score = 0
        for i in range(0, len(p_test)):
            if p_test[i] == true_values.tolist()[i]:
                score = score + 1
        accuracy = score / len(p_test)
        print("Accuracy on test data: {}%".format(round(accuracy*100, 3)))
    
    file_name = 'xgb_with_features' + data_clean_type
    if (gpu_boost):
        file_name = file_name + '(gpu_boost)'
    if (data_oversample):
        file_name = file_name + '(data_oversample)'
    if (objective_hinge):
        file_name = file_name + '(objective_accuracy)'
    else:
        file_name = file_name + '(objective_logloss)'
    
    if output_csv:
        sub.to_csv(file_name + '.csv', index=False)
    
   
    

# Load features

In [13]:
# Supported data_clean_type (DO NOT forget to put on "()", also, if data_clean_type is NOT empty string, please put a " " before (cleaned)):
# empty string, no character
# (cleaned)
# (cleaned)(hyper_cleaned)
# (cleaned)(hyper_cleaned)(punctuation_removed)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(words_shortened)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)
# (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)(words_shortened)
data_clean_type = " (cleaned)(hyper_cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)(words_shortened)"
train_df = pd.read_csv("train_with_features" + data_clean_type + ".csv")
test_df = pd.read_csv("test_with_features" + data_clean_type + ".csv")

xgb_model(train_df, test_df, data_clean_type)

Positive rate: 0.369%
[0]	train-error:0.63048	valid-error:0.63099
[10]	train-error:0.13652	valid-error:0.13686
[20]	train-error:0.13363	valid-error:0.13464
[30]	train-error:0.13273	valid-error:0.13342
[40]	train-error:0.13202	valid-error:0.13274
[50]	train-error:0.13177	valid-error:0.13279
[60]	train-error:0.13154	valid-error:0.13224
[70]	train-error:0.13138	valid-error:0.13235
[80]	train-error:0.13130	valid-error:0.13229
[90]	train-error:0.13100	valid-error:0.13215
[100]	train-error:0.13092	valid-error:0.13203
[110]	train-error:0.13072	valid-error:0.13180
[120]	train-error:0.13077	valid-error:0.13176
[130]	train-error:0.13048	valid-error:0.13149
[140]	train-error:0.13045	valid-error:0.13167
[150]	train-error:0.13044	valid-error:0.13150
[160]	train-error:0.13037	valid-error:0.13142
[170]	train-error:0.13029	valid-error:0.13156
[180]	train-error:0.13021	valid-error:0.13154
[190]	train-error:0.13020	valid-error:0.13150
[200]	train-error:0.13017	valid-error:0.13161
[210]	train-error:0.130

[1770]	train-error:0.12499	valid-error:0.13100
[1780]	train-error:0.12500	valid-error:0.13103
[1790]	train-error:0.12497	valid-error:0.13126
[1800]	train-error:0.12490	valid-error:0.13103
[1810]	train-error:0.12489	valid-error:0.13090
[1820]	train-error:0.12490	valid-error:0.13099
[1830]	train-error:0.12494	valid-error:0.13103
[1840]	train-error:0.12490	valid-error:0.13096
[1850]	train-error:0.12491	valid-error:0.13093
[1860]	train-error:0.12485	valid-error:0.13100
[1870]	train-error:0.12484	valid-error:0.13099
[1880]	train-error:0.12483	valid-error:0.13100
[1890]	train-error:0.12478	valid-error:0.13103
[1900]	train-error:0.12476	valid-error:0.13104
[1910]	train-error:0.12477	valid-error:0.13094
[1920]	train-error:0.12476	valid-error:0.13086
[1930]	train-error:0.12465	valid-error:0.13076
[1940]	train-error:0.12462	valid-error:0.13082
[1950]	train-error:0.12460	valid-error:0.13086
[1960]	train-error:0.12460	valid-error:0.13096
[1970]	train-error:0.12460	valid-error:0.13107
[1980]	train-