In [1]:
import pandas as pd
from helper import *
from gensim.models import word2vec
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
import math
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model
from sklearn.decomposition import PCA

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Zhijun\AppData\Local\Temp\jieba.cache
Loading model cost 1.115 seconds.
Prefix dict has been built succesfully.


In [2]:
# Load the three training sets (UTF-8 CSVs). The ff / mg / mf prefixes presumably
# match the test categories used later (Female Clothes / Mobile & Gadgets /
# Male Clothes) — confirm against the data files.
ff_train, mg_train, mf_train = (
    pd.read_csv(csv_name, encoding = 'utf-8')
    for csv_name in ('ff_train.csv', 'mg_train.csv', 'mf_train.csv')
)

## Step 1: Feature Engineering

#### 1.1  Create features indicating name entity of each keyword

In [3]:
# NOTE(review): this cell is commented-out reference code only. get_word_info
# still calls get_pos_info, so a live definition must come from elsewhere
# (presumably `from helper import *`) — confirm, otherwise that call raises NameError.
#def get_pos_info(df, key):
#    pos_list = []
#    n = 0
#    for name in df['Product_Name_s'].tolist():
#        n += 1
#        if key in name.strip().split():
#            pos_list.append((name.strip().split().index(key)+1)/len(name.strip().split()))
#    if len(pos_list) < 1:
#        pos_list = [0]
#    min_pos, max_pos, mean_pos, idf = min(pos_list), max(pos_list), sum(pos_list)/len(pos_list), math.log(n/(len(pos_list)+1))
#    return min_pos, max_pos, mean_pos, idf

In [3]:
def get_word_info(df, w2vmodel):
    """Collect per-keyword statistics for every key of a word2vec model.

    Parameters
    ----------
    df : passed unchanged to ``get_pos_info`` for each keyword.
    w2vmodel : identifier handed to ``read_word2vec``.

    Returns
    -------
    dict
        Maps each keyword to ``np.array((min_pos, max_pos, mean_pos, idf))``,
        the four values returned by ``get_pos_info(df, keyword)``.
    """
    # Only the key list is used here; the model object and vectors are ignored.
    _, vocabulary, _ = read_word2vec(w2vmodel)

    def stats_for(term):
        lowest, highest, average, inv_doc_freq = get_pos_info(df, term)
        return np.array((lowest, highest, average, inv_doc_freq))

    return {term: stats_for(term) for term in vocabulary}

In [4]:
# Keyword statistics per category, each computed with that category's word2vec model.
word_info_mg, word_info_ff, word_info_mf = (
    get_word_info(train_df, w2v_name)
    for train_df, w2v_name in (
        (mg_train, 'model_mg'),
        (ff_train, 'model_ff'),
        (mf_train, 'model_mf'),
    )
)

#### 1.2 Create features indicating the position of a keyword in a particular sentence  &
#### 1.3 Dimensionality reduction (PCA)


In [5]:
def construct_input_X(df, word2vec_file, word_info, pca = None, return_pca = False):
    """Build the model input matrix for a frame of (product name, query) rows.

    The result has one row per row of ``df`` and consists of the PCA-reduced
    concatenation of the product-name and query vectors followed by 14
    hand-crafted columns (lengths, keyword position / tf / idf values and
    ratios derived from them).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Product_Name_s' and 'Query_s'. The 14 feature columns are
        written onto ``df`` itself (in-place side effect, as before).
    word2vec_file : identifier handed to ``read_word2vec``.
    word_info : per-keyword statistics passed through to ``find_keyword_info``.
    pca : optional, default None
        An already-fitted reducer exposing ``transform``. When None (the
        original behaviour) a new ``PCA(n_components=200)`` is fitted on this
        frame. Pass the reducer fitted on the training data when building
        features for unseen data; otherwise the components are re-derived from
        the new rows and need not match what the models were trained on.
    return_pca : bool, default False
        When True, return ``(X, pca)`` so the fitted reducer can be reused.

    Returns
    -------
    pandas.DataFrame, or ``(DataFrame, pca)`` when ``return_pca`` is True.
    """
    model, keys, wordvector = read_word2vec(word2vec_file)

    df['product_length'] = df.apply(lambda row: len(split_words(row['Product_Name_s'], mode = 'simplified')), axis = 1)
    df['query_length'] = df.apply(lambda row: len(split_words(row['Query_s'], mode = 'simplified')), axis = 1)

    # One find_keyword_info lookup per field; the field name doubles as the column name.
    keyword_fields = ('min_pos', 'max_pos', 'mean_pos', 'tf', 'tmin_pos', 'tmax_pos', 'tmean_pos', 'idf')
    for field in keyword_fields:
        df[field] = df.apply(
            lambda row, field=field: find_keyword_info(row['Query_s'], row['Product_Name_s'], field, keys, word_info),
            axis = 1,
        )

    df['tfidf'] = df['tf'] * df['idf']
    # NOTE(review): a product_length of 0 would give inf/NaN here — confirm it cannot occur.
    df['min_pos_por'] = df['min_pos'] / df['product_length']
    df['max_pos_por'] = df['max_pos'] / df['product_length']
    df['mean_pos_por'] = df['mean_pos'] / df['product_length']

    # Materialise each text column once; calling .tolist() inside the loop
    # rebuilt the whole list for every row (quadratic in the number of rows).
    product_names = df['Product_Name_s'].tolist()
    queries = df['Query_s'].tolist()
    lines = [
        list(vectorizer(name, model, keys, wordvector)) + list(vectorizer(query, model, keys, wordvector))
        for name, query in zip(product_names, queries)
    ]

    embeddings = pd.DataFrame(lines)
    if pca is None:
        pca = PCA(n_components = 200)
        reduced = pca.fit_transform(embeddings)
    else:
        reduced = pca.transform(embeddings)
    X = pd.DataFrame(reduced)

    feature_columns = ['product_length', 'query_length', 'min_pos', 'max_pos', 'mean_pos','tf', 'tmin_pos', 'tmax_pos', 'tmean_pos', 'idf', 'tfidf','min_pos_por', 'max_pos_por', 'mean_pos_por']
    # Reset BOTH indexes: concat(axis=1) aligns on index labels, so a df whose
    # index is not 0..n-1 used to produce misaligned rows filled with NaN.
    X = pd.concat([X.reset_index(drop=True), df[feature_columns].reset_index(drop=True)], axis=1)

    if return_pca:
        return X, pca
    return X

In [6]:
def construct_input_y(df):
    """Return the boolean target array: True where 'num_of_clicks' is above zero.

    Parameters
    ----------
    df : frame with a 'num_of_clicks' column.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``df``.
    """
    was_clicked = df['num_of_clicks'] > 0
    return np.asarray(was_clicked).astype(bool)

In [7]:
# Feature matrix and click label for each category; the feature matrix is
# built first, then the label, in the order mg, ff, mf.
X_mg, y_mg = construct_input_X(mg_train, 'model_mg', word_info_mg), construct_input_y(mg_train)
X_ff, y_ff = construct_input_X(ff_train, 'model_ff', word_info_ff), construct_input_y(ff_train)
X_mf, y_mf = construct_input_X(mf_train, 'model_mf', word_info_mf), construct_input_y(mf_train)

In [8]:
# (rows, columns): 200 PCA components + 14 hand-crafted feature columns = 214.
X_mg.shape

(4169, 214)

## Step 2: Train supervised learning model
By default, both the LightGBM and SVM models are trained; for each model, the best set of hyper-parameters is selected by 5-fold cross-validated grid search, scored on F1.

In [9]:
def _run_grid_search(label, estimator, param_grid, X_train, y_train, scoring):
    """Grid-search ``estimator`` with 5-fold CV, print progress, return the best parameters.

    ``label`` is only used in the 'Start training ... model.' message.
    """
    print('Start training ' + label + ' model.')
    start = time.time()
    search = GridSearchCV(estimator, param_grid, cv=5, scoring=scoring)
    search.fit(X_train, y_train)
    print("Completed. Time_used: " + str(int(time.time()-start)) + 's, best score: ' + str(search.best_score_))
    return search.best_params_


def model_training(X, y, models = ['lgb', 'svm'], scoring = 'f1', random_state = 0):
    """Tune the requested classifiers by cross-validated grid search.

    The data is split with ``train_test_split(X, y, random_state=random_state)``
    and only the training part is searched; the validation part is not used here.

    Parameters
    ----------
    X, y : features and binary labels.
    models : iterable of str
        Any of 'lr', 'rf', 'lgb', 'svm'; unknown names are ignored. The default
        list is never mutated.
    scoring : scorer name forwarded to ``GridSearchCV``.
    random_state : seed used for the split and for every estimator.

    Returns
    -------
    dict
        Model name -> best parameter dict, for each requested model.
    """
    X_train, _, y_train, _ = train_test_split(X, y, random_state=random_state)

    opt_params = {}

    # Logistic regression. This branch used to grid-search linear_model.Lasso,
    # a regressor whose continuous predictions cannot be scored with a
    # classification metric such as F1, and stored the result under 'clf'.
    # It now tunes C of a real classifier and stores the result under 'lr'.
    if 'lr' in models:
        lr = LogisticRegression(random_state=random_state)
        param_grid_lr = {'C': [10.0**i for i in range(-2, 3)]}
        opt_params['lr'] = _run_grid_search('Logistics Regression', lr, param_grid_lr, X_train, y_train, scoring)

    # Random forest. 'auto' was dropped from max_features: for a classifier it
    # is the same setting as 'sqrt' (so it only duplicated work) and recent
    # scikit-learn releases reject it.
    if 'rf' in models:
        rf = RandomForestClassifier(random_state=random_state, n_jobs = -1)
        param_grid_rf = {
            'n_estimators': [200*i for i in range(1, 6)],
            'max_features': ['sqrt', 'log2'],
            'max_depth' : [10,15,20],
        }
        opt_params['rf'] = _run_grid_search('Random Forest', rf, param_grid_rf, X_train, y_train, scoring)

    # Blank separator in the log; printed whether or not 'rf' ran (kept as before).
    print("\n")

    # LightGBM
    if 'lgb' in models:
        lgb_m = lgb.LGBMClassifier(objective = 'binary', n_jobs = 1, random_state=random_state)
        param_grid_lgb = {
            'n_estimators': [200*i for i in range(1, 6)],
            'max_depth' : [5,6, 7, 8],
            'num_leaves': range(10,40,5)
        }
        opt_params['lgb'] = _run_grid_search('LightGBM', lgb_m, param_grid_lgb, X_train, y_train, scoring)

    print("\n")

    # RBF-kernel SVM; now seeded with the caller's random_state instead of a hard-coded 0.
    if 'svm' in models:
        svm = SVC(kernel='rbf', random_state=random_state)
        param_grid_svm = {'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1],
                         'C': [1, 10, 100, 1000, 10000]}
        opt_params['svm'] = _run_grid_search('SVM', svm, param_grid_svm, X_train, y_train, scoring)

    return opt_params


In [11]:
# Hyper-parameter search on the mg features. Slow: the recorded run took
# ~6143 s for LightGBM and ~623 s for SVM.
opt_params_mg = model_training(X_mg, y_mg)



Start training LightGBM model.
Completed. Time_used: 6143s, best score: 0.10758692045932304


Start training SVM model.
Completed. Time_used: 623s, best score: 0.268568358732892


In [12]:
# Hyper-parameter search on the ff features. Slow: the recorded run took
# ~5027 s for LightGBM and ~325 s for SVM.
opt_params_ff = model_training(X_ff, y_ff)



Start training LightGBM model.
Completed. Time_used: 5027s, best score: 0.08151179687028483


Start training SVM model.
Completed. Time_used: 325s, best score: 0.2703067085658053


In [13]:
# Hyper-parameter search on the mf features. Slow: the recorded run took
# ~5349 s for LightGBM and ~346 s for SVM.
opt_params_mf = model_training(X_mf, y_mf)



Start training LightGBM model.
Completed. Time_used: 5349s, best score: 0.18059824138179376


Start training SVM model.
Completed. Time_used: 346s, best score: 0.3106627538153659


In [14]:
# NOTE(review): the recorded output lists lgb max_depth=10, which is not in the
# current param_grid_lgb ([5, 6, 7, 8]) — the output appears to predate the
# current grid; re-run the search to refresh it.
opt_params_mf

{'lgb': {'max_depth': 10, 'n_estimators': 1000, 'num_leaves': 10},
 'svm': {'C': 10000, 'gamma': 0.001}}

In [15]:
# NOTE(review): the recorded output lists lgb max_depth=15, which is not in the
# current param_grid_lgb ([5, 6, 7, 8]) — the output appears to predate the
# current grid; re-run the search to refresh it.
opt_params_mg 

{'lgb': {'max_depth': 15, 'n_estimators': 400, 'num_leaves': 15},
 'svm': {'C': 10000, 'gamma': 0.0001}}

In [16]:
# NOTE(review): the recorded output lists lgb max_depth=10, which is not in the
# current param_grid_lgb ([5, 6, 7, 8]) — the output appears to predate the
# current grid; re-run the search to refresh it.
opt_params_ff

{'lgb': {'max_depth': 10, 'n_estimators': 800, 'num_leaves': 10},
 'svm': {'C': 10000, 'gamma': 0.001}}

## Step 3: Ensemble model and model measurement
Train the LightGBM and SVM models with their optimal parameters, combine the two classification models (a weighted sum of their predicted probabilities) for stability, and measure the performance of the combined model using F1, recall and precision.

In [17]:
def opt_model(X_train, y_train, params, random_state = None):
    """Fit the two ensemble members with their tuned hyper-parameters.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    params : dict
        ``params['lgb']`` must hold 'n_estimators', 'max_depth' and
        'num_leaves'; ``params['svm']`` must hold 'gamma' and 'C'.
    random_state : seed forwarded to both estimators.

    Returns
    -------
    tuple
        ``(opt_lgb, opt_svm)``, both already fitted. The SVM is created with
        ``probability=True`` so it exposes ``predict_proba``.
    """
    # Logistic-regression and random-forest members are disabled; only
    # LightGBM and the SVM are fitted and returned.

    # LightGBM
    opt_lgb = lgb.LGBMClassifier(
        objective = 'binary',
        n_jobs = 1,
        random_state = random_state,
        n_estimators = params['lgb']['n_estimators'],
        max_depth = params['lgb']['max_depth'],
        num_leaves = params['lgb']['num_leaves'],
    )
    opt_lgb.fit(X_train, y_train)

    # RBF-kernel SVM
    opt_svm = SVC(
        kernel = 'rbf',
        random_state = random_state,
        gamma = params['svm']['gamma'],
        C = params['svm']['C'],
        probability = True,
    )
    opt_svm.fit(X_train, y_train)

    return opt_lgb, opt_svm

In [18]:

def predict_prob(X_test, weights, opt_lgb, opt_svm):
    """Blend the two models' probabilities into one score per row.

    Parameters
    ----------
    X_test : feature matrix; ``X_test.shape[0]`` gives the number of rows scored.
    weights : dict with 'lgb' and 'svm' entries used as multipliers.
    opt_lgb, opt_svm : fitted models exposing ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        For every row, column 1 of the LightGBM probabilities times
        ``weights['lgb']`` plus column 1 of the SVM probabilities times
        ``weights['svm']``.
    """
    lgb_proba = opt_lgb.predict_proba(X_test)
    svm_proba = opt_svm.predict_proba(X_test)

    def blended_score(row):
        # Column 1 is presumably the positive class — confirm against classes_.
        return lgb_proba[row][1] * weights['lgb'] + svm_proba[row][1] * weights['svm']

    return np.array(list(map(blended_score, range(X_test.shape[0]))))

In [19]:
def model_measure(X, y, params, random_state, weights, threshold):
    """Fit the ensemble on a train split and print recall, precision and F1 on the rest.

    Parameters
    ----------
    X, y : features and boolean labels.
    params : tuned hyper-parameters, forwarded to ``opt_model``.
    random_state : seed for the split and both models.
    weights : ensemble weights, forwarded to ``predict_prob``.
    threshold : float
        Fraction of the validation rows, counted from the highest blended
        probability downwards, at which the cut-off value is read. Rows whose
        probability is strictly above that value are predicted positive, so
        0.9 marks roughly the top 90% of rows as positive.

    Returns
    -------
    None. The three metrics are printed.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=random_state)

    opt_lgb, opt_svm = opt_model(X_train, y_train, params, random_state = random_state)

    y_prob = predict_prob(X_valid, weights, opt_lgb, opt_svm)

    ranked = sorted(y_prob, reverse = True)
    # Clamp to the last position: threshold = 1.0 used to index one past the
    # end of the list and raise IndexError.
    cut_index = min(int(len(ranked) * threshold), len(ranked) - 1)
    threshold_value = ranked[cut_index]
    y_pred = np.where(y_prob > threshold_value, 1, 0).astype(bool)

    print('recall rate: ' + str(recall_score(y_valid, y_pred)))
    print('precision rate: ' + str(precision_score(y_valid, y_pred)))
    # The ': ' separator was missing, which printed e.g. 'f1_score0.366...'.
    print('f1_score: ' + str(f1_score(y_valid, y_pred)))
    



In [20]:
# Ensemble weights for the ff models. threshold = 0.9 makes model_measure read
# its cut-off 90% of the way down the descending probability ranking, so
# roughly the top 90% of validation rows are predicted positive — consistent
# with the high recall / low precision recorded for this cell.
weights_ff = {'lgb': 0.3, 'svm': 0.7}
%time model_measure(X_ff, y_ff, opt_params_ff, 0, weights_ff, threshold = 0.9)

recall rate: 0.9080459770114943
precision rate: 0.22931785195936139
f1_score0.36616454229432216
Wall time: 25.4 s


In [21]:
# Ensemble weights for the mg models. threshold = 0.9 makes model_measure read
# its cut-off 90% of the way down the descending probability ranking, so
# roughly the top 90% of validation rows are predicted positive — consistent
# with the high recall / low precision recorded for this cell.
weights_mg = {'lgb': 0.3, 'svm': 0.7}
%time model_measure(X_mg, y_mg, opt_params_mg, 0, weights_mg, threshold = 0.9)

recall rate: 0.9291666666666667
precision rate: 0.23773987206823027
f1_score0.37860780984719866
Wall time: 1min 12s


In [22]:
# Ensemble weights for the mf models. threshold = 0.9 makes model_measure read
# its cut-off 90% of the way down the descending probability ranking, so
# roughly the top 90% of validation rows are predicted positive — consistent
# with the high recall / low precision recorded for this cell.
weights_mf = {'lgb': 0.3, 'svm': 0.7}
%time model_measure(X_mf, y_mf, opt_params_mf, 0, weights_mf, threshold = 0.9)

recall rate: 0.914572864321608
precision rate: 0.2633863965267728
f1_score0.40898876404494383
Wall time: 23.2 s


## Step 4: Make prediction

In [23]:
# Read the test set and split it by category. reset_index() is called without
# drop=True, so each subset keeps its original row label in an 'index' column.
df = pd.read_csv('test.csv', encoding = 'utf-8')
ff_test, mg_test, mf_test = (
    df.loc[df['Category'].eq(category_name)].reset_index()
    for category_name in ('Female Clothes', 'Mobile & Gadgets', 'Male Clothes')
)

In [24]:
def make_prediction(df, word2vec_file, word_info, weights, opt_lgb, opt_svm, threshold = 3):
    """Rank every word of each product name by the ensemble's predicted probability.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Product Name'. Columns 'Product_Name_t' and
        'Product_Name_s' are added to it in place (as before).
    word2vec_file, word_info : forwarded to ``construct_input_X``.
    weights, opt_lgb, opt_svm : forwarded to ``predict_prob``.
    threshold : unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per distinct product name, in order of first appearance:
        column 0 is the product name, column 1 its de-duplicated keywords
        joined with ';', highest predicted probability first.

    Notes
    -----
    ``construct_input_X`` is called with its defaults, so its PCA is fitted on
    these rows rather than on the data the models were trained on — the
    projected features are therefore not guaranteed to match. NOTE(review):
    confirm whether this is intended.
    The ``read_word2vec`` call that used to open this function was removed:
    its result was never used and ``construct_input_X`` loads the model itself.
    """
    # processing text data
    df['Product_Name_t'] = df['Product Name'].apply(lambda name: pre_processing(name, mode = 'traditional'))
    df['Product_Name_s'] = df['Product Name'].apply(lambda name: pre_processing(name, mode = 'simplified'))

    # construct (title, keyword) pairs. Iterating the columns positionally
    # replaces df[col][i], which looked rows up by label and so only worked on
    # a 0..n-1 index.
    pairs = [
        [raw_name, word]
        for raw_name, processed_name in zip(df['Product Name'], df['Product_Name_t'])
        for word in processed_name.strip().split()
    ]

    # Naming the columns in the constructor also works when there are no pairs.
    df_test = pd.DataFrame(pairs, columns = ['Product Name', 'Query'])

    # data_prep is expected to add the 'Query_t' column read below (and
    # presumably the '*_s' columns construct_input_X needs) — confirm in helper.
    df_test = data_prep(df_test)

    # Vectorize text into word vector
    X = construct_input_X(df_test, word2vec_file, word_info)

    # predict probability based on ensemble model
    y_prob = predict_prob(X, weights, opt_lgb, opt_svm)

    df_test['predict_prob'] = y_prob

    # sort a list of keywords based on the predicted probability. A single
    # groupby pass replaces one full-frame boolean filter per product;
    # sort=False keeps products in order of first appearance.
    lines = []
    for product, rows in df_test.groupby('Product Name', sort = False):
        word_list = rows.sort_values(by = 'predict_prob', ascending = False)['Query_t'].tolist()
        # dict.fromkeys drops repeats while keeping the first (best-ranked) occurrence.
        text = ';'.join(dict.fromkeys(word_list))
        lines.append([product, text])

    res = pd.DataFrame(lines)

    return res

In [25]:
# Refit the mg ensemble on the same training split (random_state=0) that
# model_measure used, then rank keywords for the mg test products.
# NOTE(review): only the training part of the split is used for the final
# models; the held-out rows are not used for fitting.
X_train, X_valid, y_train, y_valid = train_test_split(X_mg, y_mg, random_state=0)
opt_lgb, opt_svm = opt_model(X_train, y_train, opt_params_mg, random_state = 0)
%time prediction = make_prediction(mg_test, 'model_mg', word_info_mg, weights_mg,opt_lgb, opt_svm)

Wall time: 32.6 s


In [144]:
# Save the mg ranking ('utf-8-sig' writes UTF-8 with a BOM).
# NOTE(review): `prediction` is rebound by every make_prediction cell, so this
# must run right after the mg prediction cell.
prediction.to_csv('prediction_mg2.csv', encoding = 'utf-8-sig')

In [26]:
# Refit the ff ensemble on the same training split (random_state=0) that
# model_measure used, then rank keywords for the ff test products.
# NOTE(review): only the training part of the split is used for the final
# models; the held-out rows are not used for fitting.
X_train, X_valid, y_train, y_valid = train_test_split(X_ff, y_ff, random_state=0)
opt_lgb, opt_svm = opt_model(X_train, y_train, opt_params_ff, random_state = 0)
%time prediction = make_prediction(ff_test, 'model_ff', word_info_ff, weights_ff,opt_lgb, opt_svm)

Wall time: 4min 17s


In [27]:
# Save the ff ranking ('utf-8-sig' writes UTF-8 with a BOM).
# NOTE(review): `prediction` is rebound by every make_prediction cell, so this
# must run right after the ff prediction cell.
prediction.to_csv('prediction_ff2.csv', encoding = 'utf-8-sig')

In [28]:
# Refit the mf ensemble on the same training split (random_state=0) that
# model_measure used, then rank keywords for the mf test products.
# NOTE(review): only the training part of the split is used for the final
# models; the held-out rows are not used for fitting.
X_train, X_valid, y_train, y_valid = train_test_split(X_mf, y_mf, random_state=0)
opt_lgb, opt_svm = opt_model(X_train, y_train, opt_params_mf, random_state = 0)
%time prediction = make_prediction(mf_test, 'model_mf', word_info_mf, weights_mf,opt_lgb, opt_svm)

Wall time: 3min 51s


In [29]:
# Save the mf ranking ('utf-8-sig' writes UTF-8 with a BOM).
# NOTE(review): `prediction` is rebound by every make_prediction cell, so this
# must run right after the mf prediction cell.
prediction.to_csv('prediction_mf3.csv', encoding = 'utf-8-sig')