In [None]:
import pandas as pd
import numpy as np


# Visualization
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline


# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer 
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from gensim.models import word2vec


# Evaluation
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import cross_val_score


# Utility
import os
import time
import random
from IPython.display import Image
import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
import warnings

# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings(action='ignore')

# Seed both random number generators used below: NumPy's global generator and the
# standard-library `random` module (oversample() shuffles with `random.shuffle`,
# so seeding NumPy alone leaves the Word2Vec input different on every run).
SEED = 123
np.random.seed(SEED)
random.seed(SEED)

### Data Preprocessing

In [None]:
# Load the four input CSV files.
train = pd.read_csv('tr_train.csv', encoding='utf-8')
test = pd.read_csv('tr_test.csv', encoding='utf-8')
cust_train = pd.read_csv('cust_train.csv', encoding='utf-8')
cust_test = pd.read_csv('cust_test.csv', encoding='utf-8')


def _parse_int(value):
    """Return ``value`` as an int after stripping thousands separators (',').

    Going through ``str`` first lets the same helper handle both comma-formatted
    strings and the plain integer 0 that ``fillna(0)`` puts into a text column.
    """
    return int(str(value).replace(',', ''))


# Apply exactly the same cleaning to the train and test frames.
for frame in (train, test):
    # Purchase amount / count: remove ',' and cast to int.
    for col in ('PD_BUY_AM', 'PD_BUY_CT'):
        frame[col] = frame[col].map(_parse_int)

    # Missing page-view counts become 0. Assign the result back instead of calling
    # fillna(inplace=True) on a column selection, which is not guaranteed to write
    # through to the parent frame.
    frame['TOT_PAG_VIEW_CT'] = frame['TOT_PAG_VIEW_CT'].fillna(0).apply(int)

    # Missing session values become 0; the filled 0 is an int (not a str), which is
    # why _parse_int converts through str() before replacing ','.
    frame['TOT_SESS_HR_V'] = frame['TOT_SESS_HR_V'].fillna(0).map(_parse_int)

    # SESS_DT is parsed with the YYYYMMDD format into a datetime column.
    frame['date'] = pd.to_datetime(frame['SESS_DT'], format='%Y%m%d')

# Registries filled by add_feature(): the feature frames, their selection flags
# and their names, kept position-aligned with one another.
features_train = []
features_test = []
train_add = []
test_add = []
indices = []

In [None]:
def add_feature(train, test, name):
    """Register one feature under ``name`` in the notebook-level registries.

    ``train`` and ``test`` are appended to ``features_train`` / ``features_test``,
    both selection flags (``train_add`` / ``test_add``) get a ``True`` entry, and
    ``name`` is appended to ``indices`` — so all five lists stay position-aligned.
    """
    for registry, entry in (
        (features_train, train),
        (features_test, test),
        (train_add, True),
        (test_add, True),
        (indices, name),
    ):
        registry.append(entry)

def oversample(x, n):
    """Concatenate ``n`` random shuffles of every sequence in ``x``.

    Parameters
    ----------
    x : iterable of sequences
        One sequence of tokens per entity.
    n : int
        Number of shuffled copies concatenated per sequence.

    Returns
    -------
    list of list
        One list per input sequence, of length ``n * len(sequence)``.

    Notes
    -----
    Each sequence is copied before shuffling, so the caller's data keeps its
    original order and immutable inputs (e.g. tuples) are accepted. Shuffling uses
    the standard-library ``random`` module; call ``random.seed`` for repeatable
    output.
    """
    lst = []
    for seq in x:
        pool = list(seq)  # private copy: never reorder the caller's sequence
        repeated = []
        for _ in range(n):
            random.shuffle(pool)
            repeated.extend(pool)
        lst.append(repeated)
    return lst

def feature_vector(data, w2v):
    """Average the embedding vectors of the words in each item of ``data``.

    Parameters
    ----------
    data : iterable of iterables
        One collection of words per row of the result.
    w2v : object
        Either a model exposing its vectors as ``w2v.wv`` or any mapping-like
        object supporting ``w2v[word]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), dim)``. ``dim`` is the lookup's
        ``vector_size`` attribute when it has one, otherwise 30. Rows with no
        known word are all zeros.

    Notes
    -----
    The sum is divided by the number of found words once, after the inner loop;
    dividing inside the loop rescales the running total at every step and does
    not give the mean for three or more words. Words that cannot be looked up
    (missing key, or a non-string token such as NaN) are skipped; any other error
    is allowed to propagate instead of being silently swallowed.
    """
    vectors = getattr(w2v, 'wv', w2v)
    dim = getattr(vectors, 'vector_size', 30)

    mean_vector = []
    for words in tqdm(data):
        total = np.zeros(dim)
        cnt = 0
        for word in words:
            try:
                total += vectors[word]
            except (KeyError, TypeError):
                continue
            cnt += 1
        # Guard against 0/0 when none of the words has a vector.
        mean_vector.append(total / cnt if cnt else total)
    return np.array(mean_vector, dtype=float).reshape(-1, dim)

def w2v_features(train_data, test_data):
    """Fit a Word2Vec model on ``train_data`` and embed both data sets with it.

    The training sentences are ``oversample(train_data, 5)``; the model is a
    skip-gram (``sg = 1``) with 30-dimensional vectors, window 3 and
    ``min_count = 1``. Returns ``(train_vectors, test_vectors)`` as produced by
    ``feature_vector``.

    NOTE(review): ``size=`` looks like the pre-4.0 gensim keyword name — confirm
    against the gensim version this notebook is pinned to.
    """
    sentences = oversample(train_data, 5)
    model = word2vec.Word2Vec(sentences=sentences, size=30, window=3, min_count=1, sg=1)
    train_vectors = feature_vector(train_data, model)
    test_vectors = feature_vector(test_data, model)
    return train_vectors, test_vectors

### Word2Vec

In [None]:
w2v_columns = ['CLAC3_NM','CLAC2_NM','PD_BRA_NM','KWD_NM']


def _w2v_frame(client_ids, vectors, prefix):
    """Return a frame with a CLNT_ID column followed by uniquely named vector columns.

    Column names are ``<prefix>_<i>`` so that several Word2Vec feature frames can
    be merged on CLNT_ID without their columns colliding.
    """
    values = pd.DataFrame(vectors)
    values.columns = ['{}_{}'.format(prefix, i) for i in range(values.shape[1])]
    ids = pd.DataFrame({'CLNT_ID': list(client_ids)})
    return pd.concat([ids, values], axis=1)


for col in w2v_columns:
    # One array of distinct values per customer; the groupby index holds the
    # CLNT_ID that each row of the resulting vectors belongs to.
    train_groups = train.groupby('CLNT_ID')[col].unique()
    test_groups = test.groupby('CLNT_ID')[col].unique()
    train_data = list(train_groups)
    test_data = list(test_groups)

    train_mean_vector, test_mean_vector = w2v_features(train_data, test_data)

    # Take the IDs from the groupby keys rather than from the row order of
    # cust_train / cust_test, so every vector is paired with the customer it was
    # computed for even if those tables are ordered differently.
    ftr = _w2v_frame(train_groups.index, train_mean_vector, 'w2v_' + col)
    fte = _w2v_frame(test_groups.index, test_mean_vector, 'w2v_' + col)
    add_feature(ftr, fte, 'w2v_'+col)

### Feature Engineering

In [None]:
# Selection table: one row per registered feature with its name and the two
# flags saying whether it is used for the train and for the test matrix.
final = pd.DataFrame({
    'feature': indices,
    'train': train_add,
    'test': test_add,
})

In [None]:
# Switch every flag column (everything after 'feature') off for all features ...
final.iloc[:,1:] = False
# ... then switch on only the features at row positions 29, 31 and 32.
# NOTE(review): the cells visible in this notebook register only four features
# (one per entry of w2v_columns), so these positions look out of range — confirm
# which features are meant; selecting them by name would be less fragile.
final.iloc[[29,31,32],1:] = True
# Display the resulting selection table.
final

In [None]:
# Keep only the feature frames whose flag in the selection table is on.
train_flags = final.train
test_flags = final.test
final_train = [features_train[i] for i in range(len(features_train)) if train_flags[i]==True]
final_test = [features_test[i] for i in range(len(features_test)) if test_flags[i]==True]


def _left_join_all(base, frames):
    """Left-join every frame in ``frames`` onto ``base`` using the CLNT_ID key."""
    for frame in frames:
        base = pd.merge(base, frame, on=['CLNT_ID'], how='left')
    return base


# Start from the sorted distinct customer IDs and attach the selected features;
# customers without a value for some feature get 0 there.
# NOTE(review): CLNT_ID itself stays in these frames as a column — confirm that
# it is meant to be passed to the models as a feature.
data_train = _left_join_all(
    pd.DataFrame({'CLNT_ID':np.sort(train.CLNT_ID.unique())}), final_train
).fillna(0)
data_test = _left_join_all(
    pd.DataFrame({'CLNT_ID':np.sort(test.CLNT_ID.unique())}), final_test
).fillna(0)

In [None]:
# Accumulator for the per-model class-probability frames appended by the model
# cells below.
preds = list()

### LGBM

In [None]:
# Randomized search over a LightGBM classifier, evaluated with 4-fold stratified
# cross-validation and scored by negative log loss.
kfold = StratifiedKFold(n_splits=4)
n_it = 10
params = {
    'n_estimators': [100],
    'objective': ['multiclass'],
    'learning_rate': [0.1],
    'max_depth': [10],
}
model = RandomizedSearchCV(
    LGBMClassifier(),
    param_distributions=params,
    n_iter=n_it,
    cv=kfold,
    verbose=1,
    n_jobs=-1,
    scoring='neg_log_loss',
)
model.fit(data_train, cust_train.LABEL)
print('========BEST_SCORE = ', model.best_score_)
model_LGBM = model.best_estimator_

# Class probabilities of the best estimator on the test matrix, kept for the
# ensemble.
pred_LGBMs = pd.DataFrame(model_LGBM.predict_proba(data_test))
preds.append(pred_LGBMs)

### RandomForest

In [None]:
# Randomized search over a random forest, evaluated with 4-fold stratified
# cross-validation and scored by negative log loss.
kfold = StratifiedKFold(n_splits=4)
n_it = 10
params = {
    # The candidate range must come from the matrix that is actually fitted
    # (data_train), not from the raw `train` table, whose column count is
    # unrelated to the number of model features.
    'max_features': list(range(1, data_train.shape[1] + 1)),
    'min_samples_split': [3, 5, 7],
    'min_samples_leaf': [1, 3, 5],
    'max_depth': [4, 6, 8],
    'bootstrap': [True, False],
    'n_estimators': [300, 500],
    'criterion': ['gini', 'entropy'],
}
model = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=params,
    n_iter=n_it,
    cv=kfold,
    verbose=1,
    n_jobs=-1,
    scoring='neg_log_loss',
)
model.fit(data_train, cust_train.LABEL)
print('========BEST_SCORE = ', model.best_score_)
model_RF = model.best_estimator_

# Class probabilities of the best estimator on the test matrix, kept for the
# ensemble.
pred_RF = pd.DataFrame(model_RF.predict_proba(data_test))
preds.append(pred_RF)

### XGBoost

In [None]:
# Randomized search over an XGBoost classifier, evaluated with 4-fold stratified
# cross-validation and scored by negative log loss.
kfold = StratifiedKFold(n_splits=4)
n_it = 10
params = {
    'n_estimators': [300],
    'max_depth': [7],
    # 'multi:softprob' is the probability-output variant of the multiclass
    # objective. This cell scores by log loss and calls predict_proba(), both of
    # which need probabilities; 'multi:softmax' is the class-label variant and,
    # depending on the XGBoost version, predict_proba() may reject it.
    'objective': ['multi:softprob'],
    'learning_rate': [0.1],
}
model = RandomizedSearchCV(
    XGBClassifier(),
    param_distributions=params,
    n_iter=n_it,
    cv=kfold,
    verbose=1,
    n_jobs=-1,
    scoring='neg_log_loss',
)
# NOTE(review): if cust_train.LABEL holds strings, newer XGBoost releases may
# require integer-encoded labels — confirm against the installed version.
model.fit(data_train, cust_train.LABEL)
print('========BEST_SCORE = ', model.best_score_)
model_XGB = model.best_estimator_

# Class probabilities of the best estimator on the test matrix, kept for the
# ensemble.
pred_XGB = pd.DataFrame(model_XGB.predict_proba(data_test))
preds.append(pred_XGB)

### LogisticRegression

In [None]:
# Randomized search over logistic regression, evaluated with 4-fold stratified
# cross-validation and scored by negative log loss.
kfold = StratifiedKFold(n_splits=4)
n_it = 10
_c_values = [1, 0.5, 0.1]
_max_iters = [100, 300, 500]
# Each penalty is paired with a solver that supports it: the default solver only
# handles 'l2', so sampling 'l1' or 'elasticnet' without changing the solver makes
# those candidates fail to fit. 'elasticnet' additionally needs an l1_ratio.
params = [
    {'penalty': ['l2'], 'C': _c_values, 'max_iter': _max_iters},
    {'penalty': ['l1'], 'solver': ['saga'], 'C': _c_values, 'max_iter': _max_iters},
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.3, 0.5, 0.7],
        'C': _c_values,
        'max_iter': _max_iters,
    },
]
model = RandomizedSearchCV(
    LogisticRegression(),
    param_distributions=params,
    n_iter=n_it,
    cv=kfold,
    verbose=1,
    n_jobs=-1,
    scoring='neg_log_loss',
)
model.fit(data_train, cust_train.LABEL)
print('========BEST_SCORE = ', model.best_score_)
model_LR = model.best_estimator_

# Class probabilities of the best estimator on the test matrix, kept for the
# ensemble.
pred_LR = pd.DataFrame(model_LR.predict_proba(data_test))
preds.append(pred_LR)

### Soft Voting

In [None]:
# Soft voting: element-wise mean of the probability frames collected in `preds`.
last_pred = sum(preds) / len(preds)

### Submission

In [None]:
# Pair each test customer ID with its six class probabilities.
# NOTE(review): this writes the LightGBM probabilities (pred_LGBMs); the
# soft-voting average `last_pred` computed in the previous cell is not used —
# confirm which one is meant to be submitted.
result = pd.concat([cust_test.CLNT_ID, pred_LGBMs], axis=1)
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

# Create the output folder first so a fresh checkout does not fail on to_csv.
os.makedirs('submit', exist_ok=True)
result.to_csv('submit/submission.csv', index=False)