In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import operator
import gc
import time

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import string
import re

import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import csr_matrix

In [2]:
# Load the three pre-split feature tables (train / dev / test) in one pass.
train, dev, test = map(
    pd.read_csv,
    (
        './data/BEST&MOST200/train-best200.csv',
        './data/BEST&MOST200/dev-best200.csv',
        './data/BEST&MOST200/test-best200.csv',
    ),
)

In [3]:
# Report (rows, columns) of each raw split.
for label, frame in (('Train shape:', train), ('Dev shape  :', dev), ('Test shape :', test)):
    print(label, frame.shape)

Train shape: (96585, 457)
Dev shape  : (34028, 457)
Test shape : (32977, 457)


In [4]:
def merge_same_user(df):
    """Collapse a per-row frame into one row per user.

    Numeric columns are summed within each ``user-id`` and the user's
    ``class`` is the most frequent label among that user's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user-id`` and ``class``.

    Returns
    -------
    pandas.DataFrame
        Exactly one row per ``user-id``: the summed numeric columns
        followed by the ``class`` column.
    """
    def _most_frequent(labels):
        # mode() returns *all* tied values (sorted). Taking the first one
        # guarantees a single label per user, so the merge below can never
        # duplicate a user's row when two classes are equally frequent.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) else np.nan

    df_class = df.groupby('user-id')['class'].agg(_most_frequent).reset_index()

    # Keep the label out of the summation explicitly: summing a string column
    # is version-dependent in pandas and would collide with the merged label.
    df_new = (
        df.drop(columns=['class'])
        .groupby('user-id')
        .sum(numeric_only=True)
        .reset_index()
    )

    return pd.merge(df_new, df_class, on='user-id')

In [5]:
# Aggregate every split to one row per user.
train_per_user = merge_same_user(train)
dev_per_user = merge_same_user(dev)
test_per_user = merge_same_user(test)

# Train on train + dev together. ignore_index=True gives the stacked frame a
# fresh 0..n-1 index; without it both halves keep their own 0-based labels and
# any later label-based alignment would hit duplicated index values.
train_per_user = pd.concat([train_per_user, dev_per_user], axis=0, ignore_index=True)

In [6]:
# Report (rows, columns) of the per-user frames.
for label, frame in (('Train per user shape:', train_per_user), ('Test per user shape :', test_per_user)):
    print(label, frame.shape)

Train per user shape: (3190, 457)
Test per user shape : (802, 457)


In [7]:
def accuracy(data, pred):
    """Custom evaluation metric: multiclass accuracy.

    Parameters
    ----------
    data : array-like of shape (n_samples,)
        Integer-encoded true labels.
    pred : numpy.ndarray
        Per-class scores, either as a flat 1-D array laid out class by class
        (all samples for class 0, then class 1, ...) or as a 2-D array of
        shape (n_samples, n_classes).

    Returns
    -------
    tuple
        ``('accuracy', value, True)`` where ``True`` marks the metric as
        higher-is-better.
    """
    y_true = np.asarray(data)
    scores = np.asarray(pred)

    if scores.ndim == 1:
        # Flat class-major layout: infer the number of classes from the number
        # of labels instead of hard-coding it, then put samples on axis 0.
        scores = scores.reshape(-1, len(y_true)).T
    # NOTE(review): newer LightGBM releases appear to pass a 2-D
    # (n_samples, n_classes) array here; that shape is used as-is. Confirm
    # against the installed version.

    y_pred = np.argmax(scores, axis=1)
    acc = np.mean(y_true == y_pred)
    return 'accuracy', acc, True

def train_model(X, y, X_test, params, fold_num):
    """Train one LightGBM classifier per stratified fold and collect results.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (rows are selected positionally via ``iloc``).
    y : array-like of shape (len(X),)
        Integer-encoded class labels.
    X_test : pandas.DataFrame
        Features to predict; must have the same columns as ``X``.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    fold_num : int
        Number of stratified folds.

    Returns
    -------
    dict
        ``'oof'``: out-of-fold class probabilities, shape (len(X), n_classes);
        ``'prediction'``: test probabilities averaged over folds;
        ``'scores'``: per-fold validation accuracy;
        ``'feature_importance'``: mean importance per feature, descending.
    """
    # Positional indexing below must not depend on a pandas index on y.
    y = np.asarray(y)
    n_classes = len(np.unique(y))

    oof = np.zeros((len(X), n_classes))
    prediction = np.zeros((len(X_test), n_classes))
    scores = []
    features = X.columns
    fold_importances = []

    folds = StratifiedKFold(n_splits=fold_num)

    for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        print(f'\nFold {fold + 1} started at {time.ctime()}')

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = lgb.LGBMClassifier(
            **params,
            )

        # The validation fold drives early stopping and is also the fold the
        # reported accuracy is computed on.
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric=accuracy,
                  verbose=False,
                  early_stopping_rounds=100)

        y_pred_val_proba = model.predict_proba(X_val)
        y_pred_val = model.predict(X_val)
        y_pred_proba = model.predict_proba(X_test)

        accuracy_val = accuracy_score(y_val, y_pred_val)
        scores.append(accuracy_val)

        print('\nFold {0: d} accuracy: {1: .4f}'.format(fold + 1, accuracy_val))

        oof[val_idx] += y_pred_val_proba.reshape(-1, n_classes)
        # Each fold contributes an equal share to the averaged test prediction.
        prediction += y_pred_proba / fold_num

        fold_importances.append(pd.DataFrame({
            'feature': features,
            'importance': model.feature_importances_,
            'fold': fold + 1,
        }))

    print('\nCV mean accuracy {0: .4f}'.format(np.mean(scores)))

    # Concatenate once after the loop rather than growing a frame per fold.
    feature_importance = pd.concat(fold_importances, axis=0)
    best_features = (
        feature_importance[['feature', 'importance']]
        .groupby('feature')
        .mean()
        .sort_values(by='importance', ascending=False)
        .reset_index()
    )

    return {
        'oof': oof,
        'prediction': prediction,
        'scores': scores,
        'feature_importance': best_features,
    }

In [8]:
# Feature matrices: every column except the identifiers and the target.
X = train_per_user.drop(columns=['tweet-id', 'user-id', 'class'])
X_test = test_per_user.drop(columns=['tweet-id', 'user-id', 'class'])

# Integer-encode the labels; the encoder is kept to map predictions back later.
encoder = LabelEncoder()
y = encoder.fit_transform(train_per_user['class'])

# Baseline run on the full feature set, 10 stratified folds.
params = {'n_estimators': 5000}
result_dict = train_model(X, y, X_test, params, 10)


Fold 1 started at Fri Oct 11 17:41:51 2019

Fold  1 accuracy:  0.8224

Fold 2 started at Fri Oct 11 17:42:02 2019

Fold  2 accuracy:  0.8193

Fold 3 started at Fri Oct 11 17:42:11 2019

Fold  3 accuracy:  0.8219

Fold 4 started at Fri Oct 11 17:42:22 2019

Fold  4 accuracy:  0.8281

Fold 5 started at Fri Oct 11 17:42:33 2019

Fold  5 accuracy:  0.8019

Fold 6 started at Fri Oct 11 17:42:44 2019

Fold  6 accuracy:  0.8365

Fold 7 started at Fri Oct 11 17:42:54 2019

Fold  7 accuracy:  0.8239

Fold 8 started at Fri Oct 11 17:43:07 2019

Fold  8 accuracy:  0.8176

Fold 9 started at Fri Oct 11 17:43:18 2019

Fold  9 accuracy:  0.8396

Fold 10 started at Fri Oct 11 17:43:24 2019

Fold  10 accuracy:  0.8365

CV mean accuracy  0.8248


In [9]:
oof_proba = result_dict['oof']

# .copy() makes this an independent frame; adding columns to a bare column
# selection of train_per_user is chained assignment (SettingWithCopyWarning).
validation = train_per_user[['user-id', 'class']].copy()
validation['prediction'] = encoder.inverse_transform(np.argmax(oof_proba, axis=1))

# One probability column per class, named after the original label.
for class_idx, class_name in enumerate(encoder.classes_):
    validation[class_name] = oof_proba[:, class_idx]

validation.to_csv('./result-per-user/lgbm-val.csv', index=False)
accuracy_score(validation['class'], validation['prediction'])

0.8247648902821316

In [10]:
test_proba = result_dict['prediction']

# .copy() makes this an independent frame; adding columns to a bare column
# selection of test_per_user is chained assignment (SettingWithCopyWarning).
submission = test_per_user[['user-id', 'class']].copy()
submission['prediction'] = encoder.inverse_transform(np.argmax(test_proba, axis=1))

# One probability column per class, named after the original label.
for class_idx, class_name in enumerate(encoder.classes_):
    submission[class_name] = test_proba[:, class_idx]

submission.to_csv('./result-per-user/lgbm-test.csv', index=False)

In [11]:
# Laughter-token features to discard.
remove_cols = ['aha', 'ahah', 'ahaha', 'ahahah', 'ahahaha', 'ahahahaha', 'bahaha', 'bahahaha', 'haha', 'hahah', 'hahaha', 'hahahah', 'hahahaha', 'hahahahaha', 'hahahahahaha']

# Add every English stop word that is actually present as a feature column.
# sorted() makes the list order reproducible (stop_words is a set).
remove_cols.extend(sorted(w for w in stop_words if w in train.columns))

# errors='ignore' keeps this cell re-runnable: the frames are overwritten in
# place, so on a second execution the columns are already gone.
train_per_user = train_per_user.drop(columns=remove_cols, errors='ignore')
test_per_user = test_per_user.drop(columns=remove_cols, errors='ignore')

In [12]:
# Report (rows, columns) of the per-user frames after the column removal.
for label, frame in (('Train per user shape:', train_per_user), ('Test per user shape :', test_per_user)):
    print(label, frame.shape)

Train per user shape: (3190, 401)
Test per user shape : (802, 401)


In [13]:
# Rebuild the feature matrices from the reduced per-user frames.
X = train_per_user.drop(columns=['tweet-id', 'user-id', 'class'])
X_test = test_per_user.drop(columns=['tweet-id', 'user-id', 'class'])

# Fresh label encoder for this run; reused below to decode predictions.
encoder = LabelEncoder()
y = encoder.fit_transform(train_per_user['class'])

# Same settings as the baseline run so the two CV scores are comparable.
params = {'n_estimators': 5000}
result_dict = train_model(X, y, X_test, params, 10)


Fold 1 started at Fri Oct 11 17:43:37 2019

Fold  1 accuracy:  0.8224

Fold 2 started at Fri Oct 11 17:43:43 2019

Fold  2 accuracy:  0.8193

Fold 3 started at Fri Oct 11 17:43:49 2019

Fold  3 accuracy:  0.8219

Fold 4 started at Fri Oct 11 17:43:54 2019

Fold  4 accuracy:  0.8063

Fold 5 started at Fri Oct 11 17:44:02 2019

Fold  5 accuracy:  0.7956

Fold 6 started at Fri Oct 11 17:44:09 2019

Fold  6 accuracy:  0.8270

Fold 7 started at Fri Oct 11 17:44:15 2019

Fold  7 accuracy:  0.8208

Fold 8 started at Fri Oct 11 17:44:23 2019

Fold  8 accuracy:  0.8082

Fold 9 started at Fri Oct 11 17:44:33 2019

Fold  9 accuracy:  0.8113

Fold 10 started at Fri Oct 11 17:44:41 2019

Fold  10 accuracy:  0.7987

CV mean accuracy  0.8132


In [14]:
oof_proba = result_dict['oof']

# .copy() makes this an independent frame; adding columns to a bare column
# selection of train_per_user is chained assignment (SettingWithCopyWarning).
validation = train_per_user[['user-id', 'class']].copy()
validation['prediction'] = encoder.inverse_transform(np.argmax(oof_proba, axis=1))

# One probability column per class, named after the original label.
for class_idx, class_name in enumerate(encoder.classes_):
    validation[class_name] = oof_proba[:, class_idx]

validation.to_csv('./result-per-user/lgbm-remove-stopwords-val.csv', index=False)
accuracy_score(validation['class'], validation['prediction'])

0.8131661442006269

In [15]:
test_proba = result_dict['prediction']

# .copy() makes this an independent frame; adding columns to a bare column
# selection of test_per_user is chained assignment (SettingWithCopyWarning).
submission = test_per_user[['user-id', 'class']].copy()
submission['prediction'] = encoder.inverse_transform(np.argmax(test_proba, axis=1))

# One probability column per class, named after the original label.
for class_idx, class_name in enumerate(encoder.classes_):
    submission[class_name] = test_proba[:, class_idx]

submission.to_csv('./result-per-user/lgbm-remove-stopwords-test.csv', index=False)