In [24]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import operator
import gc
import time

import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import string
import re

import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import csr_matrix

from keras.layers import add, Dense, Input, Dropout, Activation, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras import backend as K
from keras import models
from keras.engine.topology import Layer
from keras.utils import np_utils

In [45]:
# Read the tweet-level train / dev / test splits in one pass; the tuple order
# on the right matches the unpacking order on the left.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/BEST&MOST200/train-best200.csv',
        './data/BEST&MOST200/dev-best200.csv',
        './data/BEST&MOST200/test-best200.csv',
    )
)

In [26]:
# Show (rows, columns) for each tweet-level split.
for caption, split in zip(('Train shape:', 'Dev shape  :', 'Test shape :'),
                          (train, dev, test)):
    print(caption, split.shape)

Train shape: (96585, 457)
Dev shape  : (34028, 457)
Test shape : (32977, 457)


In [27]:
def merge_same_user(df):
    """Collapse a tweet-level frame into one row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'user-id' column and a 'class' label column. Every
        other column is summed within each user.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'user-id': the key, the per-user column sums,
        and a final 'class' column holding the user's most frequent label.
    """
    def _first_mode(labels):
        # Series.mode() returns *all* tied values in sorted order. Keeping only
        # the first one guarantees exactly one label per user; otherwise a tie
        # would emit several rows and the merge below would duplicate the user.
        modes = labels.mode()
        return modes.iloc[0] if len(modes) else np.nan

    df_class = df.groupby('user-id')['class'].agg(_first_mode).reset_index()

    # Drop the label before summing: depending on the pandas version, summing a
    # string column is either silently skipped or performed as concatenation,
    # and the latter would collide with the 'class' column merged in below.
    df_new = df.drop(columns=['class']).groupby('user-id').sum().reset_index()
    df_new = pd.merge(df_new, df_class, on='user-id')

    return df_new

In [28]:
# Aggregate each split to one row per user.
train_per_user, dev_per_user = map(merge_same_user, (train, dev))
test_per_user = merge_same_user(test)

# The dev users are appended to the training users; cross-validation later
# provides the held-out estimates instead of the dev split.
train_per_user = pd.concat([train_per_user, dev_per_user], axis=0)

In [29]:
# Show (rows, columns) of the user-level frames.
for caption, frame in (('Train per user shape:', train_per_user),
                       ('Test per user shape :', test_per_user)):
    print(caption, frame.shape)

Train per user shape: (3190, 457)
Test per user shape : (802, 457)


In [31]:
# Feature matrices: every aggregated column except the two id columns and the label.
X = train_per_user.drop(columns=['tweet-id', 'user-id', 'class'])
X_test = test_per_user.drop(columns=['tweet-id', 'user-id', 'class'])

# Standardise with statistics fitted on the training users only, then apply
# the same transform to the test users. (A MinMaxScaler variant was previously
# tried here and left disabled.)
ss = StandardScaler().fit(X)
X = ss.transform(X)
X_test = ss.transform(X_test)

# Labels: raw class value -> integer id -> one-hot row.
y = train_per_user['class']
encoder = LabelEncoder().fit(y)
y_one_hot = np_utils.to_categorical(encoder.transform(y))

In [8]:
# 10-fold stratified cross-validation: one dense network is trained per fold
# and the epoch with the best validation accuracy is checkpointed to disk.
#
# `random_state` is deliberately not passed. Without `shuffle=True` it has no
# influence on the split, and recent scikit-learn releases raise a ValueError
# for that combination. The folds therefore stay in their deterministic,
# unshuffled order, which the prediction cell relies on to pair each saved
# checkpoint with the same validation indices.
folds = StratifiedKFold(n_splits=10)

for fold_num, (train_idx, val_idx) in enumerate(folds.split(X, y), start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_one_hot[train_idx], y_one_hot[val_idx]

    # Release the previous fold's graph/session before building a new model.
    gc.collect()
    K.clear_session()

    # NOTE(review): only the third hidden block is followed by a ReLU; the
    # first two Dense layers are linear. Confirm this is intended.
    model = Sequential()
    model.add(Dense(128, input_shape=(X.shape[1],)))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(64))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(32))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(3))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop after 5 epochs without a val_acc improvement; keep only the best epoch.
    # NOTE(review): the same fold is used for model selection and for the
    # out-of-fold accuracy reported later, so that figure is optimistic.
    es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
    mcp = ModelCheckpoint(filepath=f'./NN-models/models/fold-{fold_num}.hdf5',
                          monitor='val_acc', mode='max',
                          verbose=1,
                          save_best_only=True)

    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              batch_size=32,
              epochs=50,
              verbose=1,
              callbacks=[mcp, es]
             )






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 2869 samples, validate on 321 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.78816, saving model to ./models/fold-1.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.78816 to 0.85047, saving model to ./models/fold-1.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.85047 to 0.87539, saving model to ./models/fold-1.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.87539 to 0.87850, saving model to ./models/fold-1.hdf5
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.87850
Epoch 6/50

Epoch 00006: val_acc improved from 0.87850 to 0.88474, saving model to ./models/fold-1.hdf5
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.88474
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.88474
Epoch 9/50

Epoch 00009: val_acc di


Epoch 00007: val_acc did not improve from 0.84906
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.84906
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.84906
Epoch 00009: early stopping
Train on 2872 samples, validate on 318 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.78302, saving model to ./models/fold-7.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.78302 to 0.84277, saving model to ./models/fold-7.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.84277 to 0.87107, saving model to ./models/fold-7.hdf5
Epoch 4/50

Epoch 00004: val_acc did not improve from 0.87107
Epoch 5/50

Epoch 00005: val_acc did not improve from 0.87107
Epoch 6/50

Epoch 00006: val_acc improved from 0.87107 to 0.87421, saving model to ./models/fold-7.hdf5
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.87421
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.87421
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.87421
Epoch 10/50

Epoch 00010: v


Epoch 00010: val_acc did not improve from 0.83333
Epoch 11/50

Epoch 00011: val_acc did not improve from 0.83333
Epoch 12/50

Epoch 00012: val_acc did not improve from 0.83333
Epoch 13/50

Epoch 00013: val_acc did not improve from 0.83333
Epoch 00013: early stopping


In [32]:
# Out-of-fold probabilities for the training users and fold-averaged
# probabilities for the test users, using the per-fold checkpoints on disk.
val_prob = np.zeros((len(train_per_user), 3), dtype=np.float32)
pred_prob = np.zeros((len(test_per_user), 3), dtype=np.float32)

# Must reproduce the split used at training time so each checkpoint is scored
# on the users it never saw. `random_state` is omitted because it has no effect
# without `shuffle=True` (and newer scikit-learn rejects the combination).
folds = StratifiedKFold(n_splits=10)
n_folds = folds.get_n_splits()

for fold_num, (_train_idx, val_idx) in enumerate(folds.split(X, y), start=1):
    # Release the previously loaded model before loading the next one.
    gc.collect()
    K.clear_session()

    model = models.load_model(f'./NN-models/models/fold-{fold_num}.hdf5')
    val_prob[val_idx] = model.predict(X[val_idx], batch_size=32, verbose=1)
    # Accumulate (not overwrite) so pred_prob ends up as the mean over all
    # folds; plain assignment kept only the last fold's output divided by 10.
    pred_prob += model.predict(X_test, batch_size=32, verbose=1) / n_folds



In [33]:
# Out-of-fold report: true label, predicted label and one probability column
# per class, written to CSV. The cell's value is the out-of-fold accuracy.
# .copy() makes `validation` an independent frame, so the column assignments
# below never act on a slice of train_per_user (SettingWithCopy).
validation = train_per_user[['user-id', 'class']].copy()
validation['prediction'] = encoder.inverse_transform(np.argmax(val_prob, axis=1))
# Column i of val_prob corresponds to encoder.classes_[i].
for class_idx, class_name in enumerate(encoder.classes_):
    validation[class_name] = val_prob[:, class_idx]
validation.to_csv('./result-per-user/cnn-val.csv', index=False)
accuracy_score(validation['class'], validation['prediction'])

0.8554858934169279

In [41]:
# Test-set report: user id, the label column carried in the test file, the
# predicted label and one probability column per class, written to CSV.
# .copy() makes `submission` an independent frame, so the column assignments
# below never act on a slice of test_per_user (SettingWithCopy).
submission = test_per_user[['user-id', 'class']].copy()
submission['prediction'] = encoder.inverse_transform(np.argmax(pred_prob, axis=1))
# Column i of pred_prob corresponds to encoder.classes_[i].
for class_idx, class_name in enumerate(encoder.classes_):
    submission[class_name] = pred_prob[:, class_idx]
submission.to_csv('./result-per-user/cnn-sub.csv', index=False)

In [12]:
# Feature columns to discard for the second experiment: laughter variants plus
# every NLTK English stop word that appears as a column of the tweet-level frame.
remove_cols = ['aha', 'ahah', 'ahaha', 'ahahah', 'ahahaha', 'ahahahaha', 'bahaha', 'bahahaha', 'haha', 'hahah', 'hahaha', 'hahahah', 'hahahaha', 'hahahahaha', 'hahahahahaha']

# stop_words is a set, so sort the matches to get a reproducible column order.
remove_cols.extend(sorted(w for w in stop_words if w in train.columns))

# This cell overwrites train_per_user / test_per_user. errors='ignore' keeps it
# re-runnable (columns already dropped by an earlier run are skipped) and
# tolerates a listed token that is not a column of the loaded feature file.
train_per_user = train_per_user.drop(columns=remove_cols, errors='ignore')
test_per_user = test_per_user.drop(columns=remove_cols, errors='ignore')

In [13]:
# Rebuild the feature matrices from the reduced column set: everything except
# the two id columns and the label.
X = train_per_user.drop(columns=['tweet-id', 'user-id', 'class'])
X_test = test_per_user.drop(columns=['tweet-id', 'user-id', 'class'])

# Standardise with statistics fitted on the training users only, then apply
# the same transform to the test users. (A MinMaxScaler variant was previously
# tried here and left disabled.)
ss = StandardScaler().fit(X)
X = ss.transform(X)
X_test = ss.transform(X_test)

# Labels: raw class value -> integer id -> one-hot row.
y = train_per_user['class']
encoder = LabelEncoder().fit(y)
y_one_hot = np_utils.to_categorical(encoder.transform(y))

In [14]:
# 10-fold stratified cross-validation on the reduced feature set: one dense
# network is trained per fold and the epoch with the best validation accuracy
# is checkpointed to disk.
#
# `random_state` is deliberately not passed. Without `shuffle=True` it has no
# influence on the split, and recent scikit-learn releases raise a ValueError
# for that combination. The folds therefore stay in their deterministic,
# unshuffled order, which the prediction cell relies on to pair each saved
# checkpoint with the same validation indices.
folds = StratifiedKFold(n_splits=10)

for fold_num, (train_idx, val_idx) in enumerate(folds.split(X, y), start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_one_hot[train_idx], y_one_hot[val_idx]

    # Release the previous fold's graph/session before building a new model.
    gc.collect()
    K.clear_session()

    # NOTE(review): only the third hidden block is followed by a ReLU; the
    # first two Dense layers are linear. Confirm this is intended.
    model = Sequential()
    model.add(Dense(128, input_shape=(X.shape[1],)))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())
    model.add(Dense(64))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(32))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(3))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop after 5 epochs without a val_acc improvement; keep only the best epoch.
    # NOTE(review): the same fold is used for model selection and for the
    # out-of-fold accuracy reported later, so that figure is optimistic.
    es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
    mcp = ModelCheckpoint(filepath=f'./models-remove-stopwords/fold-{fold_num}.hdf5',
                          monitor='val_acc', mode='max',
                          verbose=1,
                          save_best_only=True)

    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              batch_size=32,
              epochs=50,
              verbose=1,
              callbacks=[mcp, es]
             )

Train on 2869 samples, validate on 321 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.77882, saving model to ./models-remove-stopwords/fold-1.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.77882 to 0.84735, saving model to ./models-remove-stopwords/fold-1.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.84735 to 0.87227, saving model to ./models-remove-stopwords/fold-1.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.87227 to 0.87539, saving model to ./models-remove-stopwords/fold-1.hdf5
Epoch 5/50

Epoch 00005: val_acc improved from 0.87539 to 0.88785, saving model to ./models-remove-stopwords/fold-1.hdf5
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.88785
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.88785
Epoch 8/50

Epoch 00008: val_acc improved from 0.88785 to 0.89097, saving model to ./models-remove-stopwords/fold-1.hdf5
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.89097
Epoch 10/50

Epoch 00010: val_acc did not im


Epoch 00007: val_acc did not improve from 0.85000
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.85000
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.85000
Epoch 10/50

Epoch 00010: val_acc improved from 0.85000 to 0.85313, saving model to ./models-remove-stopwords/fold-4.hdf5
Epoch 11/50

Epoch 00011: val_acc did not improve from 0.85313
Epoch 12/50

Epoch 00012: val_acc did not improve from 0.85313
Epoch 13/50

Epoch 00013: val_acc did not improve from 0.85313
Epoch 14/50

Epoch 00014: val_acc did not improve from 0.85313
Epoch 15/50

Epoch 00015: val_acc did not improve from 0.85313
Epoch 00015: early stopping
Train on 2872 samples, validate on 318 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.81447, saving model to ./models-remove-stopwords/fold-5.hdf5
Epoch 2/50

Epoch 00002: val_acc did not improve from 0.81447
Epoch 3/50

Epoch 00003: val_acc improved from 0.81447 to 0.83019, saving model to ./models-remove-stopwords/fold-5.hdf5
Epoch 4/50




Epoch 00005: val_acc improved from 0.85220 to 0.86478, saving model to ./models-remove-stopwords/fold-7.hdf5
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.86478
Epoch 7/50

Epoch 00007: val_acc improved from 0.86478 to 0.86792, saving model to ./models-remove-stopwords/fold-7.hdf5
Epoch 8/50

Epoch 00008: val_acc improved from 0.86792 to 0.87107, saving model to ./models-remove-stopwords/fold-7.hdf5
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.87107
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.87107
Epoch 11/50

Epoch 00011: val_acc did not improve from 0.87107
Epoch 12/50

Epoch 00012: val_acc did not improve from 0.87107
Epoch 13/50

Epoch 00013: val_acc did not improve from 0.87107
Epoch 00013: early stopping
Train on 2872 samples, validate on 318 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.76415, saving model to ./models-remove-stopwords/fold-8.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.76415 to 0.81132, saving model

Train on 2872 samples, validate on 318 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.77987, saving model to ./models-remove-stopwords/fold-10.hdf5
Epoch 2/50

Epoch 00002: val_acc improved from 0.77987 to 0.80503, saving model to ./models-remove-stopwords/fold-10.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.80503 to 0.81447, saving model to ./models-remove-stopwords/fold-10.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.81447 to 0.82390, saving model to ./models-remove-stopwords/fold-10.hdf5
Epoch 5/50

Epoch 00005: val_acc improved from 0.82390 to 0.82390, saving model to ./models-remove-stopwords/fold-10.hdf5
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.82390
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.82390
Epoch 8/50

Epoch 00008: val_acc did not improve from 0.82390
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.82390
Epoch 10/50

Epoch 00010: val_acc did not improve from 0.82390
Epoch 00010: early stopping


In [15]:
# Out-of-fold probabilities for the training users and fold-averaged
# probabilities for the test users, using the reduced-feature checkpoints.
val_prob = np.zeros((len(train_per_user), 3), dtype=np.float32)
pred_prob = np.zeros((len(test_per_user), 3), dtype=np.float32)

# Must reproduce the split used at training time so each checkpoint is scored
# on the users it never saw. `random_state` is omitted because it has no effect
# without `shuffle=True` (and newer scikit-learn rejects the combination).
folds = StratifiedKFold(n_splits=10)
n_folds = folds.get_n_splits()

for fold_num, (_train_idx, val_idx) in enumerate(folds.split(X, y), start=1):
    # Release the previously loaded model before loading the next one.
    gc.collect()
    K.clear_session()

    model = models.load_model(f'./models-remove-stopwords/fold-{fold_num}.hdf5')
    val_prob[val_idx] = model.predict(X[val_idx], batch_size=32, verbose=1)
    # Accumulate (not overwrite) so pred_prob ends up as the mean over all
    # folds; plain assignment kept only the last fold's output divided by 10.
    pred_prob += model.predict(X_test, batch_size=32, verbose=1) / n_folds



In [16]:
# Out-of-fold report for the reduced-feature models: true label, predicted
# label and one probability column per class, written to CSV. The cell's value
# is the out-of-fold accuracy.
# .copy() makes `validation` an independent frame, so the column assignments
# below never act on a slice of train_per_user (SettingWithCopy).
validation = train_per_user[['user-id', 'class']].copy()
validation['prediction'] = encoder.inverse_transform(np.argmax(val_prob, axis=1))
# Column i of val_prob corresponds to encoder.classes_[i].
for class_idx, class_name in enumerate(encoder.classes_):
    validation[class_name] = val_prob[:, class_idx]
validation.to_csv('./result-per-user/cnn-remove-stopwords-val.csv', index=False)
accuracy_score(validation['class'], validation['prediction'])

0.8479623824451411

In [17]:
# Test-set report for the reduced-feature models: user id, the label column
# carried in the test file, the predicted label and one probability column per
# class, written to CSV.
# .copy() makes `submission` an independent frame, so the column assignments
# below never act on a slice of test_per_user (SettingWithCopy).
submission = test_per_user[['user-id', 'class']].copy()
submission['prediction'] = encoder.inverse_transform(np.argmax(pred_prob, axis=1))
# Column i of pred_prob corresponds to encoder.classes_[i].
for class_idx, class_name in enumerate(encoder.classes_):
    submission[class_name] = pred_prob[:, class_idx]
submission.to_csv('./result-per-user/cnn-remove-stopwords-sub.csv', index=False)