In [184]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from tqdm.notebook import tqdm
from nltk.corpus import stopwords
import nltk
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')

from catboost import CatBoostClassifier, Pool
import spacy

import optuna
from optuna.samplers import TPESampler

seed = 42

In [102]:
# Seed NumPy's global random number generator.
np.random.seed(seed)

# Directory that holds the two competition CSV files.
path = r'../input/covid-19-nlp-text-classification'
train_csv = path + '/Corona_NLP_train.csv'
test_csv = path + '/Corona_NLP_test.csv'

# NOTE(review): only the training file is decoded as Latin-1; the test file is
# read with pandas' default encoding — confirm this asymmetry is intended.
train = pd.read_csv(train_csv, encoding='latin1')
test = pd.read_csv(test_csv)

In [7]:
# Show four randomly chosen rows of the training frame.
train.sample(4)

In [8]:
# Relative frequency of each Sentiment label in the training data.
train.Sentiment.value_counts(normalize=True)

In [99]:
# Check whether UserName takes a distinct value on every row, i.e. whether it
# can serve as a row key (Preprocessor.transform uses it as the index).
train.UserName.nunique() == len(train)

In [10]:
# Summary statistics for the object-dtype columns only.
train.describe(include=['object'])

In [11]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [94]:
# English stop-word list from NLTK; Preprocessor.preprocessor filters tokens against it.
stop = stopwords.words('english')

def report(y_train, y_train_pred, y_test, y_test_pred, y_train_proba=None, y_test_proba=None):
    """Print classification metrics for a training split and a held-out split.

    Parameters
    ----------
    y_train, y_test : true labels of the two splits.
    y_train_pred, y_test_pred : predicted labels of the two splits.
    y_train_proba, y_test_proba : optional predicted scores/probabilities.
        ROC AUC is printed only when BOTH are given.

    Prints a classification report per split, optionally the ROC AUC of both
    splits, and a confusion matrix (cross-tabulation) for the held-out split.
    Returns None.
    """
    print('Train\n', classification_report(y_train, y_train_pred, digits=3))
    print('Test\n', classification_report(y_test, y_test_pred, digits=3))
    if y_train_proba is not None and y_test_proba is not None:
        # roc_auc_score is not part of the notebook's import cell, so it is
        # imported here; previously this branch failed with NameError.
        # NOTE(review): called with default arguments — for multi-class targets
        # scikit-learn additionally requires a `multi_class` strategy.
        from sklearn.metrics import roc_auc_score

        roc_train = roc_auc_score(y_train, y_train_proba)
        roc_test = roc_auc_score(y_test, y_test_proba)
        print(f'Train ROC_AUC: {roc_train:.3f}, Test ROC_AUC: {roc_test:.3f}')
    print('Confusion Matrix', '\n', pd.crosstab(y_test, y_test_pred))

In [149]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Clean the tweet frame and encode the tweet date as cyclical features.

    ``fit`` records the largest day-of-month and month number found in
    ``TweetAt``. ``transform`` then

    * moves ``UserName`` into the index and drops ``ScreenName``;
    * replaces missing or '?'-containing ``Location`` values with ``'UNK'``;
    * cleans and stems ``OriginalTweet``;
    * replaces ``TweetAt`` by ``month_cos``, ``month_sin``, ``day_cos`` and
      ``day_sin`` (appended in that order).

    NOTE(review): ``pd.to_datetime`` is left to infer the date format. If
    ``TweetAt`` is written day-first, ambiguous dates may be parsed month-first
    — confirm and pass an explicit ``format``/``dayfirst`` if so.
    NOTE(review): the cyclical period is the maximum value seen during ``fit``,
    not the calendar period (31 / 12) — confirm this is intended.
    """

    # One shared stemmer instead of constructing a new one for every tweet.
    _stemmer = SnowballStemmer(language='english')

    def fit(self, X, y=None):
        """Store the maximum day and month of ``X['TweetAt']``; ``y`` is ignored."""
        # Only the date column is needed, so the frame itself is not copied.
        tweet_at = pd.to_datetime(X['TweetAt'])
        self.max_day = tweet_at.dt.day.max()
        self.max_month = tweet_at.dt.month.max()
        return self

    def transform(self, X):
        """Return a transformed copy of ``X``; the input frame is left untouched."""
        # set_index/drop both return new frames, so X itself is never modified.
        X_ = X.set_index('UserName', drop=True).drop('ScreenName', axis=1)

        # Assign the filled column back instead of calling
        # ``X_.Location.fillna(..., inplace=True)``: an in-place method on a
        # column obtained by attribute access is chained assignment, which does
        # not reach the parent frame under pandas copy-on-write. The remaining
        # NaN would then break the "'?' in value" check below.
        location = X_['Location'].fillna('UNK')
        X_['Location'] = location.apply(lambda loc: 'UNK' if '?' in loc else loc)

        X_['OriginalTweet'] = X_['OriginalTweet'].apply(self.preprocessor).apply(self.snow_stem)

        tweet_at = pd.to_datetime(X_['TweetAt'])
        month = tweet_at.dt.month.rename('month')
        day = tweet_at.dt.day.rename('day')
        encoded = [
            self.cyclical_encode(month, self.max_month),
            self.cyclical_encode(day, self.max_day),
        ]
        # All pieces share X_'s index, so the column-wise concat keeps row order.
        return pd.concat([X_.drop('TweetAt', axis=1), *encoded], axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``transform(X)``."""
        self.fit(X, y)
        return self.transform(X)

    def preprocessor(self, text: str) -> str:
        """Normalise one tweet to lower-case alphanumeric tokens.

        Removes HTML tags, URLs, @mentions, #hashtags, digits, punctuation and
        the words in the module-level ``stop`` list, then joins the remaining
        tokens with single spaces.

        The former emoticon step was removed because it had no effect: it
        searched for ``D``/``P`` after lower-casing, and every emoticon
        character it appended was stripped again by the following substitution.
        """
        # r'\r', r'\n' and r'\x' are literal backslash sequences (two characters),
        # not control characters; real line breaks are handled by the \W step.
        text = text.replace('\xa0', ' ').replace(r'\r', '').replace(r'\n', '').replace(r'\x', '')
        text = re.sub(r'<[^>]*>', '', text).lower()
        text = re.sub(r'(https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*', '', text)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#\w+', '', text)
        text = re.sub(r'\d+', '', text)
        # Every non-word character becomes a separator.
        tokens = [word for word in re.sub(r'\W', ' ', text).split() if word not in stop]
        # Strip what \w still allows but is not ASCII alphanumeric (underscores,
        # accented letters); a token may become empty here, hence the final
        # collapsing of repeated spaces.
        tokens = [re.sub(r'[^a-zA-Z0-9]+', '', word) for word in tokens]
        return re.sub(' +', ' ', ' '.join(tokens))

    def snow_stem(self, text: str) -> str:
        """Stem every whitespace-separated word; each stem is followed by one space."""
        return ''.join(f'{self._stemmer.stem(word)} ' for word in text.split())

    @staticmethod
    def cyclical_encode(x: pd.Series, x_max: float) -> pd.DataFrame:
        """Map ``x`` onto the unit circle with period ``x_max``.

        Returns a frame indexed like ``x`` with columns ``<name>_cos`` and
        ``<name>_sin``, where ``<name>`` is ``x.name``.
        """
        # Work on the raw values so that no index alignment is involved.
        angle = (2 * np.pi * x / x_max).to_numpy()
        return pd.DataFrame(
            {f'{x.name}_cos': np.cos(angle), f'{x.name}_sin': np.sin(angle)},
            index=x.index,
        )

In [167]:
# Separate the label column from the feature columns.
y = train['Sentiment']
X = train.drop(columns='Sentiment')

# Shuffled, stratified hold-out split: 20 % of the rows go to validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=seed,
)
print(f'Train size: {len(X_train)}, validation size: {len(X_valid)}')

In [168]:
# Labels and features of the separate test file.
y_test = test['Sentiment']
X_test = test.drop(columns='Sentiment')

In [174]:
# Give each label Series the index of its feature frame so that labels can be
# selected later through the feature frame's index.
# NOTE(review): the execution counts show this cell was last run (In [174])
# AFTER the preprocessing cell (In [170]), i.e. when the frames were already
# indexed by UserName. Run top-to-bottom it executes before that re-indexing
# — confirm the intended cell order.
y_train.index = X_train.index
y_valid.index = X_valid.index
y_test.index = X_test.index

In [170]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the validation and test splits.
# NOTE: transform() moves UserName into the index and drops columns, so this
# cell cannot be re-run on its own output.
prep = Preprocessor()
X_train = prep.fit_transform(X_train)
X_valid = prep.transform(X_valid)
X_test = prep.transform(X_test)

In [175]:
def _drop_empty_tweets(X, y):
    """Remove rows whose cleaned tweet is the empty string, keeping X and y aligned.

    Rows are matched by POSITION, not by index label: X and y come from the
    same split and the preprocessing keeps row order, whereas their index
    labels only agree if y was re-indexed after the transform. A label lookup
    (``y[X.index]``) therefore depends on the order in which cells were run.

    Returns ``(X_kept, y_kept)``; ``y_kept`` carries the index of ``X_kept``.
    Raises ValueError if X and y differ in length.
    """
    if len(X) != len(y):
        raise ValueError(f'X has {len(X)} rows but y has {len(y)} labels')
    # Missing tweets (NaN length) are kept, exactly as an "== 0" drop would do.
    keep = (X['OriginalTweet'].str.len() != 0).to_numpy()
    X_kept = X.loc[keep].copy()
    y_kept = pd.Series(np.asarray(y)[keep], index=X_kept.index, name=getattr(y, 'name', None))
    return X_kept, y_kept


X_train, y_train = _drop_empty_tweets(X_train, y_train)
X_valid, y_valid = _drop_empty_tweets(X_valid, y_valid)
X_test, y_test = _drop_empty_tweets(X_test, y_test)
print(f'Train size: {len(X_train)}, valid size: {len(X_valid)}, test size: {len(X_test)}')

In [176]:
# First row of the transformed training frame, to check the resulting columns.
X_train.head(1)

In [177]:
# Column roles passed to CatBoost: one categorical column and one text column.
cat_cols = ['Location']
text_cols = ['OriginalTweet']

# Both pools are built with the same feature-role declarations.
feature_roles = {'cat_features': cat_cols, 'text_features': text_cols}
train_pool = Pool(X_train, y_train, **feature_roles)
valid_pool = Pool(X_valid, y_valid, **feature_roles)

In [189]:
# Fixed CatBoost settings shared by every Optuna trial; objective() merges the
# sampled hyper-parameters with this dict.
params_model = {
    # Accuracy on the eval set is the monitored metric.
    'eval_metric': 'Accuracy', 
    # Upper bound on boosting rounds; early stopping (below) usually ends sooner.
    'iterations': 1000,
    'objective': 'MultiClass',
    'silent': True,
    'one_hot_max_size': 5,
    # Stop once the eval metric has not improved for 10 rounds.
    'early_stopping_rounds': 10,
    'allow_writing_files': False,
    'use_best_model': True,
    # Bernoulli bootstrap is what makes the sampled 'subsample' value applicable.
    'bootstrap_type': 'Bernoulli',
    'random_seed': seed,
    }

In [194]:
def objective(trial):
    """Optuna objective: fit CatBoost with the trial's parameters, return validation accuracy.

    Reads the module-level ``params_model`` (fixed settings), ``train_pool`` /
    ``valid_pool`` (data) and ``y_valid`` (validation labels).

    Every hyper-parameter is always sampled, but some are neutralised
    afterwards depending on ``grow_policy``; ``trial.params`` can therefore
    list values that were not actually used for the fit:

    * ``SymmetricTree`` — ``min_data_in_leaf`` and ``max_leaves`` are unset;
    * ``Depthwise``     — ``max_leaves`` is unset;
    * any non-symmetric policy — ``boosting_type`` is forced to ``'Plain'``.
    """
    sampled = {
        'depth': trial.suggest_int('depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1., 5.),
        'subsample': trial.suggest_float('subsample', 0.3, 1.),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 255),
        'max_leaves': trial.suggest_int('max_leaves', 5, 64),
    }

    grow_policy = sampled['grow_policy']
    if grow_policy == 'SymmetricTree':
        # None (rather than removing the key) so that a stale value in
        # params_model cannot leak through the merge below.
        sampled['min_data_in_leaf'] = None
        sampled['max_leaves'] = None
    else:
        sampled['boosting_type'] = 'Plain'
        if grow_policy == 'Depthwise':
            sampled['max_leaves'] = None

    # Sampled values are applied LAST. The previous `update(params_model)` let
    # the fixed dict win, so once params_model also contained tuned
    # hyper-parameters every trial trained the very same model.
    param_trials = {**params_model, **sampled}

    opt_model = CatBoostClassifier(**param_trials)
    opt_model.fit(train_pool, eval_set=valid_pool)

    # Predict on the pool the model was validated on; ravel() guards against a
    # column-vector result.
    y_pred = np.ravel(opt_model.predict(valid_pool))
    return accuracy_score(y_valid, y_pred)

In [195]:
# Silence Optuna's per-trial log output.
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=TPESampler(seed=seed), direction="maximize")
# Trials run sequentially (n_jobs=1). With parallel trials the order in which
# results reach the seeded sampler is not deterministic, so the seed would no
# longer make the search repeatable.
# NOTE(review): parallel trials would presumably also compete with CatBoost's
# own multi-threading for CPU cores — confirm before raising n_jobs again.
# The search stops after 400 trials or 6000 seconds, whichever comes first.
study.optimize(objective, n_trials=400, timeout=6000, n_jobs=1)

print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params')
for key, value in trial.params.items():
    print(f'{key}: {value}')

In [198]:
# Final model configuration: the fixed settings used during the search plus
# hard-coded tuned hyper-parameter values.
# NOTE(review): the tuned values are presumably copied from the study output
# above — confirm their provenance.
# NOTE: this rebinds `params_model`, the same global that objective() reads.
params_model = {
    'eval_metric': 'Accuracy', 
    'iterations': 1000,
    'objective': 'MultiClass',
    'silent': True,
    'one_hot_max_size': 5,
    'early_stopping_rounds': 10,
    'allow_writing_files': False,
    'use_best_model': True,
    'bootstrap_type': 'Bernoulli',
    'random_seed': seed,
    # --- tuned hyper-parameters ---
    'depth': 9,
    'learning_rate': 0.3291016862107266,
    'l2_leaf_reg': 2.86661730004558,
    'subsample': 0.9932243896098077,
    'colsample_bylevel': 0.6926179357220379,
    'boosting_type': 'Plain',
    'auto_class_weights': 'SqrtBalanced',
    'grow_policy': 'Lossguide',
    'min_data_in_leaf': 182,
    'max_leaves': 51,
    # Disabled experiment, kept for reference:
    #'combinations_ctr': ['Counter:CtrBorderCount=40:Prior=0.5/1'],
    }

In [199]:
# Train the final model; the validation pool drives early stopping.
cat = CatBoostClassifier(**params_model)
cat.fit(train_pool, eval_set=valid_pool)

# Flatten every prediction array to 1-D once, here. Previously only the test
# predictions were flattened, and only for one of their uses, so the
# cross-tabulation inside report() and the later confusion matrix received a
# differently shaped array.
train_pred = np.ravel(cat.predict(X_train))
valid_pred = np.ravel(cat.predict(X_valid))
test_pred = np.ravel(cat.predict(X_test))

# Train vs. validation metrics, followed by the metrics on the test file.
report(y_train, train_pred, y_valid, valid_pred)
print(classification_report(y_test, test_pred, digits=3))

In [205]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Class labels in order of first appearance in the training labels; the same
# array orders the matrix rows/columns and labels the axes.
class_labels = y_train.unique()

cm = confusion_matrix(y_test, test_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
# Vertical tick labels so that long class names do not overlap.
plt.xticks(rotation=90)
plt.show()

Pipeline steps for LogisticRegression:
* Preprocessor
* ColumnTransformer: Location -> category_encoders.cat_boost.CatBoostEncoder; day_sin, day_cos, month_sin, month_cos -> MinMaxScaler(); OriginalTweet -> TfidfVectorizer (or HashingVectorizer if out of memory); remainder='passthrough'
* Model



In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression

Pipeline steps for the Keras BERT implementation:
* Preprocessor
* Same column transformations as for LogisticRegression, except that OriginalTweet is passed through the BERT preprocessor instead of a vectorizer
* Model: Dense model or Conv1D model (see the GitHub NLP repository for examples)

```
def build_classifier_model(num_classes=2):
    """Sketch of a BERT + Conv1D classifier with an extra numerical input.

    The tweet text goes through the TF-Hub preprocessing layer and BERT
    encoder; the encoder's per-token output is convolved, max-pooled,
    concatenated with the numerical features and classified by a dense head.

    Relies on the globals ``tfhub_handle_preprocess``, ``tfhub_handle_encoder``
    and ``X_train``.

    num_classes: number of softmax output units (default 2, as before).
    NOTE(review): set this to the number of Sentiment classes, and confirm
    that ``X_train.shape[-1]`` counts only the numerical feature columns.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    # `shape` must be a tuple — (n,) rather than (n) — and the name literal was
    # missing its closing quote.
    num_input = tf.keras.layers.Input(shape=(X_train.shape[-1],), dtype=tf.float32, name='numerical_input')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    # 'sequence_output' = all encoded tokens (needed for the conv layers);
    # use 'pooled_output' (whole encoded sentence) if the next layer is linear.
    net = outputs['sequence_output']
    conv = tf.keras.layers.Conv1D(64, (2), activation='relu')(net)
    conv = tf.keras.layers.Conv1D(128, (2), activation='relu')(conv)
    pool = tf.keras.layers.GlobalMaxPool1D()(conv) # or GlobalAveragePooling1D
    # A Keras layer (instead of raw tf.concat) keeps the graph a plain functional model.
    pool = tf.keras.layers.Concatenate(axis=1)([pool, num_input])
    drop = tf.keras.layers.Dropout(0.25)(pool)
    net = tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(5e-5))(drop)
    net = tf.keras.layers.Dropout(0.25)(net)
    net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
    return tf.keras.Model([text_input, num_input], net)
```
Alternatively, define a custom `tf.keras.Model` subclass that:
* takes `df.values` as input
* computes the BERT output for the text feature
* passes the BERT output through the conv layers
* concatenates the pooled conv output with the numerical part of the input and returns the final output of the dense layers

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization