In [None]:
from tqdm.notebook import tqdm 
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    accuracy_score,
    classification_report
)
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from xgboost import XGBClassifier

from typing import List, Any, Tuple

tqdm.pandas()

In [4]:
df_train = pd.read_csv('./../../../data/train.csv')
df_val = pd.read_csv('./../../../data/val.csv')
df_test = pd.read_csv('./../../../data/test.csv')

In [5]:
df_train = pd.concat([df_train, df_val])

In [6]:
df_train.head()

Unnamed: 0,corrected_text,length,ratio_err,labels
0,the story making mona lisa smile the author ta...,423,0.037825,3
1,the last ship try land venus was three decades...,342,0.038012,2
2,electrola college not just college process the...,715,0.027972,1
3,many countries are pushing towards less cars m...,608,0.004934,5
4,voters some times get confused about the lecte...,277,0.028881,2


In [8]:
df_train['text'] = df_train['corrected_text'].progress_apply(lambda x: ' '.join(preprocess_string(x)))

  0%|          | 0/13845 [00:00<?, ?it/s]

In [9]:
print(f"vocabulary size: {len(set(' '.join(df_train['text']).split()))}")

vocabulary size: 18559


In [10]:
df_train["labels"] = LabelEncoder().fit_transform(df_train["labels"])

In [11]:
df_train.labels.value_counts()

labels
2    5024
1    3778
3    3141
0    1001
4     776
5     125
Name: count, dtype: int64

In [12]:
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(df_train['text'])
y_train = df_train['labels']

X_train.shape

(13845, 18553)

In [13]:
def bayesian_model_hp_search(model: object,
                             X: pd.DataFrame,
                             y: pd.DataFrame,
                             dim: List[int])-> Tuple[List[int], List[dict]]:
    """
    Return the list of scores for each dim of TruncatedSVD
    """

    scores = []
    for d in tqdm(dim, desc='Computing scores for each dimension'):
        pipeline = make_pipeline(
            TruncatedSVD(n_components=d),
            model
        )
        score = cross_validate(pipeline, 
                                X, 
                                y, 
                                cv=3,
                                n_jobs=-1,
                                scoring=('accuracy', 
                                         'balanced_accuracy', 
                                         'f1_macro', 
                                         'f1_weighted', 
                                         'f1_micro', 
                                         'precision_macro', 
                                         'precision_weighted', 
                                         'precision_micro', 
                                         'recall_macro', 
                                         'recall_weighted', 
                                         'recall_micro', 
                                         'matthews_corrcoef'),
                                return_train_score=True)

        scores.append(score)


    return dim, scores

In [15]:
dim = [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

In [16]:
models = {
    'SGDClassifier': SGDClassifier(),
    'SGDRegressor': SGDRegressor(),
    'RandomForestClassifier': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'GaussianNB': GaussianNB(),
    'MultinomialNB': MultinomialNB(),
    'XGBClassifier': XGBClassifier()
}

In [17]:
scores = {}
dims = {}
for name, model in tqdm(models.items(), desc='Models'):
    dim, scores[name] = bayesian_model_hp_search(model, X_train, y_train, dim)
    dims[name] = dim

Models:   0%|          | 0/8 [00:00<?, ?it/s]

Computing scores for each dimension:   0%|          | 0/12 [00:00<?, ?it/s]

Computing scores for each dimension:   0%|          | 0/12 [00:00<?, ?it/s]

Computing scores for each dimension:   0%|          | 0/12 [00:00<?, ?it/s]