In [31]:
from tqdm.notebook import tqdm 
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import (
    MultinomialNB,
    GaussianNB
)
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    accuracy_score,
    classification_report
)
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Register tqdm's pandas integration so `Series.progress_apply` (used for the
# text preprocessing step further down) is available and shows a progress bar.
tqdm.pandas()

In [13]:
# Read the three dataset splits (train / validation / test) in one go; the
# paths are relative to the notebook's working directory.
df_train, df_val, df_test = map(
    pd.read_csv,
    ('../data/train.csv', '../data/val.csv', '../data/test.csv'),
)

In [14]:
# Fold the validation split into the training data. Both frames come straight
# from `read_csv`, so each carries its own 0-based index; `ignore_index=True`
# renumbers the combined rows 0..n-1 instead of keeping repeated labels, which
# would make label-based lookups and index alignment ambiguous.
# NOTE(review): re-running this cell appends `df_val` again -- reload the CSVs
# first if the cell has to be executed a second time.
df_train = pd.concat([df_train, df_val], ignore_index=True)

In [15]:
# Quick sanity check: show the first five rows of the combined training frame.
df_train.head(n=5)

Unnamed: 0,text,labels
0,the article the challenge exploring venus the ...,3
1,ever wanted live different planet scientist fe...,3
2,cowboy who rode the waves program where you ge...,3
3,boy named luke bomberger life was about change...,2
4,who tired driving who tired buying new cars ha...,4


In [16]:
# Run every document through gensim's `preprocess_string` (default filters),
# which yields a list of tokens, then glue the tokens back into a single
# space-separated string so the column stays a column of plain text.
df_train['text'] = df_train['text'].progress_apply(
    lambda document: ' '.join(preprocess_string(document))
)

  0%|          | 0/13845 [00:00<?, ?it/s]

In [18]:
# Count the distinct whitespace-separated tokens across all training documents.
print(f"vocabulary size: {len({token for document in df_train['text'] for token in document.split()})}")

vocabulary size: 18559


In [23]:
# Keep the fitted encoder instead of throwing it away: `label_encoder.classes_`
# and `label_encoder.inverse_transform` are the only way to map the encoded ids
# back to the original label values, and `label_encoder.transform` is what
# encodes any other split with the same mapping.
label_encoder = LabelEncoder()
df_train["labels"] = label_encoder.fit_transform(df_train["labels"])

In [24]:
# Class balance of the encoded targets (rows per label, most frequent first).
df_train["labels"].value_counts()

labels
2    5024
1    3778
3    3141
0    1001
4     776
5     125
Name: count, dtype: int64

In [26]:
# Targets: the encoded label column of the training frame.
y_train = df_train['labels']

# Features: learn the TF-IDF vocabulary/weights on the training text and
# vectorise that same text in one pass. The fitted vectorizer stays in `tfidf`.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(df_train['text'])

In [34]:
def bayesian_model_hp_search(model,
                   X,
                   y,
                   dim,
                   cv=3,
                   n_jobs=-1,
                   random_state=None):
    """
    Cross-validate `model` behind a TruncatedSVD step, once per candidate dimension.

    Note that, despite the name, every value in `dim` is simply evaluated in
    order; there is no adaptive/Bayesian selection of the next candidate.

    Parameters
    ----------
    model : estimator
        Final step of the pipeline, placed after TruncatedSVD.
    X : feature matrix
        Passed unchanged to `cross_validate`.
    y : targets
        Passed unchanged to `cross_validate`.
    dim : iterable of int
        Values tried as `n_components` of TruncatedSVD. One-shot iterators
        such as generators are accepted.
    cv : default=3
        Forwarded to `cross_validate` as `cv`.
    n_jobs : default=-1
        Forwarded to `cross_validate` as `n_jobs`.
    random_state : default=None
        Forwarded to TruncatedSVD. The default leaves the SVD unseeded, as
        before; pass an int to make repeated runs comparable.

    Returns
    -------
    dim
        The candidate dimensions: the very object that was passed in, or a
        list of its values when a one-shot iterator was supplied.
    scores : list
        One `cross_validate` result per candidate, in the same order as `dim`.
    """

    # A generator/iterator would be drained by the loop below and handed back
    # to the caller empty, so snapshot the candidates up front. A concrete
    # list also lets tqdm display a total.
    is_one_shot = iter(dim) is dim
    candidates = list(dim)

    # Loop-invariant: the same metrics are requested for every candidate.
    metrics = ('accuracy',
               'balanced_accuracy',
               'f1_macro',
               'f1_weighted',
               'f1_micro',
               'precision_macro',
               'precision_weighted',
               'precision_micro',
               'recall_macro',
               'recall_weighted',
               'recall_micro',
               'matthews_corrcoef')

    scores = []
    for d in tqdm(candidates, desc='Computing scores for each dimension'):
        pipeline = make_pipeline(
            TruncatedSVD(n_components=d, random_state=random_state),
            model
        )
        score = cross_validate(pipeline,
                               X,
                               y,
                               cv=cv,
                               n_jobs=n_jobs,
                               scoring=metrics,
                               return_train_score=True)

        scores.append(score)

    # Hand back the caller's own container whenever it is still usable, so
    # existing callers see exactly the object they passed in.
    return (candidates if is_one_shot else dim), scores
    