In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import GridSearchCV
import itertools
from tqdm import tqdm

In [2]:
# Path of the parquet file holding the full dataset; later cells read the
# 'google-news_w2v_mean_prenorm' and 'class' columns from a sample of it.
FILE = "../data/w2v-vectors_goog_pren.parquet.gzip"
data = pd.read_parquet(FILE)

In [3]:
# Load the pickled index labels that select the training rows of `data`.
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on artifacts produced by this project.
with open("../data/wip/train_idx.pkl", 'rb') as fh:
    train_idx = pickle.load(fh)

In [4]:
# Keep only the rows of `data` whose index labels are listed in train_idx.
training_data = data.loc[train_idx]

In [5]:
# random_state passed to DataFrame.sample when drawing the tuning subsample.
SEED = 3742
# Number of training rows drawn for the class-weight search.
N_SAMPLE = 100_000

In [6]:
# Reproducible random subsample of N_SAMPLE training rows (seeded with SEED).
tuning_data = training_data.sample(N_SAMPLE,random_state=SEED)

In [7]:
# Drop the names bound to the full frames; the cells visible below only use
# tuning_data. Re-running earlier cells after this one requires reloading.
del data
del training_data

In [8]:
# Column holding the per-row feature array that is stacked into X_train.
features = 'google-news_w2v_mean_prenorm'
# Column holding the class label that becomes y_train.
target = 'class'

In [9]:
# Collapse the eight raw class ids into four training labels:
# 2 -> 0, 7 -> 1, 6 -> 2, and every other id (0, 1, 3, 4, 5) -> 3.
CLASS_REMAP = {2:0,7:1,6:2,5:3,1:3,3:3,4:3,0:3}

remapped_target = tuning_data[target].map(CLASS_REMAP)

# Series.map yields NaN for any value missing from the dict; fail loudly here
# rather than letting NaN labels reach the estimator.
assert remapped_target.notna().all(), "tuning_data contains class ids missing from CLASS_REMAP"

# NOTE: this overwrites the column in place, so the cell is NOT idempotent.
# Running it a second time remaps the already-remapped labels
# (0 -> 3, 1 -> 3, 2 -> 0) without any error. Re-create tuning_data first.
tuning_data[target] = remapped_target

In [10]:
# Stack the per-row feature arrays into one ndarray with a new leading axis.
# Assumes every row holds an array of the same shape; np.stack raises otherwise.
X_train = np.stack(tuning_data[features])
# Remapped class labels, aligned row-for-row with X_train.
y_train = tuning_data[target]

In [11]:
# Number of tuning rows per label; shows how imbalanced the classes are.
y_train.value_counts()

0    54852
1    21878
2    17970
3     5300
Name: class, dtype: int64

In [12]:
from datetime import datetime

from sklearn.utils.class_weight import compute_class_weight

# Sorted unique class labels. The same array is zipped with the candidate
# weights when the grid is built, so the weights below must follow its order.
classes = np.unique(y_train)

# 'balanced' heuristic: n_samples / (n_classes * count(class)), one weight per
# entry of `classes`. Reusing `classes` avoids a second np.unique pass and
# keeps labels and weights aligned.
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

In [13]:
def expand_weight_range(value, k=3, amplitude=.1):
    '''
      Creates k evenly spaced candidate weights around a given value.

      The candidates run from value * (1 + amplitude) down to
      value * (1 - amplitude), both ends included.

      value: the starting weight value (centre of the range)
      k: the total number of weights to obtain
      amplitude: expressed as a fraction of value, it is the relative
             half-width of the range (0.1 means +/-10%).

      Returns a 1-D numpy array of length k. The original value itself is
      part of the result only when k is odd; with k=1 only the upper bound
      value * (1 + amplitude) is returned.

      Example:
      expand_weight_range(value=100, k=3, amplitude=0.20)

      result:
      array([120., 100.,  80.])

      '''
    # Absolute half-width of the interval around `value`.
    spread = value * amplitude
    return np.linspace(value + spread, value - spread, k)



# One array of candidate weights per class, in the same order as class_weights.
ranges = list(map(expand_weight_range, class_weights))

# Every combination that picks exactly one candidate weight for each class.
catesian_pr_weights = [*itertools.product(*ranges)]

# Turn each combination into a {class_label: weight} dict, the form accepted
# by the estimator's class_weight parameter.
weight_ranges = [
    dict(zip(classes, combo))
    for combo in catesian_pr_weights
]

# Number of candidate weightings that the grid search will evaluate.
len(weight_ranges)

81

In [15]:
# Wall-clock timestamps bracket the search so the run cost is visible.
start = datetime.now()
print(f"start: {start}")

# Exhaustive 5-fold cross-validated search over the candidate class-weight
# dicts, scored with macro-averaged F1 so every class counts equally
# regardless of its frequency.
gsc = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid={
        'class_weight': list(weight_ranges)
    },
    scoring='f1_macro',
    cv=5,
    verbose=1
)

grid_result = gsc.fit(X_train, y_train)

print(f"Best params: {grid_result.best_params_}")

# Persist the fitted search so later cells can reload it without refitting.
# The context manager guarantees the file is flushed and closed even if
# pickling raises.
# NOTE(review): assumes the tmp/ directory already exists; confirm before a
# long run, otherwise the dump fails after the fit has completed.
with open("tmp/grid_result.pkl", 'wb') as fh:
    pickle.dump(grid_result, fh)

end = datetime.now()
print(f"end: {end}")

start: 2023-04-15 00:31:46.542976
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best params: {'class_weight': {0: 0.5013490848100343, 1: 1.2569704726208977, 2: 1.530328324986088, 3: 4.245283018867925}}
end: 2023-04-15 01:01:51.725756


In [16]:
# Reload the fitted grid search from disk so the cells below can run without
# repeating the search. The context manager closes the file handle
# deterministically.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts written by this notebook.
with open("tmp/grid_result.pkl", 'rb') as fh:
    grid_result = pickle.load(fh)

In [30]:
# Show the 'balanced' starting weights next to the combination selected by the
# grid search; both lists follow the order of `classes`.
print(f"initial weights: {list(class_weights)}")
print(f"after gridsearchcv: {list(grid_result.best_params_['class_weight'].values())}")

initial weights: [0.45577189528184936, 1.1427004296553616, 1.391207568169171, 4.716981132075472]
after gridsearchcv: [0.5013490848100343, 1.2569704726208977, 1.530328324986088, 4.245283018867925]
