In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from skopt import forest_minimize
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [2]:
df = pd.read_csv('../data/raw_data_labeled.csv')

df = df[df['y'].notnull()]

In [3]:
df.isnull().sum()

watch-title                       0
y                                 0
watch-view-count                 21
watch-time-text                   0
watch-extras-section              0
content_watch-info-tag-list       0
og:image                          0
og:image:width                    0
og:image:height                   0
og:description                    0
og:video:width                   18
og:video:height                  18
og:video:tag                    198
channel_link_0                    8
Unnamed: 14                    1976
dtype: int64

In [4]:
# Clean date attribute
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date[0] = clean_date[0].dropna().astype(str)
clean_date[2] = clean_date[2].dropna().astype(str)

month_map = {"jan": "Jan",
             "fev": "Feb",
             "mar": "Mar",
             "abr": "Apr",
             "mai": "May",
             "jun": "Jun",
             "jul": "Jul",
             "ago": "Aug",
             "set": "Sep",
             "out": "Oct",
             "nov": "Nov",
             "dez": "Dec"}

clean_date[1] = clean_date[1].map(month_map)
clean_date = clean_date.dropna().apply(lambda x: " ".join(x), axis=1)
clean_date = pd.to_datetime(clean_date, format="%d %b %Y")

# Clean view number
views = df['watch-view-count'].str.extract(r"(\d+\.?\d*)", expand=False)
views = views.str.replace(".", "").fillna(0).astype(int)

In [5]:
features = pd.DataFrame()
y = df['y'].copy()

In [6]:
features['time_since_pub'] = (pd.to_datetime("2020-03-29") -  # HARDCODED
                              clean_date) / np.timedelta64(1, 'D')

# Extracting n of view feature
features['views'] = views

# Extracting n of view/day feature
features['views_per_day'] = features['views'] / features['time_since_pub']

# Droping time_since_pub to prevent bias
features = features.drop(['time_since_pub'], axis=1)

# Dropping problematic features
y = y[features.index]
df = df.loc[features.index]

In [7]:
resolutions = []
for height, width in zip(df['og:video:height'], df['og:video:width']):
    try:
        height = float(height)
        width = float(width)
    except:
        resolutions.append(np.nan)
        continue
        
    resolutions.append(height*width)
    
features['resolution'] = resolutions

In [8]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2275
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   views          2097 non-null   int64  
 1   views_per_day  2097 non-null   float64
 2   resolution     1957 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 65.5 KB


In [9]:
# Around 75% train and 25% to validation
split_date = '2020-02-27'
mask_train = (clean_date < split_date) & (clean_date.notnull())
mask_val = (clean_date >= split_date) & (clean_date.notnull())

X_train, X_val = features[mask_train.values], features[mask_val.values]
y_train, y_val = y[mask_train.values], y[mask_val.values]
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1827, 3), (270, 3), (1827,), (270,))

In [10]:
# Filling NaNs
X_train['resolution'] = X_train['resolution'].fillna(X_train['resolution'].mean())
X_val['resolution'] = X_val['resolution'].fillna(X_train['resolution'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
X_val.isnull().sum()

views            0
views_per_day    0
resolution       0
dtype: int64

In [23]:
# Extracting features from title
train_titles = df[mask_train]['watch-title']
val_titles = df[mask_val]['watch-title']

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

# Concat the BoW into features df
X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

# Random Forest

In [24]:
# Random Forest Model
rfc = RandomForestClassifier(n_estimators=200, random_state=42,
                             class_weight="balanced", n_jobs=8)
rfc.fit(X_train_title, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=8,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [25]:
# Predicting
rf_train_proba = rfc.predict_proba(X_train_title)
rf_train_preds = rfc.predict(X_train_title)
rf_val_proba = rfc.predict_proba(X_val_title)
rf_val_preds = rfc.predict(X_val_title)

In [26]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, rf_train_proba))
print('avg_precision_score: ', average_precision_score(y_train, rf_train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, rf_train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, rf_val_preds))
print('avg_precision_score: ', average_precision_score(y_val, rf_val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, rf_val_proba[:, 1]))

TRAIN METRICS:
log_loss:  0.1537853590465786
avg_precision_score:  0.9999978657105049
roc_auc:  0.9999993604600594

VALIDATION METRICS:
log_loss:  6.651992450614706
avg_precision_score:  0.4893890775561317
roc_auc:  0.792641975308642


# LGBM

In [13]:
lgbm = LGBMClassifier(random_state=42, class_weight="balanced", n_jobs=7)
lgbm.fit(X_train_title, y_train)

NameError: name 'X_train_title' is not defined

In [14]:
# Predicting
lgbm_train_proba = lgbm.predict_proba(X_train_title)
lgbm_train_preds = lgbm.predict(X_train_title)
lgbm_val_proba = lgbm.predict_proba(X_val_title)
lgbm_val_preds = lgbm.predict(X_val_title)

NameError: name 'X_train_title' is not defined

In [15]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, lgbm_train_preds))
print('avg_precision_score: ', average_precision_score(y_train, lgbm_train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, lgbm_train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, lgbm_val_preds))
print('avg_precision_score: ', average_precision_score(y_val, lgbm_val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, lgbm_val_proba[:, 1]))

TRAIN METRICS:


NameError: name 'lgbm_train_preds' is not defined

In [17]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    min_df = params[6]
    ngram_range = (1, params[7])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(train_titles)
    title_bow_val = title_vec.transform(val_titles)
    
    X_train_title = hstack([X_train, title_bow_train])
    X_val_title = hstack([X_val, title_bow_val])

    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                         class_weight="balanced", n_jobs=8)
    mdl.fit(X_train_title, y_train)
    
    p = mdl.predict_proba(X_val_title)[:, 1]
    
    print(roc_auc_score(y_val, p))
    
    return -average_precision_score(y_val, p)


space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df
          (1,5)] # ngram_range

res = forest_minimize(tune_lgbm, space, random_state=42, n_random_starts=50, n_calls=75, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 8, 7, 0.4735411152109116, 0.14497617002710275, 558, 5, 4]




0.7677037037037038
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5326
Function value obtained: -0.4647
Current minimum: -0.4647
Iteration No: 2 started. Evaluating function at random point.
[0.0019307837536547132, 3, 2, 0.7358988336534836, 0.9416250735649627, 485, 4, 5]




0.7790617283950617
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.5521
Function value obtained: -0.4456
Current minimum: -0.4647
Iteration No: 3 started. Evaluating function at random point.
[0.017177621112338382, 10, 12, 0.07190930378934497, 0.5485359272454697, 610, 2, 4]




0.7468641975308642
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.4444
Function value obtained: -0.4643
Current minimum: -0.4647
Iteration No: 4 started. Evaluating function at random point.
[0.08861577452533079, 3, 12, 0.538522716492931, 0.6127938404189405, 230, 5, 3]
0.7559506172839506
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.1949
Function value obtained: -0.4439
Current minimum: -0.4647
Iteration No: 5 started. Evaluating function at random point.
[0.022941144328643994, 9, 7, 0.06260171310187321, 0.9450916679006103, 113, 2, 2]
0.7531358024691357
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1732
Function value obtained: -0.4165
Current minimum: -0.4647
Iteration No: 6 started. Evaluating function at random point.
[0.004066563313514795, 5, 2, 0.7000213751865492, 0.46814486905262126, 554, 4, 5]




0.7885432098765432
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.8298
Function value obtained: -0.4721
Current minimum: -0.4721
Iteration No: 7 started. Evaluating function at random point.
[0.04638630972397281, 3, 14, 0.42150757719457876, 0.22312428339865925, 487, 2, 2]




0.7722469135802469
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.2661
Function value obtained: -0.4406
Current minimum: -0.4721
Iteration No: 8 started. Evaluating function at random point.
[0.013658426050382541, 2, 12, 0.9425239944859798, 0.9000859829062665, 369, 4, 1]
0.7809876543209876
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.1474
Function value obtained: -0.4734
Current minimum: -0.4734
Iteration No: 9 started. Evaluating function at random point.
[0.08362652463906246, 5, 16, 0.7599541046305119, 0.5627075257696259, 891, 2, 1]




0.7133827160493827
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.6149
Function value obtained: -0.3715
Current minimum: -0.4734
Iteration No: 10 started. Evaluating function at random point.
[0.0036464395589807202, 9, 15, 0.20700359210985236, 0.06485458640413425, 620, 1, 4]




0.7977283950617284
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.6785
Function value obtained: -0.4219
Current minimum: -0.4734
Iteration No: 11 started. Evaluating function at random point.
[0.0038634593707206726, 3, 17, 0.7257748551112175, 0.8006667635046455, 478, 5, 2]




0.8036543209876543
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.3029
Function value obtained: -0.4921
Current minimum: -0.4921
Iteration No: 12 started. Evaluating function at random point.
[0.005211124595788266, 9, 7, 0.8575366489003095, 0.4769781404312932, 132, 5, 3]




0.8024691358024691
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.4513
Function value obtained: -0.5062
Current minimum: -0.5062
Iteration No: 13 started. Evaluating function at random point.
[0.015226341829186321, 6, 3, 0.49860417890385184, 0.16361453364138662, 561, 3, 1]




0.7793580246913581
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.5144
Function value obtained: -0.4552
Current minimum: -0.5062
Iteration No: 14 started. Evaluating function at random point.
[0.027762530094438326, 7, 14, 0.5191058165461713, 0.5465961879128944, 114, 2, 2]




0.8165432098765433
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.2632
Function value obtained: -0.5119
Current minimum: -0.5119
Iteration No: 15 started. Evaluating function at random point.
[0.007562632622090494, 3, 7, 0.5016017120230062, 0.5851117933775646, 999, 5, 5]




0.7988148148148148
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.7456
Function value obtained: -0.5218
Current minimum: -0.5218
Iteration No: 16 started. Evaluating function at random point.
[0.012013849374287178, 4, 13, 0.12313091433735335, 0.3252638802680796, 185, 4, 2]
0.796395061728395
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.1778
Function value obtained: -0.4674
Current minimum: -0.5218
Iteration No: 17 started. Evaluating function at random point.
[0.01772997389934229, 9, 6, 0.48370784195876476, 0.25751841535599196, 439, 3, 2]




0.7736296296296297
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.4877
Function value obtained: -0.4576
Current minimum: -0.5218
Iteration No: 18 started. Evaluating function at random point.
[0.0017545910486566185, 9, 7, 0.9114870194684664, 0.3085256369154036, 382, 1, 4]




0.8103703703703704
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.9424
Function value obtained: -0.4817
Current minimum: -0.5218
Iteration No: 19 started. Evaluating function at random point.
[0.005071514980544015, 3, 7, 0.20642306048579467, 0.55738494840667, 417, 5, 1]




0.8146172839506173
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.2877
Function value obtained: -0.5152
Current minimum: -0.5218
Iteration No: 20 started. Evaluating function at random point.
[0.0017367237151593167, 8, 15, 0.20987649006428405, 0.25782600859441673, 571, 1, 4]




0.8032592592592593
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 0.8746
Function value obtained: -0.4372
Current minimum: -0.5218
Iteration No: 21 started. Evaluating function at random point.
[0.005336533066379609, 1, 20, 0.2845322596966712, 0.711489059197799, 486, 1, 5]




0.8004938271604938
Iteration No: 21 ended. Evaluation done at random point.
Time taken: 0.5179
Function value obtained: -0.4784
Current minimum: -0.5218
Iteration No: 22 started. Evaluating function at random point.
[0.0037126241790405314, 1, 17, 0.9777842080410204, 0.44048516265231974, 954, 5, 1]




0.7729876543209877
Iteration No: 22 ended. Evaluation done at random point.
Time taken: 0.2739
Function value obtained: -0.4387
Current minimum: -0.5218
Iteration No: 23 started. Evaluating function at random point.
[0.018565330690729166, 2, 12, 0.47539400634443707, 0.5752484346177617, 346, 4, 3]




0.7908148148148149
Iteration No: 23 ended. Evaluation done at random point.
Time taken: 0.2480
Function value obtained: -0.4634
Current minimum: -0.5218
Iteration No: 24 started. Evaluating function at random point.
[0.005486670911103226, 1, 6, 0.3993939760832906, 0.6506905390639006, 297, 3, 1]




0.7848888888888889
Iteration No: 24 ended. Evaluation done at random point.
Time taken: 0.2273
Function value obtained: -0.4507
Current minimum: -0.5218
Iteration No: 25 started. Evaluating function at random point.
[0.002001866389140703, 6, 16, 0.08873638447702573, 0.6113482960288298, 502, 2, 3]




0.7787654320987655
Iteration No: 25 ended. Evaluation done at random point.
Time taken: 0.3179
Function value obtained: -0.4638
Current minimum: -0.5218
Iteration No: 26 started. Evaluating function at random point.
[0.04159126142111956, 4, 7, 0.2156481075547419, 0.7063908511973428, 739, 2, 1]




0.7007407407407409
Iteration No: 26 ended. Evaluation done at random point.
Time taken: 0.3217
Function value obtained: -0.4147
Current minimum: -0.5218
Iteration No: 27 started. Evaluating function at random point.
[0.022458473505101235, 9, 18, 0.9284589373646347, 0.8834723857119321, 710, 4, 2]




0.7672098765432098
Iteration No: 27 ended. Evaluation done at random point.
Time taken: 0.8515
Function value obtained: -0.3781
Current minimum: -0.5218
Iteration No: 28 started. Evaluating function at random point.
[0.09211634266628622, 4, 17, 0.27975967635542914, 0.13844762941560426, 751, 5, 5]




0.7808395061728395
Iteration No: 28 ended. Evaluation done at random point.
Time taken: 0.4830
Function value obtained: -0.4727
Current minimum: -0.5218
Iteration No: 29 started. Evaluating function at random point.
[0.05824688255969174, 7, 9, 0.3149277849625728, 0.7153399384741328, 307, 3, 4]




0.7359012345679011
Iteration No: 29 ended. Evaluation done at random point.
Time taken: 0.4276
Function value obtained: -0.3108
Current minimum: -0.5218
Iteration No: 30 started. Evaluating function at random point.
[0.0594528733047999, 8, 1, 0.6599300638465735, 0.12993296674529642, 519, 4, 3]




0.7600000000000001
Iteration No: 30 ended. Evaluation done at random point.
Time taken: 0.9471
Function value obtained: -0.4073
Current minimum: -0.5218
Iteration No: 31 started. Evaluating function at random point.
[0.005553484104252286, 9, 3, 0.9749806449175515, 0.31972492600059244, 977, 2, 2]




0.778074074074074
Iteration No: 31 ended. Evaluation done at random point.
Time taken: 3.1339
Function value obtained: -0.4239
Current minimum: -0.5218
Iteration No: 32 started. Evaluating function at random point.
[0.024198608614538684, 9, 4, 0.7265702602801591, 0.2753866331219601, 376, 4, 3]




0.7755061728395062
Iteration No: 32 ended. Evaluation done at random point.
Time taken: 0.8943
Function value obtained: -0.4528
Current minimum: -0.5218
Iteration No: 33 started. Evaluating function at random point.
[0.02066454201865008, 1, 16, 0.13899102943668784, 0.39933001290646186, 474, 5, 3]




0.8149135802469136
Iteration No: 33 ended. Evaluation done at random point.
Time taken: 0.4907
Function value obtained: -0.5139
Current minimum: -0.5218
Iteration No: 34 started. Evaluating function at random point.
[0.06082653960640123, 2, 1, 0.5980586903950412, 0.5178918091279208, 552, 4, 5]




0.768888888888889
Iteration No: 34 ended. Evaluation done at random point.
Time taken: 0.5250
Function value obtained: -0.4902
Current minimum: -0.5218
Iteration No: 35 started. Evaluating function at random point.
[0.009962922677707823, 4, 3, 0.7801263135909936, 0.09142358316671206, 271, 4, 5]




0.8095802469135803
Iteration No: 35 ended. Evaluation done at random point.
Time taken: 0.4112
Function value obtained: -0.4982
Current minimum: -0.5218
Iteration No: 36 started. Evaluating function at random point.
[0.0036234367402567624, 6, 3, 0.7603328352025434, 0.9554182546727557, 448, 1, 5]




0.7731851851851852
Iteration No: 36 ended. Evaluation done at random point.
Time taken: 1.6677
Function value obtained: -0.4525
Current minimum: -0.5218
Iteration No: 37 started. Evaluating function at random point.
[0.013950344686768644, 1, 20, 0.3297264474661064, 0.415842842171829, 260, 3, 2]
0.7979259259259259
Iteration No: 37 ended. Evaluation done at random point.
Time taken: 0.1571
Function value obtained: -0.4857
Current minimum: -0.5218
Iteration No: 38 started. Evaluating function at random point.
[0.012989801809330636, 6, 3, 0.5915581115848968, 0.14231766908223012, 387, 2, 3]




0.7997037037037038
Iteration No: 38 ended. Evaluation done at random point.
Time taken: 0.4959
Function value obtained: -0.4875
Current minimum: -0.5218
Iteration No: 39 started. Evaluating function at random point.
[0.0038956611726385913, 10, 17, 0.8562363051784408, 0.17979026486992739, 345, 5, 2]




0.8118518518518518
Iteration No: 39 ended. Evaluation done at random point.
Time taken: 0.4262
Function value obtained: -0.5127
Current minimum: -0.5218
Iteration No: 40 started. Evaluating function at random point.
[0.04156447611486069, 3, 18, 0.8737187026510986, 0.9175785249286479, 605, 1, 4]




0.8041975308641975
Iteration No: 40 ended. Evaluation done at random point.
Time taken: 0.4997
Function value obtained: -0.4754
Current minimum: -0.5218
Iteration No: 41 started. Evaluating function at random point.
[0.039499387197359, 1, 8, 0.7168685333948183, 0.8060030359642961, 982, 4, 2]




0.7893333333333333
Iteration No: 41 ended. Evaluation done at random point.
Time taken: 0.3091
Function value obtained: -0.5017
Current minimum: -0.5218
Iteration No: 42 started. Evaluating function at random point.
[0.0015415722348909036, 7, 19, 0.4923181172258372, 0.5655124029721978, 224, 2, 2]




0.8164938271604938
Iteration No: 42 ended. Evaluation done at random point.
Time taken: 0.2558
Function value obtained: -0.5032
Current minimum: -0.5218
Iteration No: 43 started. Evaluating function at random point.
[0.0011508017135300928, 4, 10, 0.8314705326266755, 0.3921811093406998, 543, 5, 1]




0.8048395061728395
Iteration No: 43 ended. Evaluation done at random point.
Time taken: 0.3161
Function value obtained: -0.4835
Current minimum: -0.5218
Iteration No: 44 started. Evaluating function at random point.
[0.04526244473698836, 5, 7, 0.13108009174407959, 0.09909763511017732, 983, 3, 4]




0.7718518518518519
Iteration No: 44 ended. Evaluation done at random point.
Time taken: 0.5582
Function value obtained: -0.4085
Current minimum: -0.5218
Iteration No: 45 started. Evaluating function at random point.
[0.0018428536387181335, 2, 10, 0.35680864929418366, 0.8054268850302686, 930, 3, 4]




0.8080987654320988
Iteration No: 45 ended. Evaluation done at random point.
Time taken: 0.8558
Function value obtained: -0.4724
Current minimum: -0.5218
Iteration No: 46 started. Evaluating function at random point.
[0.030767009526237375, 10, 5, 0.22511699061480156, 0.1268293182863378, 226, 1, 4]




0.7811358024691358
Iteration No: 46 ended. Evaluation done at random point.
Time taken: 1.3846
Function value obtained: -0.4415
Current minimum: -0.5218
Iteration No: 47 started. Evaluating function at random point.
[0.006575153390127364, 10, 1, 0.47023461824454926, 0.2777979911995331, 217, 1, 5]




0.7902222222222222
Iteration No: 47 ended. Evaluation done at random point.
Time taken: 11.3519
Function value obtained: -0.4564
Current minimum: -0.5218
Iteration No: 48 started. Evaluating function at random point.
[0.0036296754488993584, 7, 5, 0.6311347089226347, 0.4486200593064004, 175, 2, 3]




0.8154074074074075
Iteration No: 48 ended. Evaluation done at random point.
Time taken: 0.4200
Function value obtained: -0.4631
Current minimum: -0.5218
Iteration No: 49 started. Evaluating function at random point.
[0.03278628584511124, 9, 14, 0.1372091125227698, 0.1394491388551321, 193, 4, 5]




0.7906172839506174
Iteration No: 49 ended. Evaluation done at random point.
Time taken: 0.3296
Function value obtained: -0.4621
Current minimum: -0.5218
Iteration No: 50 started. Evaluating function at random point.
[0.008878664758716853, 10, 16, 0.8165673281982406, 0.3789891050349308, 353, 5, 2]




0.8081975308641975
Iteration No: 50 ended. Evaluation done at random point.
Time taken: 0.9049
Function value obtained: -0.5157
Current minimum: -0.5218
Iteration No: 51 started. Searching for the next optimal point.
[0.013638979867247535, 10, 4, 0.3379215984919367, 0.8179705267534322, 341, 5, 2]




0.7836049382716049
Iteration No: 51 ended. Search finished for the next optimal point.
Time taken: 1.3816
Function value obtained: -0.4941
Current minimum: -0.5218
Iteration No: 52 started. Searching for the next optimal point.
[0.03327046211206998, 6, 5, 0.10908177055128404, 0.7313263739010277, 320, 5, 1]




0.7003456790123457
Iteration No: 52 ended. Search finished for the next optimal point.
Time taken: 0.6457
Function value obtained: -0.3870
Current minimum: -0.5218
Iteration No: 53 started. Searching for the next optimal point.
[0.007132867548339458, 7, 7, 0.23183391217479438, 0.8350987498577197, 222, 5, 5]




0.7933827160493827
Iteration No: 53 ended. Search finished for the next optimal point.
Time taken: 0.7062
Function value obtained: -0.4560
Current minimum: -0.5218
Iteration No: 54 started. Searching for the next optimal point.
[0.0115012312441129, 10, 20, 0.5673415105446846, 0.5561334436060434, 811, 5, 2]




0.778074074074074
Iteration No: 54 ended. Search finished for the next optimal point.
Time taken: 1.1256
Function value obtained: -0.5061
Current minimum: -0.5218
Iteration No: 55 started. Searching for the next optimal point.
[0.020989796412759273, 10, 17, 0.5222203117406746, 0.8334342450464721, 597, 5, 2]




0.7723456790123457
Iteration No: 55 ended. Search finished for the next optimal point.
Time taken: 1.2340
Function value obtained: -0.4824
Current minimum: -0.5218
Iteration No: 56 started. Searching for the next optimal point.
[0.008483966234674387, 8, 13, 0.6857593345988641, 0.6038401687867051, 730, 5, 1]




0.7371851851851852
Iteration No: 56 ended. Search finished for the next optimal point.
Time taken: 1.1671
Function value obtained: -0.4491
Current minimum: -0.5218
Iteration No: 57 started. Searching for the next optimal point.
[0.029117674397454666, 10, 18, 0.3178388926493071, 0.6817940726226185, 481, 5, 2]




0.7792592592592592
Iteration No: 57 ended. Search finished for the next optimal point.
Time taken: 0.8593
Function value obtained: -0.5032
Current minimum: -0.5218
Iteration No: 58 started. Searching for the next optimal point.
[0.05305420938831092, 10, 3, 0.7808816161450246, 0.5474814795000535, 377, 5, 5]




0.7720493827160493
Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 2.3291
Function value obtained: -0.4542
Current minimum: -0.5218
Iteration No: 59 started. Searching for the next optimal point.
[0.0025264860031708773, 10, 14, 0.9196148075721063, 0.060403727948557334, 882, 5, 5]




0.8125432098765433
Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 1.3384
Function value obtained: -0.5184
Current minimum: -0.5218
Iteration No: 60 started. Searching for the next optimal point.
[0.026348442511975805, 10, 20, 0.9847727715341962, 0.20115423959918088, 965, 5, 4]




0.7537777777777778
Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 1.4435
Function value obtained: -0.4452
Current minimum: -0.5218
Iteration No: 61 started. Searching for the next optimal point.
[0.0021985814745350105, 10, 4, 0.20686308553500166, 0.23385836977667113, 154, 5, 2]




0.7939753086419753
Iteration No: 61 ended. Search finished for the next optimal point.
Time taken: 0.6541
Function value obtained: -0.5091
Current minimum: -0.5218
Iteration No: 62 started. Searching for the next optimal point.
[0.006362850377295692, 10, 3, 0.1053371880061324, 0.3622055684583405, 681, 5, 5]




0.8072098765432099
Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 1.6201
Function value obtained: -0.5384
Current minimum: -0.5384
Iteration No: 63 started. Searching for the next optimal point.
[0.0016725845140214777, 10, 3, 0.9889941485834416, 0.48285525363759024, 990, 5, 5]




0.770962962962963
Iteration No: 63 ended. Search finished for the next optimal point.
Time taken: 4.7203
Function value obtained: -0.4843
Current minimum: -0.5384
Iteration No: 64 started. Searching for the next optimal point.
[0.0050661720343894556, 9, 6, 0.18631641341650396, 0.05650259156907794, 575, 5, 4]




0.8253827160493827
Iteration No: 64 ended. Search finished for the next optimal point.
Time taken: 0.8388
Function value obtained: -0.5702
Current minimum: -0.5702
Iteration No: 65 started. Searching for the next optimal point.
[0.004486647021133098, 8, 15, 0.16316364287880078, 0.34593004587557835, 835, 5, 5]




0.803358024691358
Iteration No: 65 ended. Search finished for the next optimal point.
Time taken: 0.8598
Function value obtained: -0.5118
Current minimum: -0.5702
Iteration No: 66 started. Searching for the next optimal point.
[0.0024259961332708043, 8, 1, 0.12816799351949487, 0.14681589695300457, 226, 5, 4]




0.8352592592592593
Iteration No: 66 ended. Search finished for the next optimal point.
Time taken: 0.7324
Function value obtained: -0.5446
Current minimum: -0.5702
Iteration No: 67 started. Searching for the next optimal point.
[0.0167081455196047, 7, 3, 0.051153674018471464, 0.06303069402810776, 171, 5, 4]
0.818962962962963




Iteration No: 67 ended. Search finished for the next optimal point.
Time taken: 0.4949
Function value obtained: -0.4620
Current minimum: -0.5702
Iteration No: 68 started. Searching for the next optimal point.
[0.007427913295910496, 10, 5, 0.18231526329201303, 0.13537572981064047, 134, 5, 3]




0.8130370370370371
Iteration No: 68 ended. Search finished for the next optimal point.
Time taken: 0.6028
Function value obtained: -0.4814
Current minimum: -0.5702
Iteration No: 69 started. Searching for the next optimal point.
[0.002254977483392133, 9, 1, 0.16734380060524479, 0.1385832379992905, 319, 5, 5]




0.8263703703703704
Iteration No: 69 ended. Search finished for the next optimal point.
Time taken: 1.0284
Function value obtained: -0.5473
Current minimum: -0.5702
Iteration No: 70 started. Searching for the next optimal point.
[0.003755228602264735, 9, 4, 0.23895062052462207, 0.08551909178402803, 542, 5, 4]




0.8224197530864198
Iteration No: 70 ended. Search finished for the next optimal point.
Time taken: 0.8682
Function value obtained: -0.5418
Current minimum: -0.5702
Iteration No: 71 started. Searching for the next optimal point.
[0.0052851518800625455, 9, 9, 0.26037633964886675, 0.08812280990416547, 725, 5, 1]




0.7716543209876543
Iteration No: 71 ended. Search finished for the next optimal point.
Time taken: 0.6655
Function value obtained: -0.4970
Current minimum: -0.5702
Iteration No: 72 started. Searching for the next optimal point.
[0.0013919702648753326, 7, 3, 0.18973557628464272, 0.09667890817533148, 641, 5, 4]




0.8164938271604938
Iteration No: 72 ended. Search finished for the next optimal point.
Time taken: 0.8656
Function value obtained: -0.5164
Current minimum: -0.5702
Iteration No: 73 started. Searching for the next optimal point.
[0.006327991778360332, 9, 2, 0.29958069312164853, 0.05447259418235459, 194, 5, 4]




0.8260740740740741
Iteration No: 73 ended. Search finished for the next optimal point.
Time taken: 0.6733
Function value obtained: -0.5512
Current minimum: -0.5702
Iteration No: 74 started. Searching for the next optimal point.
[0.007457630914876315, 7, 5, 0.3697970957904218, 0.052825519874089254, 277, 5, 4]




0.8232098765432098
Iteration No: 74 ended. Search finished for the next optimal point.
Time taken: 0.6018
Function value obtained: -0.5224
Current minimum: -0.5702
Iteration No: 75 started. Searching for the next optimal point.
[0.009244970318559191, 9, 7, 0.5675826015697814, 0.10903735203292259, 103, 5, 4]




0.8239012345679012
Iteration No: 75 ended. Search finished for the next optimal point.
Time taken: 0.5936
Function value obtained: -0.5227
Current minimum: -0.5702


In [18]:
lr, max_depth, min_child_samples, subsample, colsample_bytree, n_estimators, min_df, ngram_range = res.x

In [19]:
ngram_range = (1, ngram_range)
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                     min_child_samples=min_child_samples, subsample=subsample,
                     colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                     class_weight='balanced', n_jobs=8)
lgbm.fit(X_train_title, y_train)

LGBMClassifier(bagging_freq=1, boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.05650259156907794, importance_type='split',
               learning_rate=0.0050661720343894556, max_depth=9,
               min_child_samples=6, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=575, n_jobs=8, num_leaves=512, objective=None,
               random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.18631641341650396, subsample_for_bin=200000,
               subsample_freq=0)

In [20]:
lgbm_train_proba = lgbm.predict_proba(X_train_title)
lgbm_train_preds = lgbm.predict(X_train_title)
lgbm_val_proba = lgbm.predict_proba(X_val_title)
lgbm_val_preds = lgbm.predict(X_val_title)



In [21]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, lgbm_train_preds))
print('avg_precision_score: ', average_precision_score(y_train, lgbm_train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, lgbm_train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, lgbm_val_preds))
print('avg_precision_score: ', average_precision_score(y_val, lgbm_val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, lgbm_val_proba[:, 1]))

TRAIN METRICS:
log_loss:  8.450496407118516
avg_precision_score:  0.7549747845229962
roc_auc:  0.8349865440796509

VALIDATION METRICS:
log_loss:  6.907864853444793
avg_precision_score:  0.5701757155143178
roc_auc:  0.8253827160493827


In [27]:
pd.DataFrame({"RF": rf_val_proba[:, 1], "LGBM": lgbm_val_proba[:, 1]}).corr()

Unnamed: 0,RF,LGBM
RF,1.0,0.762598
LGBM,0.762598,1.0


In [28]:
rf_weights = np.linspace(0,1,20)

for rf_weight in rf_weights:
    p = rf_weight*rf_val_proba[:, 1] + (1-rf_weight)*lgbm_val_proba[:, 1]
    print(rf_weight)
    print(average_precision_score(y_val, p), roc_auc_score(y_val, p))

0.0
0.5701757155143178 0.8253827160493827
0.05263157894736842
0.5712651161736446 0.8258765432098766
0.10526315789473684
0.5720701158167869 0.8266666666666667
0.15789473684210525
0.5695877907703422 0.8245925925925925
0.21052631578947367
0.5674371127484588 0.8241975308641976
0.2631578947368421
0.563471606640561 0.821037037037037
0.3157894736842105
0.5583955646950767 0.8194567901234567
0.3684210526315789
0.555255890111646 0.8165925925925926
0.42105263157894735
0.544290751611937 0.8126419753086421
0.47368421052631576
0.5249673089923815 0.8099753086419753
0.5263157894736842
0.5253522580292298 0.8074074074074074
0.5789473684210527
0.5182671358773665 0.8044444444444444
0.631578947368421
0.5105012628605115 0.8011851851851852
0.6842105263157894
0.5069279113810763 0.8000987654320988
0.7368421052631579
0.4953668899026871 0.7982222222222222
0.7894736842105263
0.49312970293514014 0.7969382716049384
0.8421052631578947
0.4876962913585326 0.7958518518518519
0.894736842105263
0.48644027299376 0.7947654

In [None]:
p = 0.26*rf_val_proba[:, 1] + 0.74*lgbm_val_proba[:, 1]
print(average_precision_score(y_val, p), roc_auc_score(y_val, p))

In [None]:
import joblib as jb
jb.dump(lgbm, "../pkls/lgbm_20200324.pkl.z")
jb.dump(rfc, "../pkls/rf_20200324.pkl.z")
jb.dump(title_vec, "../pkls/titlebow_20200324.pkl.z")