In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from skopt import forest_minimize
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [2]:
df = pd.read_csv('../data/raw_data_labeled.csv')

df = df[df['y'].notnull()]

In [3]:
df.isnull().sum()

watch-title                      0
y                                0
watch-view-count                 7
watch-time-text                  0
watch-extras-section             0
content_watch-info-tag-list      0
og:image                         0
og:image:width                   0
og:image:height                  0
og:description                   0
og:video:width                   4
og:video:height                  4
og:video:tag                   185
channel_link_0                   2
dtype: int64

In [4]:
# Clean date attribute
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date[0] = clean_date[0].dropna().astype(str)
clean_date[2] = clean_date[2].dropna().astype(str)

month_map = {"jan": "Jan",
             "fev": "Feb",
             "mar": "Mar",
             "abr": "Apr",
             "mai": "May",
             "jun": "Jun",
             "jul": "Jul",
             "ago": "Aug",
             "set": "Sep",
             "out": "Oct",
             "nov": "Nov",
             "dez": "Dec"}

clean_date[1] = clean_date[1].map(month_map)
clean_date = clean_date.dropna().apply(lambda x: " ".join(x), axis=1)
clean_date = pd.to_datetime(clean_date, format="%d %b %Y")

# Clean view number
views = df['watch-view-count'].str.extract(r"(\d+\.?\d*)", expand=False)
views = views.str.replace(".", "").fillna(0).astype(int)

In [5]:
features = pd.DataFrame()
y = df['y'].copy()

In [6]:
features['time_since_pub'] = (pd.to_datetime("2020-03-24") -  # HARDCODED
                              clean_date) / np.timedelta64(1, 'D')

# Extracting n of view feature
features['views'] = views

# Extracting n of view/day feature
features['views_per_day'] = features['views'] / features['time_since_pub']

# Droping time_since_pub to prevent bias
features = features.drop(['time_since_pub'], axis=1)

# Dropping problematic features
y = y[features.index]
df = df.loc[features.index]

In [7]:
resolutions = []
for height, width in zip(df['og:video:height'], df['og:video:width']):
    try:
        height = float(height)
        width = float(width)
    except:
        resolutions.append(np.nan)
        continue
        
    resolutions.append(height*width)
    
features['resolution'] = resolutions

In [8]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1039 entries, 0 to 1121
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   views          1039 non-null   int64  
 1   views_per_day  1039 non-null   float64
 2   resolution     1033 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 32.5 KB


In [9]:
# Around 75% train and 25% to validation
split_date = '2020-02-27'
mask_train = (clean_date < split_date) & (clean_date.notnull())
mask_val = (clean_date >= split_date) & (clean_date.notnull())

X_train, X_val = features[mask_train.values], features[mask_val.values]
y_train, y_val = y[mask_train.values], y[mask_val.values]
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((786, 3), (253, 3), (786,), (253,))

In [10]:
# Filling NaNs
X_train['resolution'] = X_train['resolution'].fillna(X_train['resolution'].mean())
X_val['resolution'] = X_val['resolution'].fillna(X_train['resolution'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [33]:
# Extracting features from title
train_titles = df[mask_train]['watch-title']
val_titles = df[mask_val]['watch-title']

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

# Concat the BoW into features df
X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

# Random Forest

In [34]:
# Random Forest Model
rfc = RandomForestClassifier(n_estimators=200, random_state=42,
                             class_weight="balanced", n_jobs=8)
rfc.fit(X_train_title, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=8,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [35]:
# Predicting
rf_train_proba = rfc.predict_proba(X_train_title)
rf_train_preds = rfc.predict(X_train_title)
rf_val_proba = rfc.predict_proba(X_val_title)
rf_val_preds = rfc.predict(X_val_title)

In [36]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, rf_train_proba))
print('avg_precision_score: ', average_precision_score(y_train, rf_train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, rf_train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, rf_val_preds))
print('avg_precision_score: ', average_precision_score(y_val, rf_val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, rf_val_proba[:, 1]))

TRAIN METRICS:
log_loss:  0.0772328446547738
avg_precision_score:  1.0
roc_auc:  1.0

VALIDATION METRICS:
log_loss:  18.29326812851961
avg_precision_score:  0.7679803376156514
roc_auc:  0.7865608930139221


# LGBM

In [16]:
lgbm = LGBMClassifier(random_state=42, class_weight="balanced", n_jobs=7)
lgbm.fit(X_train_title, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=7, num_leaves=31,
               objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [17]:
# Predicting
lgbm_train_proba = lgbm.predict_proba(X_train_title)
lgbm_train_preds = lgbm.predict(X_train_title)
lgbm_val_proba = lgbm.predict_proba(X_val_title)
lgbm_val_preds = lgbm.predict(X_val_title)



In [18]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, lgbm_train_preds))
print('avg_precision_score: ', average_precision_score(y_train, lgbm_train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, lgbm_train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, lgbm_val_preds))
print('avg_precision_score: ', average_precision_score(y_val, lgbm_val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, lgbm_val_proba[:, 1]))

TRAIN METRICS:
log_loss:  0.13183044271885933
avg_precision_score:  1.0
roc_auc:  1.0

VALIDATION METRICS:
log_loss:  15.426438454552486
avg_precision_score:  0.6905704076271251
roc_auc:  0.7080772607550483


In [19]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    min_df = params[6]
    ngram_range = (1, params[7])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(train_titles)
    title_bow_val = title_vec.transform(val_titles)
    
    X_train_title = hstack([X_train, title_bow_train])
    X_val_title = hstack([X_val, title_bow_val])

    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                         class_weight="balanced", n_jobs=8)
    mdl.fit(X_train_title, y_train)
    
    p = mdl.predict_proba(X_val_title)[:, 1]
    
    print(roc_auc_score(y_val, p))
    
    return -average_precision_score(y_val, p)


space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 10), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df
          (1,5)] # ngram_range

res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]
0.7431957857769973
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.1790
Function value obtained: -0.7101
Current minimum: -0.7101
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]




0.7345729336510723
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1564
Function value obtained: -0.7114
Current minimum: -0.7114
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]




0.7267653329988711
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2621
Function value obtained: -0.7000
Current minimum: -0.7114
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.7247585601404741
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.8613
Function value obtained: -0.7060
Current minimum: -0.7114
Iteration No: 5 started. Evaluating function at random point.
[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]
0.730339897152891
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1433
Function value obtained: -0.7181
Current minimum: -0.7181
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 10, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.7356076759061834
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 0.6686
Function value obtained: -0.6988
Current minimum: -0.7181
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.7783143107989464
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.7331
Function value obtained: -0.7644
Current minimum: -0.7644
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 3, 12, 0.44234694306528044, 0.399351303640462, 272, 3, 5]
0.6922112128433463
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 0.1350
Function value obtained: -0.6451
Current minimum: -0.7644
Iteration No: 9 started. Evaluating function at random point.
[0.01946212855369041, 9, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]




0.7476483130565659
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.9027
Function value obtained: -0.7240
Current minimum: -0.7644
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.6507588109870813
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.3961
Function value obtained: -0.6058
Current minimum: -0.7644
Iteration No: 11 started. Evaluating function at random point.
[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]




0.7531669384171579
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.2405
Function value obtained: -0.7117
Current minimum: -0.7644
Iteration No: 12 started. Evaluating function at random point.
[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]




0.711087420042644
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.2668
Function value obtained: -0.6673
Current minimum: -0.7644
Iteration No: 13 started. Evaluating function at random point.
[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]
0.7289602408127429
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.1539
Function value obtained: -0.6990
Current minimum: -0.7644
Iteration No: 14 started. Evaluating function at random point.
[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]




0.7578076006522012
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.0957
Function value obtained: -0.7501
Current minimum: -0.7644
Iteration No: 15 started. Evaluating function at random point.
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.7267653329988712
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.4677
Function value obtained: -0.7006
Current minimum: -0.7644
Iteration No: 16 started. Evaluating function at random point.
[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]




0.7419415527404992
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.3190
Function value obtained: -0.7085
Current minimum: -0.7644
Iteration No: 17 started. Evaluating function at random point.
[0.0010383234748454694, 9, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]




0.7072306534554118
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 0.3789
Function value obtained: -0.6939
Current minimum: -0.7644
Iteration No: 18 started. Evaluating function at random point.
[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]
0.7064467578076007
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.1430
Function value obtained: -0.6703
Current minimum: -0.7644
Iteration No: 19 started. Evaluating function at random point.
[0.0699516121742407, 9, 10, 0.6477856515609233, 0.8594430701440198, 616, 1, 1]




0.7189263765207575
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.4176
Function value obtained: -0.7269
Current minimum: -0.7644
Iteration No: 20 started. Evaluating function at random point.
[0.0014752743467850462, 5, 4, 0.9747950537021096, 0.982207187458162, 909, 2, 4]




0.7090806471842468
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 1.0128
Function value obtained: -0.6761
Current minimum: -0.7644
Iteration No: 21 started. Searching for the next optimal point.
[0.008805425871468512, 2, 11, 0.15568718222117628, 0.13443133827274129, 356, 1, 1]
0.7201806095572557




Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.3054
Function value obtained: -0.6678
Current minimum: -0.7644
Iteration No: 22 started. Searching for the next optimal point.
[0.015993460495540318, 4, 11, 0.11886574909673835, 0.380101213702967, 313, 1, 2]




0.7830803963376396
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.5436
Function value obtained: -0.7470
Current minimum: -0.7644
Iteration No: 23 started. Searching for the next optimal point.
[0.012236510385762744, 2, 6, 0.3133640050209447, 0.40365929776754006, 303, 1, 2]
0.7827668380785149




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.4213
Function value obtained: -0.7667
Current minimum: -0.7667
Iteration No: 24 started. Searching for the next optimal point.
[0.014104262557974579, 3, 1, 0.21085954274578095, 0.35596326760885144, 998, 1, 2]




0.7821397215602659
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.8123
Function value obtained: -0.7837
Current minimum: -0.7837
Iteration No: 25 started. Searching for the next optimal point.
[0.009372820666229503, 1, 4, 0.2653543484849527, 0.3644825834262291, 187, 1, 2]
0.7219051799824407




Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 0.3188
Function value obtained: -0.7062
Current minimum: -0.7837
Iteration No: 26 started. Searching for the next optimal point.
[0.041011321923526405, 3, 1, 0.09377729128111109, 0.14481265134677324, 891, 1, 2]




0.7500940674777373
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.5686
Function value obtained: -0.7430
Current minimum: -0.7837
Iteration No: 27 started. Searching for the next optimal point.
[0.017496037150178285, 7, 1, 0.19784468993494003, 0.4566493375034152, 813, 1, 5]




0.7807600652201179
Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 5.7221
Function value obtained: -0.7722
Current minimum: -0.7837
Iteration No: 28 started. Searching for the next optimal point.
[0.03676086102612071, 2, 1, 0.1371992651017364, 0.1690641640866839, 440, 1, 5]




0.7814498933901918
Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.7849
Function value obtained: -0.7750
Current minimum: -0.7837
Iteration No: 29 started. Searching for the next optimal point.
[0.010701010329000423, 3, 1, 0.11846601216658804, 0.33027482992752977, 843, 1, 5]




0.7911074877712279
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 1.3467
Function value obtained: -0.7819
Current minimum: -0.7837
Iteration No: 30 started. Searching for the next optimal point.
[0.0033851266778424437, 5, 1, 0.08767940409749594, 0.6583975263738645, 898, 1, 5]




0.7786278690580709
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 3.5295
Function value obtained: -0.7587
Current minimum: -0.7837
Iteration No: 31 started. Searching for the next optimal point.
[0.0025959048263979054, 4, 1, 0.5961879019065093, 0.2765789164048932, 972, 1, 5]




0.7928634140223254
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 2.8414
Function value obtained: -0.7905
Current minimum: -0.7905
Iteration No: 32 started. Searching for the next optimal point.
[0.0011588605144421265, 8, 1, 0.5166562694949383, 0.9626699097071043, 901, 1, 5]




0.7781261758434718
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 16.2763
Function value obtained: -0.7619
Current minimum: -0.7905
Iteration No: 33 started. Searching for the next optimal point.
[0.00234269797634117, 2, 1, 0.9974316859505784, 0.34971496727069207, 516, 1, 5]




0.7535118525021948
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 2.1800
Function value obtained: -0.7346
Current minimum: -0.7905
Iteration No: 34 started. Searching for the next optimal point.
[0.001702216955425863, 2, 2, 0.8907848370369225, 0.07959805147222872, 905, 1, 4]




0.7412203687445127
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.5649
Function value obtained: -0.7359
Current minimum: -0.7905
Iteration No: 35 started. Searching for the next optimal point.
[0.0010026768123704466, 4, 1, 0.7060335714621119, 0.31621464449039605, 645, 1, 5]




0.7571804841339521
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 3.1676
Function value obtained: -0.7487
Current minimum: -0.7905
Iteration No: 36 started. Searching for the next optimal point.
[0.0010718812590120985, 5, 2, 0.4168604336399415, 0.7650272620581396, 797, 1, 5]




0.7811363351310674
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 1.0822
Function value obtained: -0.7574
Current minimum: -0.7905
Iteration No: 37 started. Searching for the next optimal point.
[0.0019688778585705913, 3, 2, 0.2784599553222719, 0.08918816627419876, 940, 1, 5]




0.7778753292361721
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.6477
Function value obtained: -0.7816
Current minimum: -0.7905
Iteration No: 38 started. Searching for the next optimal point.
[0.07903121890945496, 1, 1, 0.1502557352049193, 0.2771246228321132, 927, 2, 5]




0.7705067101467451
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.4761
Function value obtained: -0.7583
Current minimum: -0.7905
Iteration No: 39 started. Searching for the next optimal point.
[0.0020707848804744364, 4, 2, 0.2174949036604109, 0.9545423005606591, 658, 1, 5]




0.7864041138843597
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.8856
Function value obtained: -0.7586
Current minimum: -0.7905
Iteration No: 40 started. Searching for the next optimal point.
[0.021207980975448733, 1, 1, 0.07725019950554887, 0.394660288490393, 992, 2, 5]




0.7399347798821021
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.5068
Function value obtained: -0.7181
Current minimum: -0.7905
Iteration No: 41 started. Searching for the next optimal point.
[0.0010688352891869697, 1, 2, 0.2281963261338918, 0.2028501712830087, 401, 1, 5]
0.6822400602031856




Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.4466
Function value obtained: -0.6232
Current minimum: -0.7905
Iteration No: 42 started. Searching for the next optimal point.
[0.0015411870889187908, 1, 1, 0.2743521711201572, 0.905806648158753, 977, 1, 5]




0.7295873573309921
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 1.1057
Function value obtained: -0.7050
Current minimum: -0.7905
Iteration No: 43 started. Searching for the next optimal point.
[0.003616895340961819, 5, 4, 0.36675251589191943, 0.22345640956152302, 949, 1, 4]




0.7894142731719554
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.7202
Function value obtained: -0.7783
Current minimum: -0.7905
Iteration No: 44 started. Searching for the next optimal point.
[0.008803495707499095, 1, 2, 0.4993455330148246, 0.2550917631464051, 932, 1, 2]
0.7805092186128183




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.4595
Function value obtained: -0.7649
Current minimum: -0.7905
Iteration No: 45 started. Searching for the next optimal point.
[0.001042898297950985, 4, 1, 0.24066368374010344, 0.27983952251916183, 723, 1, 1]




0.698607801329487
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.5935
Function value obtained: -0.6338
Current minimum: -0.7905
Iteration No: 46 started. Searching for the next optimal point.
[0.0013143957514453646, 8, 2, 0.29936293746139114, 0.0588174239044972, 927, 1, 5]




0.7888498683055312
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 1.0293
Function value obtained: -0.7861
Current minimum: -0.7905
Iteration No: 47 started. Searching for the next optimal point.
[0.014591029403585629, 5, 2, 0.17376204850346286, 0.20831080346578928, 955, 1, 1]




0.7209958610309795
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.5809
Function value obtained: -0.6984
Current minimum: -0.7905
Iteration No: 48 started. Searching for the next optimal point.
[0.0012080104824077195, 5, 20, 0.3471491660400161, 0.2715484723487133, 947, 1, 5]




0.714536560893014
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.6253
Function value obtained: -0.6766
Current minimum: -0.7905
Iteration No: 49 started. Searching for the next optimal point.
[0.0013558332246312638, 10, 2, 0.667565460226446, 0.2794917885514963, 650, 1, 5]




0.8036498181362097
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 2.0457
Function value obtained: -0.7954
Current minimum: -0.7954
Iteration No: 50 started. Searching for the next optimal point.
[0.00135582937161669, 9, 2, 0.48544346796143395, 0.3837558448803829, 670, 1, 4]




0.7943684936661232
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 1.8212
Function value obtained: -0.7948
Current minimum: -0.7954


In [20]:
lr, max_depth, min_child_samples, subsample, colsample_bytree, n_estimators, min_df, ngram_range = res.x

In [25]:
ngram_range = (1, ngram_range)
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                     min_child_samples=min_child_samples, subsample=subsample,
                     colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                     class_weight='balanced', n_jobs=8)
lgbm.fit(X_train_title, y_train)

LGBMClassifier(bagging_freq=1, boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.2794917885514963, importance_type='split',
               learning_rate=0.0013558332246312638, max_depth=10,
               min_child_samples=2, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=650, n_jobs=8, num_leaves=1024, objective=None,
               random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.667565460226446, subsample_for_bin=200000,
               subsample_freq=0)

In [26]:
lgbm_train_proba = lgbm.predict_proba(X_train_title)
lgbm_train_preds = lgbm.predict(X_train_title)
lgbm_val_proba = lgbm.predict_proba(X_val_title)
lgbm_val_preds = lgbm.predict(X_val_title)



In [27]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, lgbm_train_preds))
print('avg_precision_score: ', average_precision_score(y_train, lgbm_train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, lgbm_train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, lgbm_val_preds))
print('avg_precision_score: ', average_precision_score(y_val, lgbm_val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, lgbm_val_proba[:, 1]))

TRAIN METRICS:
log_loss:  3.383648029784031
avg_precision_score:  0.9689868191320634
roc_auc:  0.9944362017804154

VALIDATION METRICS:
log_loss:  12.559592978264604
avg_precision_score:  0.7953813802384352
roc_auc:  0.8036498181362097


In [28]:
pd.DataFrame({"RF": rf_val_proba[:, 1], "LGBM": lgbm_val_proba[:, 1]}).corr()

Unnamed: 0,RF,LGBM
RF,1.0,0.767823
LGBM,0.767823,1.0


In [38]:
rf_weights = np.linspace(0,1,20)

for rf_weight in rf_weights:
    p = rf_weight*rf_val_proba[:, 1] + (1-rf_weight)*lgbm_val_proba[:, 1]
    print(rf_weight)
    print(average_precision_score(y_val, p), roc_auc_score(y_val, p))

0.0
0.7953813802384352 0.8036498181362097
0.05263157894736842
0.7970241899631076 0.8045277812617585
0.10526315789473684
0.7973648162198794 0.8054684560391321
0.15789473684210525
0.7976410482567151 0.8065345541201555
0.21052631578947367
0.799715393580658 0.8081650570676032
0.2631578947368421
0.8002670490593915 0.8084786153267277
0.3157894736842105
0.798941363584448 0.8078514988084786
0.3684210526315789
0.7966577711713432 0.8066599774238054
0.42105263157894735
0.7947257044384437 0.8051548977800075
0.47368421052631576
0.7904753673042139 0.8031481249216105
0.5263157894736842
0.7861263179186391 0.8008277938040889
0.5789473684210527
0.7844234623381304 0.7998871190267152
0.631578947368421
0.782254007751841 0.7981939044274425
0.6842105263157894
0.780342561305207 0.7976922112128433
0.7368421052631579
0.7780369359080768 0.795936284961746
0.7894736842105263
0.776059983252292 0.7945566286215978
0.8421052631578947
0.7748604094222509 0.7932396839332747
0.894736842105263
0.7733560760526245 0.79091935

In [40]:
p = 0.26*rf_val_proba[:, 1] + 0.74*lgbm_val_proba[:, 1]
print(average_precision_score(y_val, p), roc_auc_score(y_val, p))

0.8001519316027363 0.8084786153267277


In [43]:
import joblib as jb
jb.dump(lgbm, "../pkls/lgbm_20200324.pkl.z")
jb.dump(rfc, "../pkls/rf_20200324.pkl.z")
jb.dump(title_vec, "../pkls/titlebow_20200324.pkl.z")

['../pkls/titlebow_20200324.pkl.z']