In [1]:
import pandas as pd
import numpy as np

# Show up to 131 columns when a wide frame is displayed.
# The fully-qualified option key is used on purpose: set_option treats its
# argument as a regex, and the short pattern 'max.columns' also matches
# 'styler.render.max_columns' on pandas >= 1.4, which raises OptionError
# ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 131)

In [2]:
# Load the labelled data, then keep only the rows that carry a label in `y`.
df = pd.read_csv('full_labels.csv')
df = df.dropna(subset=['y'])

In [3]:
# Fraction of rows that are exact duplicates of an earlier row (0.0 means none).
df.duplicated().mean()

0.0

In [4]:
# (rows, columns) of the frame once rows without a label have been dropped.
df.shape

(297, 4)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Parse the upload date, store view counts as integers and order the rows
# chronologically; later cells split train/validation by row position, so the
# sort order matters.
df = (
    df.assign(
        upload_date=pd.to_datetime(df['upload_date'], format='%Y-%m-%d'),
        view_count=df['view_count'].astype(int),
    )
    .sort_values('upload_date')
)

# Independent working copy used for feature building.
df_limpo = df.copy()
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 151 to 34
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        297 non-null    object        
 1   upload_date  297 non-null    datetime64[ns]
 2   view_count   297 non-null    int64         
 3   y            297 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 11.6+ KB


In [7]:
# Empty feature frame sharing the row index (and row order) of the cleaned data.
features = pd.DataFrame(index=df_limpo.index)

# Labels come from the same frame the features are built from, so the
# positional train/validation split below always pairs each feature row with
# its own label.
y = df_limpo['y'].copy()

In [8]:
# Fixed "as of" date used to measure how long each video has been published.
# NOTE(review): presumably the day the data was collected — confirm.
data_referencia = pd.to_datetime('2021-01-31')

# Age of every video in days, kept as a local Series instead of a temporary
# feature column that has to be dropped again.
dias_desde_pub = (data_referencia - df_limpo['upload_date']) / np.timedelta64(1, 'D')

# A video uploaded on (or after) the reference date has an age <= 0, which
# would turn views-per-day into inf or a negative rate; treat such videos as
# one day old.
dias_desde_pub = dias_desde_pub.clip(lower=1)

features['views'] = df_limpo['view_count']
features['views_por_dia'] = features['views'] / dias_desde_pub

In [9]:
# Positional hold-out split: the first 140 rows train, the remainder validate.
train_rows, val_rows = slice(None, 140), slice(140, None)

Xtrain, Xval = features.iloc[train_rows], features.iloc[val_rows]
ytrain, yval = y.iloc[train_rows], y.iloc[val_rows]

Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((140, 2), (157, 2), (140,), (157,))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles split at the same row position (140) as the numeric features.
title_train, title_val = df_limpo['title'].iloc[:140], df_limpo['title'].iloc[140:]

# TF-IDF over word 1- to 3-grams; the vocabulary is learned on the training
# titles only and then re-used to encode the validation titles.
title_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [11]:
# (number of training titles, size of the learned n-gram vocabulary).
title_bow_train.shape

(140, 2183)

In [12]:
from scipy.sparse import hstack

# Append the TF-IDF title columns to the right of the numeric feature columns;
# the result is a scipy sparse matrix.
Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))

In [13]:
# Sanity check: both matrices must have the same number of columns
# (numeric features + TF-IDF vocabulary).
Xtrain_wtitle.shape, Xval_wtitle.shape

((140, 2185), (157, 2185))

# Random Forest

In [14]:
# Baseline: 1000-tree random forest with class weights balanced against the
# label frequencies; the seed is fixed so the run is repeatable.
mdl = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=0,
    n_jobs=3,
)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=3,
                       random_state=0)

In [15]:
# Score the validation rows and keep only column 1 of predict_proba
# (presumably the probability of the positive class — confirm via mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [16]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [17]:
# Validation metrics for the random forest: average precision and ROC AUC.
avg_precision = average_precision_score(yval, p)
auc = roc_auc_score(yval, p)
print(f'avg={avg_precision}, auc={auc}')

avg=0.4452215654214563, auc=0.5782456140350878


# LightGBM

In [18]:
from lightgbm import LGBMClassifier

In [19]:
# Second baseline: LightGBM with its default hyper-parameters, balanced class
# weights and a fixed seed. Rebinds `mdl`, replacing the random forest.
mdl = LGBMClassifier(
    class_weight='balanced',
    random_state=0,
    n_jobs=3,
)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(class_weight='balanced', n_jobs=3, random_state=0)

In [20]:
# Score the validation rows with the LightGBM model; keep only column 1 of
# predict_proba (presumably the positive-class probability — confirm via mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]



In [21]:
# Validation metrics for the LightGBM baseline: average precision and ROC AUC.
avg_precision = average_precision_score(yval, p)
auc = roc_auc_score(yval, p)
print(f'avg={avg_precision}, auc={auc}')

avg=0.40335269182299466, auc=0.5562280701754385


# Bayesian Optimization

In [22]:
from skopt import forest_minimize

In [23]:
def tune_lgbm(params):
    """Objective for the hyper-parameter search: TF-IDF titles + LightGBM.

    Parameters
    ----------
    params : sequence
        One point of the search space, read positionally as: learning rate,
        max depth, ``min_child_samples``, ``subsample``, ``colsample_bytree``,
        ``n_estimators``, TF-IDF ``min_df`` and the upper bound ``n`` of the
        TF-IDF ``ngram_range=(1, n)``.

    Returns
    -------
    float
        Negative average precision on the validation rows; the sign is
        flipped because the optimizer minimizes its objective.

    Notes
    -----
    Reads the notebook-level ``title_train``, ``title_val``, ``Xtrain``,
    ``Xval``, ``ytrain`` and ``yval``. Prints the evaluated point and the
    validation ROC AUC as a side effect.
    """
    print(params)
    (lr, max_depth, min_child_samples, subsample,
     colsample_bytree, n_estimators, min_df, max_ngram) = params[:8]

    # Re-fit the title vectorizer for every candidate because min_df and the
    # n-gram range are part of the search space.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(title_train)
    bow_val = vectorizer.transform(title_val)

    X_fit = hstack([Xtrain, bow_train])
    X_eval = hstack([Xval, bow_val])

    mdl = LGBMClassifier(
        learning_rate=lr,
        # Largest leaf count a tree of this depth can have.
        num_leaves=2 ** max_depth,
        max_depth=max_depth,
        min_child_samples=min_child_samples,
        subsample=subsample,
        # Row subsampling is only applied when its frequency is > 0. The
        # scikit-learn wrapper's own name is used here; passing the core
        # name `bagging_freq` as an extra kwarg conflicts with the wrapper's
        # default `subsample_freq=0` and triggers a warning on every fit.
        subsample_freq=1,
        colsample_bytree=colsample_bytree,
        n_estimators=n_estimators,
        random_state=0,
        class_weight='balanced',
        n_jobs=3,
    )
    mdl.fit(X_fit, ytrain)

    p = mdl.predict_proba(X_eval)[:, 1]

    print(roc_auc_score(yval, p))

    return -average_precision_score(yval, p)

# Search space: one entry per positional element of `params` in tune_lgbm,
# in the same order the function reads them.
space = [
    (1e-3, 1e-1, 'log-uniform'), # lr: learning rate, sampled on a log scale
    (1, 15), # max_depth (tune_lgbm also derives num_leaves = 2 ** max_depth)
    (1, 20), # min_child_samples
    (0.05, 1.), # subsample
    (0.05, 1.), # colsample_bytree
    (100, 1000), # n_estimators
    (1, 5), # min_df of the title TfidfVectorizer
    (1, 5) # upper bound n of the TF-IDF ngram_range (1, n)
]

# Search: 20 random points first, then model-guided proposals, 50 calls total.
# NOTE(review): newer scikit-optimize releases appear to deprecate
# `n_random_starts` in favour of `n_initial_points` — confirm against the
# installed version before renaming.
res = forest_minimize(
    tune_lgbm,
    space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]




0.5378070175438597
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5296
Function value obtained: -0.4142
Current minimum: -0.4142
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]




0.5201754385964913
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.4221
Function value obtained: -0.3959
Current minimum: -0.4142
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]
0.5514035087719299
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1958
Function value obtained: -0.3934
Current minimum: -0.4142
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.5420175438596491
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.6624
Function value obtained: -0.3976
Current minimum: -0.4142
Iteration No: 5 started. Evaluating function at random point.
[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]




0.5754385964912281
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.2898
Function value obtained: -0.4285
Current minimum: -0.4285
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 13, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.5512280701754386
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 1.8822
Function value obtained: -0.3923
Current minimum: -0.4285
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.5392982456140352
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.4993
Function value obtained: -0.3876
Current minimum: -0.4285
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 14, 3, 0.8183084505971293, 0.7859673038076707, 189, 5, 3]




0.5468421052631579
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 1.2588
Function value obtained: -0.4157
Current minimum: -0.4285
Iteration No: 9 started. Evaluating function at random point.
[0.009565866803971352, 6, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]
0.5449999999999999
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.2235
Function value obtained: -0.4221
Current minimum: -0.4285
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.522280701754386
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.3656
Function value obtained: -0.3919
Current minimum: -0.4285
Iteration No: 11 started. Evaluating function at random point.
[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]
0.5114912280701754
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.1157
Function value obtained: -0.3849
Current minimum: -0.4285
Iteration No: 12 started. Evaluating function at random point.
[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]




0.5338596491228069
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.2036
Function value obtained: -0.3911
Current minimum: -0.4285
Iteration No: 13 started. Evaluating function at random point.
[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]
0.5112280701754386
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.1368
Function value obtained: -0.4121
Current minimum: -0.4285
Iteration No: 14 started. Evaluating function at random point.
[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]




0.5414035087719298
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.0937
Function value obtained: -0.3989
Current minimum: -0.4285
Iteration No: 15 started. Evaluating function at random point.
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.5113157894736842
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.3184
Function value obtained: -0.3631
Current minimum: -0.4285
Iteration No: 16 started. Evaluating function at random point.
[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]
0.4993859649122807
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.1116
Function value obtained: -0.3754
Current minimum: -0.4285
Iteration No: 17 started. Evaluating function at random point.
[0.0010383234748454694, 15, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]




0.5323684210526316
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 2.1645
Function value obtained: -0.3797
Current minimum: -0.4285
Iteration No: 18 started. Evaluating function at random point.
[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]
0.546578947368421
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.1158
Function value obtained: -0.4121
Current minimum: -0.4285
Iteration No: 19 started. Evaluating function at random point.
[0.0699516121742407, 11, 9, 0.22351444794819092, 0.9946871410890346, 947, 5, 1]




0.5012280701754386
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.4032
Function value obtained: -0.3812
Current minimum: -0.4285
Iteration No: 20 started. Evaluating function at random point.
[0.004955402904180171, 6, 1, 0.13021457554920057, 0.6158804906347372, 615, 3, 4]




0.5416666666666666
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 1.0105
Function value obtained: -0.3846
Current minimum: -0.4285
Iteration No: 21 started. Searching for the next optimal point.
[0.05195259350021447, 11, 20, 0.7078945360275744, 0.15060221518303796, 749, 4, 1]




0.5163157894736841
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.8017
Function value obtained: -0.3750
Current minimum: -0.4285
Iteration No: 22 started. Searching for the next optimal point.
[0.04528337896351853, 7, 18, 0.10341309161224294, 0.16695132655302086, 925, 3, 1]
0.5




Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.6742
Function value obtained: -0.3631
Current minimum: -0.4285
Iteration No: 23 started. Searching for the next optimal point.
[0.0562789232945079, 6, 18, 0.05968626595221926, 0.1771252964136254, 998, 4, 3]
0.5




Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.5746
Function value obtained: -0.3631
Current minimum: -0.4285
Iteration No: 24 started. Searching for the next optimal point.
[0.010962900927020083, 6, 19, 0.49769994947152246, 0.11202159692601399, 930, 4, 1]
0.5253508771929825




Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.7342
Function value obtained: -0.3899
Current minimum: -0.4285
Iteration No: 25 started. Searching for the next optimal point.
[0.05560352507779976, 15, 19, 0.19336126704259698, 0.1484505857435771, 957, 2, 3]




0.5389473684210526
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 2.7469
Function value obtained: -0.3857
Current minimum: -0.4285
Iteration No: 26 started. Searching for the next optimal point.
[0.05906522180447278, 7, 17, 0.26855923801299086, 0.2893216994982986, 775, 4, 1]
0.4485087719298245




Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.6477
Function value obtained: -0.3511
Current minimum: -0.4285
Iteration No: 27 started. Searching for the next optimal point.
[0.0694628857577433, 1, 19, 0.3803474290968138, 0.6668484325272054, 862, 4, 1]
0.4619298245614035




Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.6911
Function value obtained: -0.3553
Current minimum: -0.4285
Iteration No: 28 started. Searching for the next optimal point.
[0.010603151967481083, 5, 17, 0.46466562505330733, 0.3817140249132511, 523, 4, 5]
0.5432456140350878




Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 0.7399
Function value obtained: -0.4273
Current minimum: -0.4285
Iteration No: 29 started. Searching for the next optimal point.
[0.01029606737114396, 7, 17, 0.5178833270961892, 0.3765751900385231, 533, 4, 4]
0.547719298245614




Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.6188
Function value obtained: -0.4316
Current minimum: -0.4316
Iteration No: 30 started. Searching for the next optimal point.
[0.017641152133788654, 13, 18, 0.45356929712943245, 0.33286819320928085, 778, 5, 5]




0.5492982456140351
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 1.1204
Function value obtained: -0.4247
Current minimum: -0.4316
Iteration No: 31 started. Searching for the next optimal point.
[0.0786438498774217, 5, 18, 0.5091353721186425, 0.3719407265190602, 159, 5, 5]
0.5343859649122806




Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.5954
Function value obtained: -0.4206
Current minimum: -0.4316
Iteration No: 32 started. Searching for the next optimal point.
[0.047109107142463906, 6, 19, 0.6117008830176226, 0.27017730114785654, 773, 4, 5]
0.5108771929824563




Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.6867
Function value obtained: -0.3731
Current minimum: -0.4316
Iteration No: 33 started. Searching for the next optimal point.
[0.025503779085629487, 5, 19, 0.42143742309444476, 0.590462690732925, 526, 5, 5]
0.5399122807017543




Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.7117
Function value obtained: -0.4228
Current minimum: -0.4316
Iteration No: 34 started. Searching for the next optimal point.
[0.014709797661405474, 11, 18, 0.5006381259619506, 0.3350134678074511, 232, 4, 5]
0.5628070175438596




Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.6787
Function value obtained: -0.4288
Current minimum: -0.4316
Iteration No: 35 started. Searching for the next optimal point.
[0.017260409514394472, 15, 16, 0.5141379876684112, 0.2610771859916649, 148, 1, 5]




0.5535964912280702
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 1.1841
Function value obtained: -0.4110
Current minimum: -0.4316
Iteration No: 36 started. Searching for the next optimal point.
[0.04689061354539554, 14, 20, 0.44958764316895183, 0.36995342221184535, 612, 4, 5]




0.518859649122807
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 1.7048
Function value obtained: -0.4179
Current minimum: -0.4316
Iteration No: 37 started. Searching for the next optimal point.
[0.007686091527671306, 4, 17, 0.4607533993842288, 0.20231921708771167, 839, 4, 5]
0.5443859649122806




Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.7009
Function value obtained: -0.4187
Current minimum: -0.4316
Iteration No: 38 started. Searching for the next optimal point.
[0.009755989568690922, 10, 20, 0.5494659592684037, 0.36790569530401346, 436, 4, 4]
0.5580701754385965




Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.6855
Function value obtained: -0.4591
Current minimum: -0.4591
Iteration No: 39 started. Searching for the next optimal point.
[0.010996126177207218, 13, 17, 0.5353854209330845, 0.2135022545834191, 435, 5, 4]




0.5601754385964911
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.8832
Function value obtained: -0.4210
Current minimum: -0.4591
Iteration No: 40 started. Searching for the next optimal point.
[0.010952386849461903, 9, 20, 0.6605979632535616, 0.31523630549668197, 346, 4, 4]
0.5596491228070175




Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 0.6393
Function value obtained: -0.4348
Current minimum: -0.4591
Iteration No: 41 started. Searching for the next optimal point.
[0.012036439837569347, 6, 20, 0.1877743593612452, 0.38252854949946674, 300, 4, 4]
0.5264912280701755




Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.6088
Function value obtained: -0.3779
Current minimum: -0.4591
Iteration No: 42 started. Searching for the next optimal point.
[0.0023636666783931155, 12, 20, 0.7316965691289751, 0.27040196313994846, 502, 5, 4]




0.5643859649122807
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 0.8286
Function value obtained: -0.4193
Current minimum: -0.4591
Iteration No: 43 started. Searching for the next optimal point.
[0.021878064938048142, 11, 20, 0.8532177745647387, 0.41208157156770825, 528, 4, 4]




0.525
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.8174
Function value obtained: -0.3793
Current minimum: -0.4591
Iteration No: 44 started. Searching for the next optimal point.
[0.005411107100146343, 14, 18, 0.6485703538039759, 0.3667013371216595, 383, 4, 4]




0.5528070175438596
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 1.2800
Function value obtained: -0.4230
Current minimum: -0.4591
Iteration No: 45 started. Searching for the next optimal point.
[0.01079136410462864, 8, 19, 0.5522853581078869, 0.5674994772391371, 464, 4, 4]
0.5534210526315789




Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.5908
Function value obtained: -0.4351
Current minimum: -0.4591
Iteration No: 46 started. Searching for the next optimal point.
[0.013794932689210995, 8, 19, 0.5415878644310625, 0.5877794797196334, 315, 4, 4]
0.5568421052631579




Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 0.5968
Function value obtained: -0.4291
Current minimum: -0.4591
Iteration No: 47 started. Searching for the next optimal point.
[0.011369608176962588, 7, 20, 0.36235950100439596, 0.3479103628708842, 273, 4, 4]
0.5605263157894737




Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.6575
Function value obtained: -0.4205
Current minimum: -0.4591
Iteration No: 48 started. Searching for the next optimal point.
[0.011399736313634907, 15, 19, 0.5743066524685474, 0.3578383405198918, 930, 4, 4]




0.5320175438596491
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 4.6141
Function value obtained: -0.3883
Current minimum: -0.4591
Iteration No: 49 started. Searching for the next optimal point.
[0.010942300203808828, 9, 14, 0.5414389443862576, 0.7555998291102568, 420, 5, 4]
0.5570175438596491




Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 0.6909
Function value obtained: -0.4103
Current minimum: -0.4591
Iteration No: 50 started. Searching for the next optimal point.
[0.003429106657950565, 10, 20, 0.36099260125130456, 0.20334746014685123, 634, 4, 4]
0.5353508771929825




Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 0.6634
Function value obtained: -0.4001
Current minimum: -0.4591
