In [1]:
import pandas as pd
import numpy as np

pd.set_option('max.columns', 131)

In [2]:
df = pd.read_csv('full_labels.csv').dropna(subset=['y'])

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
df_limpo = pd.DataFrame(index=df.index)
df_limpo['title'] = df['title']

In [5]:
df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y-%m-%d')
df['view_count'] = df['view_count'].astype(int)
df_limpo = df.copy()
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 296
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        297 non-null    object        
 1   upload_date  297 non-null    datetime64[ns]
 2   view_count   297 non-null    int64         
 3   y            297 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 11.6+ KB


In [6]:
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

In [7]:
features['tempo_desde_pub'] = (pd.to_datetime('2021-01-31') - df_limpo['upload_date']) / np.timedelta64(1, 'D')
features['views'] = df_limpo['view_count']
features['views_por_dia'] = features['views'] / features['tempo_desde_pub']
features = features.drop(['tempo_desde_pub'], axis=1)

In [8]:
Xtrain, Xval = features.iloc[:140], features.iloc[140:]
ytrain, yval = y.iloc[:140], y.iloc[140:]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((140, 2), (157, 2), (140,), (157,))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df_limpo.iloc[:140]['title']
title_val = df_limpo.iloc[140:]['title']

title_vec = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [10]:
from scipy.sparse import hstack

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [11]:
Xtrain_wtitle.shape

(140, 2406)

# RF

In [12]:
mdl_rf = RandomForestClassifier(n_estimators=1000, random_state=0, min_samples_leaf=1, class_weight='balanced', n_jobs=3)
mdl_rf.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=3,
                       random_state=0)

In [13]:
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]

In [14]:
def print_model_report(y, p):
    from sklearn.metrics import average_precision_score, roc_auc_score
    
    print(f'ap={average_precision_score(y, p)}, auc={roc_auc_score(y, p)}')

In [15]:
print_model_report(yval, p_rf)

ap=0.4542962326968974, auc=0.5905475397706252


# LGBM

In [16]:
params = [0.009755989568690922, 10, 20, 0.5494659592684037, 0.36790569530401346, 436, 1, 3]

lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]
min_df = params[6]
ngram_range = (1, params[7])

In [17]:
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [18]:
Xtrain_wtitle.shape

(140, 2406)

In [19]:
from lightgbm import LGBMClassifier

In [20]:
mdl_lgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight='balanced',
    n_jobs=3
)

mdl_lgbm.fit(Xtrain_wtitle, ytrain)



LGBMClassifier(bagging_freq=1, class_weight='balanced',
               colsample_bytree=0.36790569530401346,
               learning_rate=0.009755989568690922, max_depth=10,
               n_estimators=436, n_jobs=3, num_leaves=1024, random_state=0,
               subsample=0.5494659592684037)

In [21]:
p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]



In [22]:
print_model_report(yval, p_lgbm)

ap=0.31864909369242067, auc=0.49889012208657046


# LR

In [23]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

In [24]:
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

lr_pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression(C=.5, penalty='l2', n_jobs=3, random_state=0))
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.5, n_jobs=3, random_state=0))])

In [25]:
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]

In [26]:
print_model_report(yval, p_lr)

ap=0.4183380097044219, auc=0.6034036256011839


# Ensemble

In [27]:
p = (p_lr * p_rf * p_lgbm) / 3
print_model_report(yval, p)

ap=0.43089223321878534, auc=0.590455049944506


In [28]:
pd.DataFrame({'LR': p_lr, 'RF': p_rf, 'LGBM': p_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.800318,0.634475
RF,0.800318,1.0,0.848687
LGBM,0.634475,0.848687,1.0


In [29]:
p = 0.7*p_rf + 0.3*p_lgbm
print_model_report(yval, p)

ap=0.43594092258202866, auc=0.5711246762856086


# Salvando o modelo

In [30]:
import joblib as jb

In [31]:
jb.dump(mdl_lgbm, "clean_deploy/lgbm_20210301.pkl.z")
jb.dump(mdl_rf, "clean_deploy/random_forest_20210301.pkl.z")
jb.dump(lr_pipeline, "clean_deploy/logistic_reg_20210301.pkl.z")
jb.dump(title_vec, "clean_deploy/title_vectorizer_20210301.pkl.z")

['clean_deploy/title_vectorizer_20210301.pkl.z']

In [32]:
num_features = csr_matrix(np.array([Xtrain['views'], Xtrain['views_por_dia']]))
hstack([num_features, title_bow_train])
#hstack([Xtrain, title_bow_train])

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 140, expected 2.