In [1]:
import pandas as pd
import numpy as np

# Show up to 131 columns when displaying wide frames.
# Use the fully-qualified option key: the abbreviated pattern 'max.columns' is
# resolved by regex search and becomes ambiguous (raising OptionError) on pandas
# versions that also register 'styler.render.max_columns'.
pd.set_option('display.max_columns', 131)

In [2]:
# Read the labelled videos, then keep only the rows that have a value in `y`.
rotulos_brutos = pd.read_csv('full_labels.csv')
df = rotulos_brutos.dropna(subset=['y'])

In [3]:
# Fraction of rows that are exact repeats of an earlier row (0.0 = none).
linhas_duplicadas = df.duplicated()
linhas_duplicadas.mean()

0.0

In [4]:
# (rows, columns) of the labelled frame after rows without a `y` value were dropped.
df.shape

(297, 4)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Parse the upload dates, cast view counts to integers and order the rows
# chronologically. `df` is rebound (not just `df_limpo`) because later cells
# read the sorted `df` as well.
df = (
    df.assign(
        upload_date=pd.to_datetime(df['upload_date'], format='%Y-%m-%d'),
        view_count=df['view_count'].astype(int),
    )
    .sort_values('upload_date')
)

# Independent working copy used for feature engineering.
df_limpo = df.copy()
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 151 to 34
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        297 non-null    object        
 1   upload_date  297 non-null    datetime64[ns]
 2   view_count   297 non-null    int64         
 3   y            297 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 11.6+ KB


In [7]:
# Empty feature frame that shares the (date-sorted) index of the cleaned data.
features = pd.DataFrame(index=df_limpo.index)

# Take the labels from the same cleaned frame the features are built from, so
# the positional train/validation split below cannot drift out of alignment if
# `df` is modified or re-created independently of `df_limpo`.
y = df_limpo['y'].copy()

In [8]:
# Snapshot date against which each video's age is measured. Hoisted out of the
# expression so the assumption is visible and easy to change.
DATA_REFERENCIA = pd.Timestamp('2021-01-31')

# Age of every video, in days, at the snapshot date.
dias_desde_pub = (DATA_REFERENCIA - df_limpo['upload_date']) / np.timedelta64(1, 'D')

# A video uploaded on or after the snapshot date has age <= 0, which would turn
# views-per-day into inf / NaN / a negative rate (and inf makes the model fit
# fail). Treat every video as at least one day old.
# NOTE(review): assumes upload dates carry no time-of-day component, so only
# non-positive ages are affected by the clip -- confirm against the data source.
dias_desde_pub = dias_desde_pub.clip(lower=1)

features['views'] = df_limpo['view_count']
# Popularity normalised by age; the age itself is deliberately not kept as a feature.
features['views_por_dia'] = features['views'] / dias_desde_pub

In [9]:
# Time-ordered holdout: rows are already sorted by upload date, so the first
# 140 rows (oldest videos) train the model and the remainder validate it.
corte = 140
Xtrain, ytrain = features.iloc[:corte], y.iloc[:corte]
Xval, yval = features.iloc[corte:], y.iloc[corte:]

# Display the shapes of the four pieces as a sanity check.
tuple(parte.shape for parte in (Xtrain, Xval, ytrain, yval))

((140, 2), (157, 2), (140,), (157,))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the titles with the same positional cut (row 140) used for the numeric features.
titulos = df_limpo['title']
title_train, title_val = titulos.iloc[:140], titulos.iloc[140:]

# TF-IDF over 1- to 3-grams. The vocabulary is learned from the training titles
# only; validation titles are merely projected onto it.
title_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=1)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [11]:
# (training titles, n-gram vocabulary size) of the fitted TF-IDF matrix.
title_bow_train.shape

(140, 2183)

In [12]:
from scipy.sparse import hstack

# Append the TF-IDF title columns to the right of the numeric feature columns,
# for the training and validation sets alike.
Xtrain_wtitle, Xval_wtitle = (
    hstack(blocos)
    for blocos in ([Xtrain, title_bow_train], [Xval, title_bow_val])
)

In [13]:
# Both stacked matrices must end up with the same number of columns.
tuple(matriz.shape for matriz in (Xtrain_wtitle, Xval_wtitle))

((140, 2185), (157, 2185))

# Random Forest

In [14]:
# Hyper-parameters gathered in one place; the fixed seed keeps the forest reproducible.
rf_params = dict(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=0,
    n_jobs=3,
)
mdl = RandomForestClassifier(**rf_params)

# Fit on numeric + title features; the fitted estimator is the cell's displayed value.
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=3,
                       random_state=0)

In [15]:
# Per-class probabilities for the validation rows; keep the second column (index 1).
# NOTE(review): presumably that column is the positive label -- confirm `y` is coded 0/1.
probas_val = mdl.predict_proba(Xval_wtitle)
p = probas_val[:, 1]

In [16]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [17]:
# Ranking quality on the validation set: average precision and ROC AUC.
avg_precision = average_precision_score(yval, p)
auc = roc_auc_score(yval, p)
print(f'avg={avg_precision}, auc={auc}')

avg=0.4452215654214563, auc=0.5782456140350878
