In [1]:
import pandas as pd
import numpy as np
import re
import time

# Show up to 131 columns when a DataFrame is displayed.
# The option is addressed by its full key: pandas treats the first argument as a
# regex when it is not an exact key, and the short pattern 'max.columns' also
# matches 'styler.render.max_columns' on newer pandas, which makes set_option
# raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 131)

In [2]:
# Read the raw export, then keep only the rows whose label column `y` is filled in.
df = pd.read_csv('raw_data_with_labels.csv')
df = df.loc[df['y'].notna()]
# Displayed as (n_rows, n_columns) of the labelled subset.
df.shape

(91, 4)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Start the cleaned frame with just the title column, on the same index as `df`.
# NOTE(review): `df_limpo` is rebound to a full copy of `df` a few lines further
# down, so this one-column frame is short-lived.
df_limpo = df.loc[:, ['title']].copy()

In [5]:
# Give the raw columns proper dtypes: `upload_date` is parsed from
# 'YYYY-MM-DD' text into datetimes and `view_count` is cast to integers.
df = df.assign(
    upload_date=pd.to_datetime(df['upload_date'], format='%Y-%m-%d'),
    view_count=df['view_count'].astype(int),
)

# Work on an independent copy from here on, and print its column/dtype summary.
df_limpo = df.copy()
df_limpo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 231
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        91 non-null     object        
 1   upload_date  91 non-null     datetime64[ns]
 2   view_count   91 non-null     int64         
 3   y            91 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 3.6+ KB


In [6]:
# Target vector: a detached copy of the label column.
y = df['y'].copy()
# Empty feature table that shares the cleaned frame's index, so columns added
# later stay row-aligned with `y`.
features = pd.DataFrame(index=df_limpo.index)

In [7]:
# Age of each video in days, measured against the fixed reference date
# 2021-01-31 (kept hard-coded so results stay reproducible).
dias_desde_pub = (pd.to_datetime('2021-01-31') - df_limpo['upload_date']) / np.timedelta64(1, 'D')

# A video uploaded on (or after) the reference date has an age of 0 (or a
# negative age), which would turn views/age into inf, NaN or a negative rate;
# non-finite values make the downstream model fit fail. Floor the age at one
# day. Assuming upload dates carry no time-of-day component (they are parsed
# with a date-only format), ages are whole days and only those degenerate rows
# are affected.
dias_desde_pub = dias_desde_pub.clip(lower=1)

features['views'] = df_limpo['view_count']
features['views_por_dia'] = features['views'] / dias_desde_pub
# The age itself is intentionally not kept as a feature: `features` ends up with
# exactly the columns ['views', 'views_por_dia'].

In [8]:
# Positional hold-out split: the first 45 rows are used for training and every
# remaining row for validation. Features and labels are cut at the same position.
Xtrain = features.iloc[:45]
Xval = features.iloc[45:]
ytrain = y.iloc[:45]
yval = y.iloc[45:]

# Displayed as a sanity check that X and y agree on the number of rows per split.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((45, 2), (46, 2), (45,), (46,))

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles for the same positional split used for the numeric features
# (first 45 rows train, the rest validation).
title_train = df_limpo['title'].iloc[:45]
title_val = df_limpo['title'].iloc[45:]

# TF-IDF bag of words over titles; min_df=2 drops terms seen in fewer than two
# training titles. The vocabulary is learned on the training titles only and
# then reused, unchanged, to encode the validation titles.
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [26]:
# Shape of the training title matrix: (training titles, vocabulary terms kept).
title_bow_train.shape

(45, 44)

In [27]:
from scipy.sparse import hstack

# Put the numeric feature columns and the title TF-IDF columns side by side.
# The same column order (numeric first, then title terms) is used for both
# splits so the model sees a consistent layout.
Xtrain_wtitle = hstack((Xtrain, title_bow_train))
Xval_wtitle = hstack((Xval, title_bow_val))

In [28]:
# Random forest on numeric + title features.
# random_state is fixed for repeatable results; class_weight='balanced' is
# passed so sklearn reweights the classes; n_jobs=3 fits trees on three workers.
mdl = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=3,
)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=3,
                       random_state=0)

In [29]:
# Validation scores: take the second column of the predicted class probabilities.
# NOTE(review): presumably that column is the positive label (y == 1) — confirm
# against mdl.classes_.
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [30]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [31]:
# Average precision of the validation scores against the validation labels.
average_precision_score(y_true=yval, y_score=p)

0.41925805072356803

In [32]:
# ROC AUC of the validation scores against the validation labels.
roc_auc_score(y_true=yval, y_score=p)

0.45039682539682546