In [1]:
import pandas as pd
import numpy as np
import re
import time
import bs4 as bs4
import json
import glob
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
df1 = pd.read_csv("../data/raw_data.csv")
df1 = df1[df1['y'].notnull()]
df1.shape

(500, 18)

In [3]:
df2 = pd.read_csv("../data/active_label_done.csv", index_col=0)
df2 = df2[df2['y'].notnull()]
df2['new_data'] = 1
df2.shape

(100, 20)

In [4]:
from sklearn.metrics import roc_auc_score, average_precision_score
average_precision_score(df2['y'],df2['p']), roc_auc_score(df2['y'],df2['p'])

(0.4045743325634558, 0.6466731423020884)

In [5]:
df = pd.concat([df1, df2.drop("p", axis=1)])

In [6]:
df_clean = pd.DataFrame(index=df.index)
df_clean['new_data'] = df2['new_data']
df_clean['new_data'].fillna(0, inplace=True)
df_clean['title'] = df['watch-title']

# Clean date attribute
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date[0] = clean_date[0].dropna().astype(str)
clean_date[2] = clean_date[2].dropna().astype(str)

month_map = {"jan": "Jan",
             "fev": "Feb",
             "mar": "Mar",
             "abr": "Apr",
             "mai": "May",
             "jun": "Jun",
             "jul": "Jul",
             "ago": "Aug",
             "set": "Sep",
             "out": "Oct",
             "nov": "Nov",
             "dez": "Dec"}

clean_date[1] = clean_date[1].map(month_map)
clean_date = clean_date.dropna().apply(lambda x: " ".join(x), axis=1)

df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

# Clean view number
views = df['watch-view-count'].str.extract(r"(\d+\.?\d*)", expand=False)
df_clean['views'] = views.str.replace(".", "").fillna(0).astype(int)

# Makaing features DataFrame
features = pd.DataFrame(index=df_clean.index)
y = df['y'].copy()

# Extracting time since publication feature
features['time_since_pub'] = (pd.to_datetime("2020-03-24") -  # HARDCODED
                              df_clean['date']) / np.timedelta64(1, 'D')

# Extracting n of view feature
features['views'] = df_clean['views']

# Extracting n of view/day feature
features['views_per_day'] = features['views'] / features['time_since_pub']

# Droping time_since_pub to prevent bias
features = features.drop(['time_since_pub'], axis=1)

In [7]:
# Around 75% train and 25% to validation
split_date = '2020-02-27'
mask_train = (df_clean['date'] < split_date) & (df_clean['date'].notnull())
mask_val = (df_clean['date'] >= split_date) & (df_clean['date'].notnull())

X_train, X_val = features[mask_train.values], features[mask_val.values]
y_train, y_val = y[mask_train.values], y[mask_val.values]
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((450, 2), (147, 2), (450,), (147,))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_titles = df_clean[mask_train]['title']
val_titles = df_clean[mask_val]['title']

title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

In [9]:
from scipy.sparse import hstack

# Concat the BoW into features df
X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

In [10]:
# Random Forest Model
mdl = RandomForestClassifier(n_estimators=1000, random_state=42,
                             class_weight="balanced", n_jobs=8)
mdl.fit(X_train_title, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=8, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [11]:
train_proba = mdl.predict_proba(X_train_title)
train_preds = mdl.predict(X_train_title)

val_proba = mdl.predict_proba(X_val_title)
val_preds = mdl.predict(X_val_title)

In [12]:
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss

In [13]:
# Getting the metrics
print('TRAIN METRICS:')
print('log_loss: ', log_loss(y_train, train_preds))
print('avg_precision_score: ', average_precision_score(y_train, train_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_train, train_proba[:, 1]))

print('\nVALIDATION METRICS:')
print('log_loss: ', log_loss(y_val, val_preds))
print('avg_precision_score: ', average_precision_score(y_val, val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, val_proba[:, 1]))

TRAIN METRICS:
log_loss:  9.992007221626413e-16
avg_precision_score:  1.0
roc_auc:  1.0

VALIDATION METRICS:
log_loss:  19.266528329133855
avg_precision_score:  0.7552259526497553
roc_auc:  0.7122071050642479
