In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
import scikitplot as skplt
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

%matplotlib inline

In [2]:
df = pd.read_csv('../data/raw_data.csv')
df = df[df['y'].notnull()]

df_clean = pd.DataFrame(index=df.index)

# Clean date attribute
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")
clean_date[0] = clean_date[0].dropna().astype(str)
clean_date[2] = clean_date[2].dropna().astype(str)

month_map = {"jan": "Jan",
             "fev": "Feb",
             "mar": "Mar",
             "abr": "Apr",
             "mai": "May",
             "jun": "Jun",
             "jul": "Jul",
             "ago": "Aug",
             "set": "Sep",
             "out": "Oct",
             "nov": "Nov",
             "dez": "Dec"}

clean_date[1] = clean_date[1].map(month_map)
clean_date = clean_date.dropna().apply(lambda x: " ".join(x), axis=1)

df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

# Clean view number
views = df['watch-view-count'].str.extract(r"(\d+\.?\d*)", expand=False)
df_clean['views'] = views.str.replace(".", "").fillna(0).astype(int)

# Makaing features DataFrame
features = pd.DataFrame(index=df_clean.index)
y = df['y'].copy()

# Extracting time since publication feature
features['time_since_pub'] = (pd.to_datetime("2020-03-15") -  # HARDCODED
                              df_clean['date']) / np.timedelta64(1, 'D')

# Extracting n of view feature
features['views'] = df_clean['views']

# Extracting n of view/day feature
features['views_per_day'] = features['views'] / features['time_since_pub']

# Droping time_since_pub to prevent bias
features = features.drop(['time_since_pub'], axis=1)
features['date'] = df_clean['date']
features['index'] = features.index
features = features.set_index('date').sort_index().dropna()

y = pd.DataFrame(y)
y['date'] = df_clean['date']
y['index'] = y.index
y = y.set_index('date').sort_index()
y = y[y.index.notna()]

# Splitting the data set - 60% train 40% validation
n = len(features)
n_train = np.ceil(n * 0.6) - 1
n_val = n - n_train

X_train, X_val = (features.reset_index().loc[:n_train],
                  features.reset_index().loc[n_train+1:])
y_train, y_val = y.reset_index().loc[:n_train], y.reset_index().loc[n_train+1:]


In [3]:
X_train = X_train.set_index('index').drop('date', axis=1)
y_train = y_train.set_index('index').drop('date', axis=1)
X_val = X_val.set_index('index').drop('date', axis=1)
y_val = y_val.set_index('index').drop('date', axis=1)

In [4]:
df_clean['title'] = df['watch-title']

In [5]:
train_titles = df_clean.loc[X_train.index.tolist()]['title']
val_titles = df_clean.loc[X_val.index.tolist()]['title']

In [6]:
title_vec = TfidfVectorizer(min_df=2)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

In [7]:
X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

In [8]:
mdl = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight="balanced", n_jobs=6)
mdl.fit(X_train_title, y_train['y'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [9]:
# Predicting on the validation set
val_proba = mdl.predict_proba(X_val_title)
preds = mdl.predict(X_val_title)

# Getting the metrics
print('log_loss: ', log_loss(y_val, preds))
print('avg_precision_score: ', average_precision_score(y_val, val_proba[:, 1]))
print('roc_auc: ', roc_auc_score(y_val, val_proba[:, 1]))

log_loss:  11.33728379028392
avg_precision_score:  0.7095503582198881
roc_auc:  0.7035914179104477


# ACTIVE LEARNING