In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from skopt import forest_minimize
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [None]:
# Load the labeled dataset and keep only the rows whose target 'y' is present.
df = pd.read_csv('../data/raw_data_labeled.csv').dropna(subset=['y'])

In [None]:
# Missing-value count per column of the labeled frame.
df.isna().sum()

In [None]:
# Clean date attribute
# Capture day, abbreviated (lower-case) month and year from 'watch-time-text'
# as three string columns; rows that do not match the pattern become NaN.
date_parts = df['watch-time-text'].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")

# Month abbreviations found in the data -> abbreviations understood by "%b".
month_map = {"jan": "Jan",
             "fev": "Feb",
             "mar": "Mar",
             "abr": "Apr",
             "mai": "May",
             "jun": "Jun",
             "jul": "Jul",
             "ago": "Aug",
             "set": "Sep",
             "out": "Oct",
             "nov": "Nov",
             "dez": "Dec"}

# Unknown abbreviations map to NaN, so those rows are dropped together with
# the rows whose text did not match the pattern at all.
date_parts[1] = date_parts[1].map(month_map)
date_parts = date_parts.dropna()

# Vectorised concatenation instead of a row-wise apply: it always yields a
# Series (a row-wise apply on an empty frame returns a DataFrame, which
# to_datetime cannot parse with this format).
clean_date = pd.to_datetime(
    date_parts[0] + " " + date_parts[1] + " " + date_parts[2],
    format="%d %b %Y",
)

# Clean view number
# Dots are stripped below, i.e. they are treated as thousands separators.
# The pattern therefore has to capture *every* dot-separated group: the
# previous `\d+\.?\d*` stopped after the first one ("1.234.567" -> 1234).
views = df['watch-view-count'].str.extract(r"(\d+(?:\.\d+)*)", expand=False)
# regex=False: "." must be a literal dot regardless of the pandas default.
views = views.str.replace(".", "", regex=False).fillna(0).astype(int)

In [None]:
# Target vector, copied so later re-indexing never touches the raw frame.
y = df['y'].copy()

# Feature table starts out empty; columns are added in the following cells.
features = pd.DataFrame()

In [None]:
# Days elapsed between publication and the reference date.
reference_date = pd.to_datetime("2020-03-24")  # HARDCODED
days_since_pub = (reference_date - clean_date) / np.timedelta64(1, 'D')
features['time_since_pub'] = days_since_pub

# Raw view count (aligned to the feature table's index on assignment).
features['views'] = views

# Views per day online.
# NOTE(review): a video published on the reference date has 0 days online,
# which makes this division produce inf/NaN.
features['views_per_day'] = features['views'].div(features['time_since_pub'])

# The age column was only needed for the ratio; drop it to prevent bias.
features = features.drop(columns=['time_since_pub'])

# Keep the target and the raw frame restricted to the rows that have features.
kept_rows = features.index
y = y.loc[kept_rows]
df = df.loc[kept_rows]

In [None]:
# Resolution feature: height * width of the video, NaN when either value
# cannot be converted to a number.
resolutions = []
for height, width in zip(df['og:video:height'], df['og:video:width']):
    try:
        pixels = float(height) * float(width)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare `except` would
        # also swallow KeyboardInterrupt and hide unrelated bugs.
        pixels = np.nan
    resolutions.append(pixels)

# Positional assignment: the list has one entry per row of df, in df's order.
features['resolution'] = resolutions

In [None]:
# Sanity check: column dtypes and non-null counts of the feature table.
features.info()

In [None]:
# Time-based split: rows published before the cut-off go to training, the
# rest to validation (around 75% train and 25% validation).
split_date = '2020-02-27'
has_date = clean_date.notnull()
mask_train = has_date & (clean_date < split_date)
mask_val = has_date & (clean_date >= split_date)

# Plain boolean arrays so the selection is positional rather than index-aligned.
train_rows = mask_train.values
val_rows = mask_val.values

X_train = features[train_rows]
X_val = features[val_rows]
y_train = y[train_rows]
y_val = y[val_rows]
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Filling NaNs
# The fill value is computed once, from the training rows only, and reused
# for validation so no validation statistics leak into the features.
resolution_fill = X_train['resolution'].mean()

# `assign` returns new frames instead of writing into slices of `features`,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
X_train = X_train.assign(resolution=X_train['resolution'].fillna(resolution_fill))
X_val = X_val.assign(resolution=X_val['resolution'].fillna(resolution_fill))

In [None]:
# Extracting features from title
train_titles = df.loc[mask_train, 'watch-title']
val_titles = df.loc[mask_val, 'watch-title']

# Baseline vectorizer settings, spelled out here so the cell runs on a fresh
# kernel: the previous version read `min_df` / `ngram_range`, which are only
# assigned after the hyper-parameter search further down (NameError on
# Restart & Run All).
# NOTE(review): these baseline values were picked during review from inside
# the ranges the later search explores -- adjust if a specific baseline is
# intended.
baseline_min_df = 2
baseline_ngram_range = (1, 2)

# Fit the vocabulary on training titles only; validation is just transformed.
title_vec = TfidfVectorizer(min_df=baseline_min_df, ngram_range=baseline_ngram_range)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

# Concat the BoW into features df
X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

# Random Forest

In [None]:
# Random Forest Model
rf_params = {
    "n_estimators": 200,
    "random_state": 42,
    "class_weight": "balanced",
    "n_jobs": 8,
}
rfc = RandomForestClassifier(**rf_params)
rfc.fit(X_train_title, y_train)

In [None]:
# Score both splits with the fitted forest: class probabilities and hard labels.
rf_train_proba, rf_val_proba = (
    rfc.predict_proba(matrix) for matrix in (X_train_title, X_val_title)
)
rf_train_preds, rf_val_preds = (
    rfc.predict(matrix) for matrix in (X_train_title, X_val_title)
)

In [None]:
# Getting the metrics
# log_loss needs predicted probabilities. The validation figure used to be
# computed from hard labels (rf_val_preds), which only measures clipped 0/1
# guesses and is not comparable with the training figure.
for header, y_true, proba in (('TRAIN METRICS:', y_train, rf_train_proba),
                              ('\nVALIDATION METRICS:', y_val, rf_val_proba)):
    print(header)
    print('log_loss: ', log_loss(y_true, proba))
    # Ranking metrics take the probability of the positive class (column 1).
    print('avg_precision_score: ', average_precision_score(y_true, proba[:, 1]))
    print('roc_auc: ', roc_auc_score(y_true, proba[:, 1]))

# LGBM

In [None]:
# Baseline LightGBM: library defaults plus balanced class weights.
lgbm_baseline_params = {
    "random_state": 42,
    "class_weight": "balanced",
    "n_jobs": 7,
}
lgbm = LGBMClassifier(**lgbm_baseline_params)
lgbm.fit(X_train_title, y_train)

In [None]:
# Score both splits with the baseline LightGBM: probabilities and hard labels.
lgbm_train_proba, lgbm_val_proba = (
    lgbm.predict_proba(matrix) for matrix in (X_train_title, X_val_title)
)
lgbm_train_preds, lgbm_val_preds = (
    lgbm.predict(matrix) for matrix in (X_train_title, X_val_title)
)

In [None]:
# Getting the metrics
# log_loss needs predicted probabilities. Both figures used to be computed
# from hard labels (lgbm_*_preds), which only measures clipped 0/1 guesses
# and is not comparable with the Random Forest numbers.
for header, y_true, proba in (('TRAIN METRICS:', y_train, lgbm_train_proba),
                              ('\nVALIDATION METRICS:', y_val, lgbm_val_proba)):
    print(header)
    print('log_loss: ', log_loss(y_true, proba))
    # Ranking metrics take the probability of the positive class (column 1).
    print('avg_precision_score: ', average_precision_score(y_true, proba[:, 1]))
    print('roc_auc: ', roc_auc_score(y_true, proba[:, 1]))

In [None]:
def tune_lgbm(params):
    """Objective for the joint LightGBM / title TF-IDF hyper-parameter search.

    `params` is read positionally: learning rate, max depth,
    min_child_samples, subsample, colsample_bytree, n_estimators,
    TF-IDF min_df, and the upper n-gram size (lower bound is always 1).

    Reads train_titles, val_titles, X_train, X_val, y_train and y_val from
    the notebook namespace. Prints the parameters and the validation ROC AUC,
    and returns the negated validation average precision.
    """
    print(params)
    (lr, depth, min_child, row_frac, col_frac,
     n_trees, min_df, max_ngram) = (params[i] for i in range(8))

    # Title bag-of-words: vocabulary fitted on the training titles only.
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, max_ngram))
    bow_train = vectorizer.fit_transform(train_titles)
    bow_val = vectorizer.transform(val_titles)

    train_matrix = hstack([X_train, bow_train])
    val_matrix = hstack([X_val, bow_val])

    lgbm_kwargs = dict(
        learning_rate=lr,
        num_leaves=2 ** depth,
        max_depth=depth,
        min_child_samples=min_child,
        subsample=row_frac,
        colsample_bytree=col_frac,
        bagging_freq=1,
        n_estimators=n_trees,
        random_state=0,
        class_weight="balanced",
        n_jobs=8,
    )
    mdl = LGBMClassifier(**lgbm_kwargs)
    mdl.fit(train_matrix, y_train)

    # Probability of the positive class on the validation split.
    val_scores = mdl.predict_proba(val_matrix)[:, 1]
    print(roc_auc_score(y_val, val_scores))

    return -average_precision_score(y_val, val_scores)


# Search space, listed in the positional order tune_lgbm reads its params.
space = [
    (1e-3, 1e-1, 'log-uniform'),  # lr
    (1, 10),                      # max_depth
    (1, 20),                      # min_child_samples
    (0.05, 1.),                   # subsample
    (0.05, 1.),                   # colsample_bytree
    (100, 1000),                  # n_estimators
    (1, 5),                       # min_df
    (1, 5),                       # ngram_range
]

# 20 random starting points, 50 objective evaluations in total.
res = forest_minimize(
    func=tune_lgbm,
    dimensions=space,
    n_calls=50,
    n_random_starts=20,
    random_state=160745,
    verbose=1,
)

In [None]:
# Best point found by the search, in the same order as the search space.
(lr,
 max_depth,
 min_child_samples,
 subsample,
 colsample_bytree,
 n_estimators,
 min_df,
 ngram_range) = res.x

In [None]:
# Rebuild the n-gram range from the search result itself rather than from the
# previous value of `ngram_range`: `(1, ngram_range)` nested the tuple one
# level deeper every time this cell was re-run.
ngram_range = (1, res.x[7])

# Title bag-of-words with the tuned settings; vocabulary fitted on train only.
title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(train_titles)
title_bow_val = title_vec.transform(val_titles)

X_train_title = hstack([X_train, title_bow_train])
X_val_title = hstack([X_val, title_bow_val])

# Final LightGBM, configured exactly like the model scored during the search.
lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth,
                      min_child_samples=min_child_samples, subsample=subsample,
                      colsample_bytree=colsample_bytree, bagging_freq=1,
                      n_estimators=n_estimators, random_state=0,
                      class_weight='balanced', n_jobs=8)
lgbm.fit(X_train_title, y_train)

In [None]:
# Score both splits with the tuned LightGBM: probabilities and hard labels.
lgbm_train_proba, lgbm_val_proba = (
    lgbm.predict_proba(matrix) for matrix in (X_train_title, X_val_title)
)
lgbm_train_preds, lgbm_val_preds = (
    lgbm.predict(matrix) for matrix in (X_train_title, X_val_title)
)

In [None]:
# Getting the metrics
# log_loss needs predicted probabilities. Both figures used to be computed
# from hard labels (lgbm_*_preds), which only measures clipped 0/1 guesses.
for header, y_true, proba in (('TRAIN METRICS:', y_train, lgbm_train_proba),
                              ('\nVALIDATION METRICS:', y_val, lgbm_val_proba)):
    print(header)
    print('log_loss: ', log_loss(y_true, proba))
    # Ranking metrics take the probability of the positive class (column 1).
    print('avg_precision_score: ', average_precision_score(y_true, proba[:, 1]))
    print('roc_auc: ', roc_auc_score(y_true, proba[:, 1]))

In [None]:
# Correlation between the two models' validation scores (positive class).
val_scores = pd.DataFrame({
    "RF": rf_val_proba[:, 1],
    "LGBM": lgbm_val_proba[:, 1],
})
val_scores.corr()

In [None]:
# Sweep the Random Forest share of a linear blend from 0 to 1 (20 points) and
# print the validation average precision and ROC AUC for each share.
rf_weights = np.linspace(0, 1, 20)

rf_scores = rf_val_proba[:, 1]
lgbm_scores = lgbm_val_proba[:, 1]

for rf_weight in rf_weights:
    lgbm_weight = 1 - rf_weight
    p = rf_weight * rf_scores + lgbm_weight * lgbm_scores
    print(rf_weight)
    print(average_precision_score(y_val, p), roc_auc_score(y_val, p))

In [None]:
# Fixed blend of the two models' validation scores (positive class).
rf_share, lgbm_share = 0.26, 0.74
p = rf_share * rf_val_proba[:, 1] + lgbm_share * lgbm_val_proba[:, 1]
print(average_precision_score(y_val, p), roc_auc_score(y_val, p))

In [None]:
import joblib as jb

# Persist the two models and the title vectorizer.
artifacts = (
    (lgbm, "../pkls/lgbm_20200324.pkl.z"),
    (rfc, "../pkls/rf_20200324.pkl.z"),
    (title_vec, "../pkls/titlebow_20200324.pkl.z"),
)
for artifact, target_path in artifacts:
    jb.dump(artifact, target_path)