In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%pylab inline

In [None]:
# Read the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw_data.csv')

In [None]:
# Rows without a value in 'y' (the column used as the target further down)
# are discarded; the resulting shape is shown as the cell output.
df = df.dropna(subset=['y'])
df.shape

In [None]:
# Column-less frame sharing df's row index; cleaned columns are attached to it
# in the following cells.
df_clean = pd.DataFrame(data=None, index=df.index)

In [None]:
# Split strings shaped like "<day> de <month abbr>. de <year>" into three
# columns (0 = day, 1 = month abbreviation, 2 = year). Rows that do not match
# the pattern come back as NaN in every column.
clean_date = df['watch-time-text'].str.extract(
    pat=r"(\d+) de ([a-z]+)\. de (\d+)",
    expand=True,
)

In [None]:
# Cast the day (column 0) and year (column 2) parts to str. Only non-null
# values are cast; assigning back aligns on the row index, so rows that were
# missing stay missing.
for part in (0, 2):
    clean_date[part] = clean_date[part].dropna().astype(str)

In [None]:
# Lookup from the lowercase month abbreviations found in the scraped dates to
# the English three-letter abbreviations, in calendar order.
month_map = dict(zip(
    ("jan", "fev", "mar", "abr", "mai", "jun",
     "jul", "ago", "set", "out", "nov", "dez"),
    ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
))

# Translate the extracted month abbreviation (column 1) through month_map.
# Values that are not keys of the mapping become NaN, so those rows are removed
# by the dropna() in the next cell.
clean_date[1] = clean_date[1].map(month_map)

In [None]:
# Rows missing any of the three date parts are dropped; the remaining parts are
# joined into a single space-separated string per row ("<day> <Mon> <year>").
clean_date = (
    clean_date
    .dropna()
    .apply(" ".join, axis=1)
)

In [None]:
# Parse the "<day> <Mon> <year>" strings into datetimes. Assignment aligns on
# the row index, so rows absent from clean_date end up without a date.
df_clean['date'] = pd.to_datetime(arg=clean_date, format="%d %b %Y")

In [None]:
# Show the cleaned frame built so far (at this point it only holds 'date') as a
# visual sanity check of the date parsing.
df_clean

In [None]:
# Parse the view counter into an integer column.
#
# The counter text uses "." as a thousands separator (the code strips it before
# casting). The previous pattern r"(\d+\.?\d*)" accepted at most ONE separator,
# so "1.234.567" was captured as "1.234" and stored as 1234. The pattern below
# captures the whole dotted digit run instead.
#
# regex=False makes "." a literal dot no matter what the installed pandas
# version uses as the default for `regex`.
#
# Rows without any digits yield NaN from extract() and are stored as 0 views.
views = (
    df['watch-view-count']
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)
df_clean['views'] = views

In [None]:
# Making the features DataFrame
features = pd.DataFrame(index=df_clean.index)
y = df['y'].copy()

# Reference date used to compute each video's age in days.
REFERENCE_DATE = pd.to_datetime("2020-03-24")  # HARDCODED

features['time_since_pub'] = ((REFERENCE_DATE - df_clean['date'])
                              / np.timedelta64(1, 'D'))
features['views'] = df_clean['views']
# A video dated on the reference date has an age of 0 days, which used to give
# views / 0 = inf (rejected by scikit-learn) or 0 / 0 = NaN. Ages below one day
# are counted as one day so the rate stays finite. Rows without a date keep a
# NaN age and therefore a NaN rate.
features['views_per_day'] = (features['views']
                             / features['time_since_pub'].clip(lower=1))
# The age was only needed to derive the rate; it is not used as a feature.
features = features.drop(['time_since_pub'], axis=1)

# Number of videos per publication date, saved to disk and shown inline.
df_clean['date'].value_counts().plot(figsize=(20, 10))
plt.title('Videos Dates', fontsize=20)
plt.xlabel('Dates', fontsize=15)
plt.ylabel('Video Count', fontsize=15)
plt.savefig('../figures/video_dates.png')
plt.show()

In [None]:
# Attach the publication date and the original row label, then re-index by
# date in chronological order. dropna() removes every row that has a NaN in any
# remaining column.
features = (
    features
    .assign(date=df_clean['date'], index=features.index)
    .set_index('date')
    .sort_index()
    .dropna()
)

In [None]:
# Build the label frame with the same helper columns as `features`: the
# publication date and the original row label.
y = pd.DataFrame(y)
y['date'] = df_clean['date']
y['index'] = y.index
# Select labels by the original row labels that survived in `features`, in
# exactly that order, then index by date like `features`.
# Sorting and filtering `y` on its own only lines up with `features` when both
# frames happen to drop the same rows, but features.dropna() removes a row for
# ANY NaN feature value, not just for a missing date. The later positional
# train/validation split needs the two frames to match row for row.
# Assumes the original row labels are unique.
y = y.loc[features['index'].to_numpy()].set_index('date')

In [None]:
# Sizes for the 60/40 split of the date-sorted rows.
# n_train is the position of the LAST training row: the split cells slice with
# .loc[:n_train], which is inclusive, so the training set has n_train + 1 rows.
# It is cast to int because np.ceil returns a float and the value is used as an
# index label.
n = len(features)
n_train = int(np.ceil(n * 0.6)) - 1
# Rows left for validation (previously n - n_train, which overcounted by one).
n_val = n - (n_train + 1)

In [None]:
# Positional split of the date-sorted features: rows 0..n_train (inclusive) go
# to training, everything after that to validation.
X_train = features.reset_index().loc[:n_train]
X_val = features.reset_index().loc[n_train+1:]

In [None]:
# Same positional split for the labels: rows 0..n_train (inclusive) for
# training, the remainder for validation.
y_train = y.reset_index().loc[:n_train]
y_val = y.reset_index().loc[n_train+1:]

In [None]:
# Reduce the label frames to the target column only.
y_train = y_train['y']
y_val = y_val['y']
# 'date' and 'index' were only carried along for ordering; they are not model
# inputs.
X_train = X_train.drop(columns=['date', 'index'])
X_val = X_val.drop(columns=['date', 'index'])

In [None]:
# Shallow decision tree (depth 3) with class weights balanced and a fixed
# seed, fitted on the training portion.
mdl = DecisionTreeClassifier(
    max_depth=3,
    class_weight="balanced",
    random_state=0,
).fit(X_train, y_train)

In [None]:
# Validation-set outputs: hard class predictions and per-class probabilities
# (column 1 of val_proba is what the metric cells below consume).
preds = mdl.predict(X_val)
val_proba = mdl.predict_proba(X_val)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
import scikitplot as skplt

In [None]:
# Log loss is defined on predicted probabilities. Passing the hard 0/1 class
# predictions (`preds`) scored every mistake as a near-certain wrong
# probability and ignored the model's actual confidence. The second
# probability column is used, as in the other metric cells.
log_loss(y_val, val_proba[:, 1])

In [None]:
# Average precision on the validation set, scored with the second probability
# column.
average_precision_score(y_true=y_val, y_score=val_proba[:, 1])

In [None]:
# Area under the ROC curve on the validation set, scored with the second
# probability column.
roc_auc_score(y_true=y_val, y_score=val_proba[:, 1])

In [None]:
# Draw the validation ROC curves with scikit-plot. The full probability matrix
# is passed here, not just one column as in the metric cells.
skplt.metrics.plot_roc(y_val, val_proba, figsize=(8,7))

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Render the fitted tree on a 10x10 inch canvas.
# plt (imported at the top) is used instead of the `pylab` name, which only
# exists because of the `%pylab inline` magic.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index here. The trailing ";" hides the list of annotation objects that
# plot_tree returns, so only the figure is shown.
plot_tree(mdl, ax=ax, feature_names=list(X_train.columns));