In [1]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from numpy import linspace, loadtxt, ones, convolve
from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd
import collections
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
# style.use('fivethirtyeight')
%matplotlib inline

def evaluate(actual, predictions, output=True):
    """Score predictions against actual values with MSE and RMSE.

    Parameters
    ----------
    actual, predictions
        Passed straight to ``metrics.mean_squared_error``.
    output : bool, default True
        When true, the two scores are printed and nothing is returned.
        When false, nothing is printed and the scores are returned.

    Returns
    -------
    tuple or None
        ``(mse, rmse)`` when ``output`` is false, otherwise ``None``.
    """
    squared_error = metrics.mean_squared_error(actual, predictions)
    root_error = math.sqrt(squared_error)

    # Silent mode: hand the numbers back to the caller instead of printing.
    if not output:
        return squared_error, root_error

    print('MSE:  {}'.format(squared_error))
    print('RMSE: {}'.format(root_error))

def plot_and_eval(predictions, actual, metric_fmt='{:.2f}', linewidth=4):
    """Plot train/test plus one or more predictions, annotated with MSE/RMSE.

    NOTE: the train and test curves come from the module-level names
    ``train`` and ``test``; they must exist before this function is called.

    Parameters
    ----------
    predictions
        A single prediction or a list/tuple of predictions. Each one must
        expose a ``.name`` attribute (used as its label) and be plottable.
    actual
        Ground truth handed to ``evaluate`` for scoring each prediction.
    metric_fmt : str, default '{:.2f}'
        ``str.format`` template applied to each metric value on its own.
    linewidth : int, default 4
        Line width used for the prediction curves.

    With exactly one prediction the scores go into the figure title and the
    legend shows only the name; with several, each legend entry carries its
    own scores.
    """
    if not isinstance(predictions, (list, tuple)):
        predictions = [predictions]
    single = len(predictions) == 1

    plt.figure(figsize=(16, 8))
    plt.plot(train, label='Train')
    plt.plot(test, label='Test')

    for yhat in predictions:
        mse, rmse = evaluate(actual, yhat, output=False)
        name = f'{yhat.name}'
        # Format each metric separately and only then join with the name.
        # The previous approach ran the name itself through str.format, so a
        # name containing '{' or '}' raised or produced a wrong label.
        scored = '{} -- MSE: {} RMSE: {}'.format(
            name, metric_fmt.format(mse), metric_fmt.format(rmse))
        if single:
            plt.plot(yhat, label=name, linewidth=linewidth)
            plt.title(scored)
        else:
            plt.plot(yhat, label=scored, linewidth=linewidth)

    plt.legend(loc='best')
    plt.show()


In [35]:
# Location of the raw access log that the next cell loads with pd.read_csv.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists; consider a configurable data directory.
data = '/Users/orion/Downloads/anonymized-curriculum-access.txt'

In [40]:
# Labels for the five fields kept from each log line.
colnames = ['date', 'page_viewed', 'user_id', 'cohort_id', 'ip']

# The separator splits on whitespace that lies outside double-quoted strings
# and outside square brackets. Field 1 is not listed in `usecols`, so it is
# not loaded. The literal "-" (quotes included) is read as missing.
df = pd.read_csv(
    data,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    header=None,
    names=colnames,
    usecols=[0, 2, 3, 4, 5],
    index_col=False,
    na_values='"-"',
)

# Discard the first parsed row.
# NOTE(review): the reason is not evident from the notebook — confirm it is intended.
df = df.drop(index=df.index[0])
df.head()

Unnamed: 0,date,page_viewed,user_id,cohort_id,ip
1,2018-01-26,/,1,8,97.105.19.61
2,2018-01-26,java-ii,1,8,97.105.19.61
3,2018-01-26,java-ii/object-oriented-programming,1,8,97.105.19.61
4,2018-01-26,slides/object_oriented_programming,1,8,97.105.19.61
5,2018-01-26,javascript-i/conditionals,2,22,97.105.19.61


In [41]:
# Remove rows with any missing field, then store cohort ids as integers.
df = df.dropna()
df['cohort_id'] = df['cohort_id'].astype('int')
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219070 entries, 1 to 233528
Data columns (total 5 columns):
date           219070 non-null object
page_viewed    219070 non-null object
user_id        219070 non-null object
cohort_id      219070 non-null int64
ip             219070 non-null object
dtypes: int64(1), object(4)
memory usage: 10.0+ MB
None


Unnamed: 0,date,page_viewed,user_id,cohort_id,ip
1,2018-01-26,/,1,8,97.105.19.61
2,2018-01-26,java-ii,1,8,97.105.19.61
3,2018-01-26,java-ii/object-oriented-programming,1,8,97.105.19.61
4,2018-01-26,slides/object_oriented_programming,1,8,97.105.19.61
5,2018-01-26,javascript-i/conditionals,2,22,97.105.19.61


In [43]:
# Column labels for the cohort lookup table.
colnames = ['cohort_id', 'cohort_name', 'start_date', 'end_date']
# NOTE(review): the table is read from the system clipboard, so this cell is
# not reproducible on a fresh run unless the same text has been copied first.
# The CSV written below looks like the durable copy — consider loading it.
df_cohort = pd.read_clipboard(names=colnames, skiprows=1, sep=',')
# DataFrame.info() prints directly and returns None, so it is not wrapped in
# print() (which only added a stray "None" line to the output).
df_cohort.info()
df_cohort.to_csv('/Users/orion/Downloads/cohorts.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 4 columns):
cohort_id      31 non-null int64
cohort_name    31 non-null object
start_date     31 non-null object
end_date       31 non-null object
dtypes: int64(1), object(3)
memory usage: 1.0+ KB
None


In [44]:
# Preview the first rows of the cohort lookup table.
df_cohort.head()

Unnamed: 0,cohort_id,cohort_name,start_date,end_date
0,1,Arches,2014-02-04,2014-04-22
1,2,Badlands,2014-06-04,2014-08-22
2,3,Carlsbad,2014-09-04,2014-11-05
3,4,Denali,2014-10-20,2015-01-18
4,5,Everglades,2014-11-18,2015-02-24


In [45]:
# Left join on cohort_id: every access row is kept and gains the cohort's
# name and start/end dates where the lookup table has a matching id.
df = pd.merge(df, df_cohort, how='left', on='cohort_id')
df.head()

Unnamed: 0,date,page_viewed,user_id,cohort_id,ip,cohort_name,start_date,end_date
0,2018-01-26,/,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
1,2018-01-26,java-ii,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2,2018-01-26,java-ii/object-oriented-programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
3,2018-01-26,slides/object_oriented_programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
4,2018-01-26,javascript-i/conditionals,2,22,97.105.19.61,Teddy,2018-01-08,2018-05-17


* Add a datetime column.
* Set the index to the newly created datetime column.
* Convert the datetime column to an actual datetime dtype.
* Drop the original date and time columns.

In [46]:
# Parse the date strings into timestamps, then discard every row that still
# has a missing value in any column.
df = df.assign(date=pd.to_datetime(df['date'])).dropna()

In [48]:
# Count distinct users per (date, cohort) and flatten the result back into
# ordinary columns. `index=str` turns the row labels into strings.
df_agg = (
    df.groupby(['date', 'cohort_id', 'cohort_name'])['user_id']
    .nunique()
    .reset_index()
    .rename(index=str, columns={'user_id': 'users_viewed'})
)

In [49]:
# Preview the per-date, per-cohort distinct-user counts.
df_agg.head()

Unnamed: 0,date,cohort_id,cohort_name,users_viewed
0,2018-01-26,1,Arches,1
1,2018-01-26,8,Hampton,1
2,2018-01-26,13,Kings,1
3,2018-01-26,16,Niagara,2
4,2018-01-26,18,Pinnacles,1


In [50]:
# Distinct cohort names, in order of first appearance in df_agg.
cohorts = df_agg['cohort_name'].unique().tolist()
cohorts

['Arches',
 'Hampton',
 'Kings',
 'Niagara',
 'Pinnacles',
 'Quincy',
 'Sequoia',
 'Teddy',
 'Lassen',
 'Mammoth',
 'Glacier',
 'Denali',
 'Joshua',
 'Olympic',
 'Ulysses',
 'Badlands',
 'Apollo',
 'Ike',
 'Voyageurs',
 'Wrangell',
 'Xanadu',
 'Franklin',
 'Yosemite',
 'Staff',
 'Zion',
 'Andromeda']

In [51]:
# Summarize the column dtypes and non-null counts of the aggregated frame.
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3366 entries, 0 to 3365
Data columns (total 4 columns):
date            3366 non-null datetime64[ns]
cohort_id       3366 non-null int64
cohort_name     3366 non-null object
users_viewed    3366 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 131.5+ KB


In [54]:
# Re-index the aggregate by date, rename the count column to 'ema' (it is
# smoothed in place later) and discard the numeric cohort id.
# `index=str` maps every index label through str, so the date index holds
# strings rather than timestamps from this point on.
df = (
    df_agg.set_index('date')
    .rename(index=str, columns={'users_viewed': 'ema'})
    .drop(columns='cohort_id')
)

In [60]:
# Default EMA span and rolling-std window, in observations.
span = 7


def bollinger_bands(df, window=None, num_std=3):
    """Smooth the ``ema`` column and attach Bollinger-style bands.

    Only the ``ema`` column is smoothed; every other column (for example
    ``cohort_name``) is carried through unchanged. The previous version ran
    ``ewm().mean()`` over the whole frame, which depends on how the installed
    pandas treats non-numeric columns (newer releases drop them or raise).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ema`` column holding the raw values to smooth.
        The frame itself is not modified.
    window : int, optional
        EMA span and rolling-std window. Defaults to the module-level
        ``span``, looked up at call time.
    num_std : int or float, default 3
        Number of standard deviations between the EMA and each band.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``ema`` replaced by its exponential moving
        average, plus ``stdev``, ``ub`` and ``lb`` columns, with the index
        moved into a column by ``reset_index()``. The first ``window - 1``
        rows have NaN in ``stdev``/``ub``/``lb`` because the rolling window
        is not yet full.
    """
    if window is None:
        window = span

    bands = df.copy()
    smoothed = df['ema'].ewm(span=window, adjust=False).mean()
    bands['ema'] = smoothed
    bands['stdev'] = smoothed.rolling(window).std()
    bands['ub'] = smoothed + bands['stdev'] * num_std
    bands['lb'] = smoothed - bands['stdev'] * num_std
    return bands.reset_index()

In [61]:
# Compute the bands separately for each cohort so the smoothing and rolling
# windows never mix rows from different cohorts, then stack the results.
bands = []
for cohort in cohorts:
    is_cohort = df['cohort_name'].eq(cohort)
    bb = bollinger_bands(df.loc[is_cohort])
    bands += [bb]

df2 = pd.concat(bands, axis=0)

In [62]:
# Inspect the first 15 rows of the stacked per-cohort band frames.
df2.head(15)

Unnamed: 0,date,cohort_name,ema,stdev,ub,lb
0,2018-01-26 00:00:00,Arches,1.0,,,
1,2018-01-29 00:00:00,Arches,1.5,,,
2,2018-01-30 00:00:00,Arches,1.625,,,
3,2018-01-31 00:00:00,Arches,1.71875,,,
4,2018-02-01 00:00:00,Arches,1.539062,,,
5,2018-02-02 00:00:00,Arches,1.404297,,,
6,2018-02-05 00:00:00,Arches,1.303223,0.237665,2.016217,0.590228
7,2018-02-06 00:00:00,Arches,1.477417,0.137075,1.888642,1.066192
8,2018-02-07 00:00:00,Arches,1.608063,0.141807,2.033483,1.182643
9,2018-02-08 00:00:00,Arches,1.956047,0.216304,2.604958,1.307136


In [63]:
# Count missing values per column of the stacked band frame.
df2.isnull().sum()

date             0
cohort_name      0
ema              0
stdev          146
ub             146
lb             146
dtype: int64

In [64]:
# Isolate the rows that have no rolling stdev, keeping only the cohort name
# and the smoothed value.
stdev_is_missing = df2['stdev'].isnull()
df_missing = df2.loc[stdev_is_missing, ['cohort_name', 'ema']]
df_missing.head()

Unnamed: 0,cohort_name,ema
0,Arches,1.0
1,Arches,1.5
2,Arches,1.625
3,Arches,1.71875
4,Arches,1.539062


In [65]:
# One fallback stdev per cohort: the standard deviation of that cohort's
# rows lacking a rolling stdev. Where std() yields NaN, 0 is used instead.
df_missing = (
    df_missing.groupby('cohort_name')
    .std()
    .fillna(0)
    .reset_index()
    .rename(index=str, columns={'ema': 'stdev_null'})
)
df_missing.head()

Unnamed: 0,cohort_name,stdev_null
0,Andromeda,4.222773
1,Apollo,0.0
2,Arches,0.251638
3,Badlands,0.0
4,Denali,0.0


In [66]:
# Attach each cohort's fallback stdev (stdev_null) to every one of its rows.
df = pd.merge(df2, df_missing, how='left', on='cohort_name')

In [67]:
# Preview the band frame with the per-cohort fallback stdev attached.
df.head()

Unnamed: 0,date,cohort_name,ema,stdev,ub,lb,stdev_null
0,2018-01-26 00:00:00,Arches,1.0,,,,0.251638
1,2018-01-29 00:00:00,Arches,1.5,,,,0.251638
2,2018-01-30 00:00:00,Arches,1.625,,,,0.251638
3,2018-01-31 00:00:00,Arches,1.71875,,,,0.251638
4,2018-02-01 00:00:00,Arches,1.539062,,,,0.251638


In [68]:
# Where the rolling stdev is missing, substitute the cohort's fallback value;
# the helper column is no longer needed afterwards.
idx = df['stdev'].isnull()
df['stdev'] = df['stdev'].mask(idx, df['stdev_null'])
df = df.drop(columns='stdev_null')
df.head()

Unnamed: 0,date,cohort_name,ema,stdev,ub,lb
0,2018-01-26 00:00:00,Arches,1.0,0.251638,,
1,2018-01-29 00:00:00,Arches,1.5,0.251638,,
2,2018-01-30 00:00:00,Arches,1.625,0.251638,,
3,2018-01-31 00:00:00,Arches,1.71875,0.251638,,
4,2018-02-01 00:00:00,Arches,1.539062,0.251638,,


In [69]:
# For rows whose upper band is missing, rebuild both bands from the smoothed
# value and the (now filled-in) stdev, three standard deviations either side.
idx = df['ub'].isnull()
half_width = df.loc[idx, 'stdev'] * 3
df.loc[idx, 'ub'] = df.loc[idx, 'ema'] + half_width
df.loc[idx, 'lb'] = df.loc[idx, 'ema'] - half_width
df.head()

Unnamed: 0,date,cohort_name,ema,stdev,ub,lb
0,2018-01-26 00:00:00,Arches,1.0,0.251638,1.754913,0.245087
1,2018-01-29 00:00:00,Arches,1.5,0.251638,2.254913,0.745087
2,2018-01-30 00:00:00,Arches,1.625,0.251638,2.379913,0.870087
3,2018-01-31 00:00:00,Arches,1.71875,0.251638,2.473663,0.963837
4,2018-02-01 00:00:00,Arches,1.539062,0.251638,2.293976,0.784149
