# Time Series Project

##### Luke Becker, Data Scientist

In [1]:
# Importing libraries and functions for use.
# from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

# splitting data:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from scipy.stats import entropy

# web-based requests
import requests
import rapid_env

# Importing the os library specifically for reading the csv once I've created the file in my working directory.
import os

import acquire
import prepare
import env
import rapid_env


# This is to make sure matplotlib doesn't throw the following error:
# The next line fixes "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

Imported env successfully.
credentials loaded successfully
End of acquire.py file.
Loaded all prepare functions.


# Plan

Data source is from Codeup's curriculum logs from January 2018 until October 2020. These logs track income IP addresses which are accessing the website's curriculum and possibly other areas of the website.

#### Objectives:
1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
2. Is there a cohort that referred to a lesson significantly more that other cohorts seemed to gloss over? 
3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students? 
4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses? Any odd user-agents? 
5. At some point in the last year, ability for students and alumni to cross-access curriculum (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before? 
6. What topics are grads continuing to reference after graduation and into their jobs (for each program)? 
7. Which lessons are least accessed? 
8. Anything else I should be aware of? 


### Ideas:
- Try creating continuous variables from my categorical variables.
- Filter down the data to a specific cohort and looking at data that way to hopefully find something useful.
- Use as my attribute a single user id; loop through each user id. Then, *by user* find anomalies with regard to that *individual user*, not to the patterns across *all* users.
- Use this code to help create a categorical variable from continuous variables: `groupby().size() = continuous variable`
- *Nice to have:* to help me visualize, install an ip address library to map out where each ip address is.

## Acquire

In [2]:
# def get_time_series(raw_data = 'anonymized-curriculum-access.txt'):

#     colnames = ['date', 'timestamp', 'webpage', 'user_id', 'cohort_id', 'ip']

#     df = pd.read_csv('anonymized-curriculum-access.txt', engine='python',
#                      header=None,
#                      index_col=False,
#                      names=colnames,
#                      sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
#                      na_values='"-"',
#                      usecols=[0,1,2,3,4,5])
#     return df
    

In [3]:
df = acquire.get_time_series('anonymized-curriculum-access.txt')
df.head()

Unnamed: 0,date,timestamp,webpage,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719459 entries, 0 to 719458
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   date       719459 non-null  object 
 1   timestamp  719459 non-null  object 
 2   webpage    719458 non-null  object 
 3   user_id    719459 non-null  int64  
 4   cohort_id  674619 non-null  float64
 5   ip         719459 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 32.9+ MB


In [5]:
# Importing the Cohort data

In [6]:
# Adding cohort names


cohorts = pd.read_csv('cohorts.csv', index_col = 'cohort_id')
cohorts

Unnamed: 0_level_0,name,start_date,end_date,program_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Arches,2014-02-04,2014-04-22,1
2,Badlands,2014-06-04,2014-08-22,1
3,Carlsbad,2014-09-04,2014-11-05,1
4,Denali,2014-10-20,2015-01-18,1
5,Everglades,2014-11-18,2015-02-24,1
6,Franklin,2015-02-03,2015-05-26,1
7,Glacier,2015-06-05,2015-10-06,1
8,Hampton,2015-09-22,2016-02-06,1
9,Apollo,2015-03-30,2015-07-29,4
10,Balboa,2015-11-03,2016-03-11,4


In [None]:
# Acquire complete.

## Prepare Stage:

- I need to combine the date and timestamp into one column and make it a datetime dtype.
- Drop the old date and time seprate columns.
- Set the new datetime as the index
- Add year, month, day, hour and weekday columns to primary dataframe

In [7]:
df.shape

(719459, 6)

In [8]:
df = prepare.prep_web_project(df)
df.shape

(719459, 10)

In [9]:
df.head()

Unnamed: 0_level_0,webpage,user_id,cohort_id,ip,year,month,day,hour,weekday,is_ds
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,2018,1,26,9,Friday,False


In [None]:
# Prep Function:

# def prep_web_project(df):
#     '''
#     This function will prepare the dataframe for exploration. 
#     '''
    
#     # Combining the date and time into one column
#     df['date_time'] = df['date'] + " " + df["timestamp"]
#     df['date_time'] = pd.to_datetime(df.date_time)
    
#     # Dropping old columns:
#     df.drop(columns = ['date', 'timestamp'], inplace = True)
    
#     # Now to set that dt as the index:
#     df = df.set_index('date_time')
    
#     # Adding columns for future analysis and exploration:
    
#     df['year'] = df.index.year
#     df['month'] = df.index.month
#     df['day'] = df.index.day
#     df['hour'] = df.index.hour
#     df['weekday'] = df.index.day_name()
    
#     return df
    

In [None]:
# Creating cohort type mask; ds vs. webdev:

# cohorts['is_ds'] = cohorts.program_id == 3

In [10]:
cohorts.head()

Unnamed: 0_level_0,name,start_date,end_date,program_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Arches,2014-02-04,2014-04-22,1
2,Badlands,2014-06-04,2014-08-22,1
3,Carlsbad,2014-09-04,2014-11-05,1
4,Denali,2014-10-20,2015-01-18,1
5,Everglades,2014-11-18,2015-02-24,1


In [11]:
df_merge = pd.merge(df, cohorts, on = 'cohort_id')
df_merge.head()

Unnamed: 0,webpage,user_id,cohort_id,ip,year,month,day,hour,weekday,is_ds,name,start_date,end_date,program_id
0,/,1,8.0,97.105.19.61,2018,1,26,9,Friday,False,Hampton,2015-09-22,2016-02-06,1
1,java-ii,1,8.0,97.105.19.61,2018,1,26,9,Friday,False,Hampton,2015-09-22,2016-02-06,1
2,java-ii/object-oriented-programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False,Hampton,2015-09-22,2016-02-06,1
3,slides/object_oriented_programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False,Hampton,2015-09-22,2016-02-06,1
4,javascript-i/functions,1,8.0,97.105.19.61,2018,1,26,10,Friday,False,Hampton,2015-09-22,2016-02-06,1


In [13]:
df['is_ds'] = df.cohort_id.isin([30, 34, 55, 59])

In [14]:
# I should probably also add a counter column like this:
# df[pageviews] == 1

In [15]:
df

Unnamed: 0_level_0,webpage,user_id,cohort_id,ip,year,month,day,hour,weekday,is_ds
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,2018,1,26,9,Friday,False
...,...,...,...,...,...,...,...,...,...,...
2020-11-02 16:48:13,javascript-i/coffee-project,763,62.0,107.192.148.199,2020,11,2,16,Monday,False
2020-11-02 16:48:17,javascript-i/mapbox-api,771,62.0,172.125.226.175,2020,11,2,16,Monday,False
2020-11-02 16:48:18,javascript-i/coffee-project,771,62.0,172.125.226.175,2020,11,2,16,Monday,False
2020-11-02 16:48:28,javascript-i/bom-and-dom/bom,771,62.0,172.125.226.175,2020,11,2,16,Monday,False


In [None]:
# This means that my log data starts at part-wary through the Sequoia cohort on.

# Now I also have a categorical variable in the `is_ds` column that I can use later.

In [None]:
# The above didn't work, simply because I think the cohort ids are out of sync between the two dataframes...
# Maybe the best thing to do is simply use the cohort Ids I already have for now...

## Explore - Answers to Questions


### 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [16]:
df2 = df.copy()
df2.shape

(719459, 10)

In [17]:
# df2 = df2.resample("D").sum()
df2

Unnamed: 0_level_0,webpage,user_id,cohort_id,ip,year,month,day,hour,weekday,is_ds
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,2018,1,26,9,Friday,False
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,2018,1,26,9,Friday,False
...,...,...,...,...,...,...,...,...,...,...
2020-11-02 16:48:13,javascript-i/coffee-project,763,62.0,107.192.148.199,2020,11,2,16,Monday,False
2020-11-02 16:48:17,javascript-i/mapbox-api,771,62.0,172.125.226.175,2020,11,2,16,Monday,False
2020-11-02 16:48:18,javascript-i/coffee-project,771,62.0,172.125.226.175,2020,11,2,16,Monday,False
2020-11-02 16:48:28,javascript-i/bom-and-dom/bom,771,62.0,172.125.226.175,2020,11,2,16,Monday,False


In [18]:
df2 = df2.groupby(['webpage', 'cohort_id'])['user_id'].count()
df2 = df2.reset_index()
df2

Unnamed: 0,webpage,cohort_id,user_id
0,%20https://github.com/RaulCPena,55.0,1
1,",%20https://github.com/RaulCPena",55.0,1
2,.git,24.0,1
3,.gitignore,24.0,1
4,.well-known/assetlinks.json,58.0,2
...,...,...,...
11775,web-dev-day-two,61.0,2
11776,working-with-time-series-data,28.0,1
11777,working-with-time-series-data,59.0,14
11778,wp-admin,22.0,1


In [19]:
df2.describe()

Unnamed: 0,cohort_id,user_id
count,11780.0,11780.0
mean,30.401273,57.268081
std,16.559253,130.288566
min,1.0,1.0
25%,19.0,2.0
50%,28.0,6.0
75%,34.0,51.0
max,62.0,4633.0


In [20]:
df2.sort_values(by = 'user_id', ascending = False)

Unnamed: 0,webpage,cohort_id,user_id
29,/,28.0,4633
25,/,24.0,2098
42,/,59.0,2041
37,/,53.0,1962
34,/,34.0,1842
...,...,...,...
5139,content/php_ii/array-functions/explode-and-imp...,23.0,1
5140,content/php_ii/array-functions/explode-and-imp...,31.0,1
5142,content/php_ii/array-functions/gitbook/images/...,14.0,1
5145,content/php_ii/array-functions/gitbook/images/...,18.0,1


In [30]:
# I'm dropping all requests to the homepage, since that isn't useful information for the question proposed.
df2 = df2[df2.webpage != "/"]
df2 = df2[~df2.webpage.str.contains("github.com")]
df2.shape

(11738, 4)

In [31]:
df2.sort_values(by = 'user_id', ascending = False)

Unnamed: 0,webpage,cohort_id,user_id,is_ds
11597,toc,29.0,1457,False
10354,search/search_index.json,33.0,1376,False
10360,search/search_index.json,56.0,1361,False
11604,toc,53.0,1273,False
7888,javascript-i,28.0,1263,False
...,...,...,...,...
5139,content/php_ii/array-functions/explode-and-imp...,23.0,1,False
5140,content/php_ii/array-functions/explode-and-imp...,31.0,1,False
5142,content/php_ii/array-functions/gitbook/images/...,14.0,1,False
5145,content/php_ii/array-functions/gitbook/images/...,18.0,1,False


In [32]:
# Going to remove other rows, trying to isolate down to the DS cohorts.

df2['is_ds'] = df2.cohort_id.isin([30, 34, 55, 59])

In [33]:
# Isolating only the ds cohorts, and also renaming the counted user_id column to the number of hits

ds_page_counts = df2[df2.is_ds == True]
ds_page_counts.rename(columns = {'user_id': 'pageviews_by_cohort'}, inplace = True)

In [34]:
ds_page_counts.sort_values(by = 'pageviews_by_cohort', ascending = False)

Unnamed: 0,webpage,cohort_id,pageviews_by_cohort,is_ds
3470,classification/overview,59.0,759,True
180,1-fundamentals/modern-data-scientist.jpg,34.0,626,True
157,1-fundamentals/AI-ML-DL-timeline.jpg,34.0,624,True
80,1-fundamentals/1.1-intro-to-data-science,34.0,615,True
1226,6-regression/1-overview,55.0,595,True
...,...,...,...,...
3001,appendix/open_data/www.flickr.com/services/api,59.0,1,True
989,4-python/handling-duplicate-values,59.0,1,True
3000,appendix/open_data/www.databasefootball.com,59.0,1,True
986,4-python/error-handling,59.0,1,True


In [35]:
ds_page_counts

Unnamed: 0,webpage,cohort_id,pageviews_by_cohort,is_ds
62,1-fundamentals,59.0,10,True
80,1-fundamentals/1.1-intro-to-data-science,34.0,615,True
81,1-fundamentals/1.1-intro-to-data-science,55.0,461,True
82,1-fundamentals/1.1-intro-to-data-science,59.0,460,True
89,1-fundamentals/1.2-data-science-pipeline,34.0,221,True
...,...,...,...,...
11573,timeseries/working-with-time-series-data-with-...,34.0,2,True
11574,timeseries/working-with-time-series-data-with-...,55.0,2,True
11575,timeseries/working-with-time-series-data-with-...,59.0,4,True
11601,toc,34.0,9,True


In [36]:
# Now I need to group by lesson, not just the endpoint. In the df above, one can see that the fundamentals page has several hits.

# Using str.split():

ds_page_counts_split = ds_page_counts.webpage.str.split('/', n = 1, expand = True)
ds_page_counts_split.rename(columns = {0: 'module', 1: 'lesson'}, inplace = True)
ds_page_counts_split.head()

Unnamed: 0,module,lesson
62,1-fundamentals,
80,1-fundamentals,1.1-intro-to-data-science
81,1-fundamentals,1.1-intro-to-data-science
82,1-fundamentals,1.1-intro-to-data-science
89,1-fundamentals,1.2-data-science-pipeline


In [37]:
ds_page_counts = pd.merge(ds_page_counts, ds_page_counts_split, right_index = True, left_index = True)

In [38]:
ds_page_counts

Unnamed: 0,webpage,cohort_id,pageviews_by_cohort,is_ds,module,lesson
62,1-fundamentals,59.0,10,True,1-fundamentals,
80,1-fundamentals/1.1-intro-to-data-science,34.0,615,True,1-fundamentals,1.1-intro-to-data-science
81,1-fundamentals/1.1-intro-to-data-science,55.0,461,True,1-fundamentals,1.1-intro-to-data-science
82,1-fundamentals/1.1-intro-to-data-science,59.0,460,True,1-fundamentals,1.1-intro-to-data-science
89,1-fundamentals/1.2-data-science-pipeline,34.0,221,True,1-fundamentals,1.2-data-science-pipeline
...,...,...,...,...,...,...
11573,timeseries/working-with-time-series-data-with-...,34.0,2,True,timeseries,working-with-time-series-data-with-pandas
11574,timeseries/working-with-time-series-data-with-...,55.0,2,True,timeseries,working-with-time-series-data-with-pandas
11575,timeseries/working-with-time-series-data-with-...,59.0,4,True,timeseries,working-with-time-series-data-with-pandas
11601,toc,34.0,9,True,toc,


In [40]:
ds_page_counts.sort_values(by = 'pageviews_by_cohort', ascending = False)

Unnamed: 0,webpage,cohort_id,pageviews_by_cohort,is_ds,module,lesson
3470,classification/overview,59.0,759,True,classification,overview
180,1-fundamentals/modern-data-scientist.jpg,34.0,626,True,1-fundamentals,modern-data-scientist.jpg
157,1-fundamentals/AI-ML-DL-timeline.jpg,34.0,624,True,1-fundamentals,AI-ML-DL-timeline.jpg
80,1-fundamentals/1.1-intro-to-data-science,34.0,615,True,1-fundamentals,1.1-intro-to-data-science
1226,6-regression/1-overview,55.0,595,True,6-regression,1-overview
...,...,...,...,...,...,...
3001,appendix/open_data/www.flickr.com/services/api,59.0,1,True,appendix,open_data/www.flickr.com/services/api
989,4-python/handling-duplicate-values,59.0,1,True,4-python,handling-duplicate-values
3000,appendix/open_data/www.databasefootball.com,59.0,1,True,appendix,open_data/www.databasefootball.com
986,4-python/error-handling,59.0,1,True,4-python,error-handling


In [95]:
ds_max_row = ds_page_counts.groupby(['cohort_id', 'webpage'])[['pageviews_by_cohort']].sum()

In [96]:
ds_max_row

Unnamed: 0_level_0,Unnamed: 1_level_0,pageviews_by_cohort
cohort_id,webpage,Unnamed: 2_level_1
34.0,1-fundamentals/1.1-intro-to-data-science,615
34.0,1-fundamentals/1.2-data-science-pipeline,221
34.0,1-fundamentals/1.3-pipeline-demo,130
34.0,1-fundamentals/2.1-excel-overview,86
34.0,1-fundamentals/2.1-spreadsheets-overview,4
...,...,...
59.0,timeseries/sarimax,4
59.0,timeseries/svr,6
59.0,timeseries/working-with-time-series-data,83
59.0,timeseries/working-with-time-series-data-with-pandas,4


In [81]:
ds_max_row[[ds_max_row.pageviews_by_cohort == ds_max_row.pageviews_by_cohort.max()]]

Unnamed: 0_level_0,Unnamed: 1_level_0,pageviews_by_cohort
cohort_id,webpage,Unnamed: 2_level_1
59.0,classification/overview,759


In [91]:

ds_max_row.idxmax()

pageviews_by_cohort    (59.0, classification/overview)
dtype: object

In [100]:
ds_max_row.pageviews_by_cohort.rank()

cohort_id  webpage                                             
34.0       1-fundamentals/1.1-intro-to-data-science                1191.0
           1-fundamentals/1.2-data-science-pipeline                1136.0
           1-fundamentals/1.3-pipeline-demo                        1030.0
           1-fundamentals/2.1-excel-overview                        930.5
           1-fundamentals/2.1-spreadsheets-overview                 403.0
                                                                    ...  
59.0       timeseries/sarimax                                       403.0
           timeseries/svr                                           501.0
           timeseries/working-with-time-series-data                 923.5
           timeseries/working-with-time-series-data-with-pandas     403.0
           working-with-time-series-data                            664.5
Name: pageviews_by_cohort, Length: 1194, dtype: float64

### I've almost gotten this, but I can't seem to figure out how to return just the first row of each cohort...

Maybe try .idxmax() after doing a boolean by cohort? something like that...

In [106]:
ds_max_row.groupby(level = 0).apply(max)

# Need to somehow keep the webpage index in here.

Unnamed: 0_level_0,pageviews_by_cohort
cohort_id,Unnamed: 1_level_1
34.0,626
55.0,595
59.0,759


In [None]:
# [# Looks for a list to return [# regular python list here]]

In [115]:
ds_max_row.sort_values(by = 'pageviews_by_cohort', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,pageviews_by_cohort
cohort_id,webpage,Unnamed: 2_level_1
59.0,classification/overview,759
34.0,1-fundamentals/modern-data-scientist.jpg,626
34.0,1-fundamentals/AI-ML-DL-timeline.jpg,624
34.0,1-fundamentals/1.1-intro-to-data-science,615
55.0,6-regression/1-overview,595
...,...,...
59.0,appendix/open_data/www.flickr.com/services/api,1
59.0,appendix/open_data/www.followthemoney.org,1
59.0,appendix/open_data/www.openstreetmap.org,1
59.0,appendix/open_data/www.stat.ucla.edu/data,1


In [127]:
ds_max_row.rank(axis = 0, ascending = False, method = 'max')

AttributeError: 'DataFrame' object has no attribute 'get_level_values'

In [114]:
ds_max_row[ds_max_row.index.get_level_values(0) == 34.0]

Unnamed: 0_level_0,Unnamed: 1_level_0,pageviews_by_cohort
cohort_id,webpage,Unnamed: 2_level_1
34.0,1-fundamentals/1.1-intro-to-data-science,615
34.0,1-fundamentals/1.2-data-science-pipeline,221
34.0,1-fundamentals/1.3-pipeline-demo,130
34.0,1-fundamentals/2.1-excel-overview,86
34.0,1-fundamentals/2.1-spreadsheets-overview,4
34.0,...,...
34.0,timeseries/sarimax,1
34.0,timeseries/svr,1
34.0,timeseries/working-with-time-series-data,2
34.0,timeseries/working-with-time-series-data-with-pandas,2


In [146]:
ds_max_row_2 = ds_page_counts.groupby(['cohort_id', 'webpage'])[['pageviews_by_cohort']].sum()

In [147]:
ds_max_row_2

Unnamed: 0_level_0,Unnamed: 1_level_0,pageviews_by_cohort
cohort_id,webpage,Unnamed: 2_level_1
34.0,1-fundamentals/1.1-intro-to-data-science,615
34.0,1-fundamentals/1.2-data-science-pipeline,221
34.0,1-fundamentals/1.3-pipeline-demo,130
34.0,1-fundamentals/2.1-excel-overview,86
34.0,1-fundamentals/2.1-spreadsheets-overview,4
...,...,...
59.0,timeseries/sarimax,4
59.0,timeseries/svr,6
59.0,timeseries/working-with-time-series-data,83
59.0,timeseries/working-with-time-series-data-with-pandas,4


In [None]:
# Check out the rank function within .groupby().
# Run a 1 sample t-test on the pages seen by cohort type.
# eg, what is the average hits on pandas pages from the Curie cohort, vs the overall datascience average?
# Can I then turn that into a function for the high level pages, instead of the individual pages within each lesson?
# ie, group pages by module instead of by individual page...

## Is there a cohort that referred to a lesson significantly more that other cohorts seemed to gloss over? 