# Anomaly Detection Project

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from env import user, host, password
import os
import wrangle as w

# Acquire and Prep

In [2]:
df = w.get_cohort_logs_data()
# get data

In [3]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0


In [4]:
df.info()
# drop 'deleted_at'
# rows that didn't have a 'cohort_id' have nulls

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900223 entries, 0 to 900222
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   path        900222 non-null  object 
 3   user_id     900223 non-null  int64  
 4   cohort_id   847330 non-null  float64
 5   ip          900223 non-null  object 
 6   id          847330 non-null  float64
 7   name        847330 non-null  object 
 8   slack       847330 non-null  object 
 9   start_date  847330 non-null  object 
 10  end_date    847330 non-null  object 
 11  created_at  847330 non-null  object 
 12  updated_at  847330 non-null  object 
 13  deleted_at  0 non-null       float64
 14  program_id  847330 non-null  float64
dtypes: float64(4), int64(1), object(10)
memory usage: 109.9+ MB


In [5]:
#convert to object:
# user_id, cohort_id
df['user_id'] = df.user_id.astype(object)
df['cohort_id'] = df.cohort_id.astype(object)

In [6]:
df.cohort_id.dtype

dtype('O')

In [7]:
df.isna().sum()

date               0
time               0
path               1
user_id            0
cohort_id      52893
ip                 0
id             52893
name           52893
slack          52893
start_date     52893
end_date       52893
created_at     52893
updated_at     52893
deleted_at    900223
program_id     52893
dtype: int64

In [8]:
no_cid = df[df.cohort_id.isna()]
#selects rows without a cohort_id

In [9]:
no_cid.user_id.unique()
# These users had access but were not linked to any cohort. 
# However they have to have had some some sort of codeup association

array([48, 54, 58, 59, 61, 62, 63, 73, 74, 78, 79, 86, 88, 89, 100, 103,
       111, 137, 166, 176, 213, 247, 317, 349, 350, 351, 352, 353, 354,
       355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367,
       368, 369, 372, 375, 403, 406, 429, 544, 64, 713, 714, 715, 716,
       717, 718, 663, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728,
       729, 346, 731, 736, 644, 744, 782, 810, 814, 815], dtype=object)

In [10]:

for i in no_cid.user_id.unique(): #looks at the list of users who had a NaN under 'cohort_id'
    if df[df.user_id == i].cohort_id.isna().mean() < 1: # Mean of NaN's present under 'cohort_id' column for users in above list
        print(i, df[df.user_id == i].cohort_id.isna().mean()) # print user and mean if less than 1
# list of users who have something other than NaN present under 'cohort_id' column.    


88 0.4190715181932246
358 0.11423841059602649
375 0.07478753541076487
64 0.0015293325992536857
663 0.013468013468013467
346 0.47548711502199875
644 0.0035863717872086074
744 0.09583333333333334


In [11]:
x = df[df.user_id == 88].cohort_id.unique().tolist()
type(x)

list

In [137]:
list_users_w_cid = [88, 358, 375, 64, 663, 346, 644, 744]

for i in list_users_w_cid:
    x = df[df.user_id == i].cohort_id.unique().tolist()
    print(f'User_id ={i} is linked to the following cohorts: {x}')
    print('Cohort names:')
    print(df['name'][df.user_id == i].unique())
    print('------------------')
          

User_id =88 is linked to the following cohorts: [nan, 7.0, 12.0, 11.0]
Cohort names:
[nan 'Glacier' 'Joshua' 'Ike']
------------------
User_id =358 is linked to the following cohorts: [nan, 34.0]
Cohort names:
[nan 'Bayes']
------------------
User_id =375 is linked to the following cohorts: [nan, 31.0]
Cohort names:
[nan 'Andromeda']
------------------
User_id =64 is linked to the following cohorts: [1.0, 28.0, nan, 52.0]
Cohort names:
['Arches' 'Staff' nan 'Europa']
------------------
User_id =663 is linked to the following cohorts: [58.0, nan, 134.0]
Cohort names:
['Hyperion' nan 'Luna']
------------------
User_id =346 is linked to the following cohorts: [29.0, 21.0, nan]
Cohort names:
['Zion' 'Sequoia' nan]
------------------
User_id =644 is linked to the following cohorts: [57.0, nan]
Cohort names:
['Ganymede' nan]
------------------
User_id =744 is linked to the following cohorts: [nan, 28.0]
Cohort names:
[nan 'Staff']
------------------


In [13]:
#The following users are linked to multiple cohorts:
#    88, 64, 663, 346
#Maybe they signed up to multiple cohorts
#
#The other users are only linked to one cohort:
#    358 = 34, 375 = 31, 644 = 57, 744 = 28
#Perhaps these students were allowed access before they were assigned a cohort.     


In [14]:
df.path.unique().shape
# 2314 unique paths

(2314,)

In [15]:
df.path.value_counts().head(20)
#look only at top 10 paths

/                                                                            50313
search/search_index.json                                                     19519
javascript-i                                                                 18983
toc                                                                          18297
java-iii                                                                     13733
html-css                                                                     13635
java-ii                                                                      12685
spring                                                                       12524
jquery                                                                       11525
mysql                                                                        11033
java-i                                                                       10865
javascript-ii                                                                10730
appe

In [16]:
df['path_1'] = df.path.str.split('/').str[0]

In [17]:
df['path_2'] = df.path.str.split('/').str[1]

In [18]:
df['path_3'] = df.path.str.split('/').str[2]

In [19]:
df['path_4'] = df.path.str.split('/').str[3]

In [20]:
df['path_5'] = df.path.str.split('/').str[4]

In [21]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,path_1,path_2,path_3,path_4,path_5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,,,,,
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,java-ii,,,,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,java-ii,object-oriented-programming,,,
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,slides,object_oriented_programming,,,
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0,javascript-i,conditionals,,,


In [22]:
df[df.path_5.isna() == False]

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,path_1,path_2,path_3,path_4,path_5
2071,2018-01-30,11:37:48,content/laravel/intro/gitbook/images/favicon.ico,68,13.0,216.1.153.162,13.0,Kings,#kings,2016-05-23,2016-09-15,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,content,laravel,intro,gitbook,images
2073,2018-01-30,11:37:49,content/laravel/intro/gitbook/images/favicon.ico,68,13.0,216.1.153.162,13.0,Kings,#kings,2016-05-23,2016-09-15,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,content,laravel,intro,gitbook,images
2075,2018-01-30,11:37:50,content/laravel/quickstart/gitbook/images/favi...,68,13.0,216.1.153.162,13.0,Kings,#kings,2016-05-23,2016-09-15,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,content,laravel,quickstart,gitbook,images
2078,2018-01-30,11:38:31,content/laravel/intro/gitbook/images/favicon.ico,68,13.0,216.1.153.162,13.0,Kings,#kings,2016-05-23,2016-09-15,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,content,laravel,intro,gitbook,images
2080,2018-01-30,11:38:33,content/laravel/intro/gitbook/images/favicon.ico,68,13.0,216.1.153.162,13.0,Kings,#kings,2016-05-23,2016-09-15,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,content,laravel,intro,gitbook,images
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
881764,2021-04-08,15:05:29,appendix/open_data/www.who.int/research/en,837,133.0,66.69.94.59,133.0,Easley,#easley,2020-12-07,2021-06-08,2020-12-07 15:20:18,2020-12-07 15:20:18,,3.0,appendix,open_data,www.who.int,research,en
881780,2021-04-08,15:13:10,appendix/open_data/www.who.int/research/en,837,133.0,66.69.94.59,133.0,Easley,#easley,2020-12-07,2021-06-08,2020-12-07 15:20:18,2020-12-07 15:20:18,,3.0,appendix,open_data,www.who.int,research,en
882420,2021-04-09,00:10:17,content/html-css/gitbook/images/favicon.ico,80,14.0,136.50.29.193,14.0,Lassen,#lassen,2016-07-18,2016-11-10,2016-07-18 19:06:27,2016-07-18 19:06:27,,1.0,content,html-css,gitbook,images,favicon.ico
882426,2021-04-09,00:10:29,content/javascript/gitbook/images/favicon.ico,80,14.0,136.50.29.193,14.0,Lassen,#lassen,2016-07-18,2016-11-10,2016-07-18 19:06:27,2016-07-18 19:06:27,,1.0,content,javascript,gitbook,images,favicon.ico


In [23]:
df.head()
#drop id, deleted_at

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,path_1,path_2,path_3,path_4,path_5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,,,,,
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,java-ii,,,,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,java-ii,object-oriented-programming,,,
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,slides,object_oriented_programming,,,
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0,javascript-i,conditionals,,,


In [24]:
df = df.drop(columns=['id', 'deleted_at'])
#id is that same as cohort_id, and 'deleted_at' only has NaN values

In [25]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,,,,,
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,java-ii,,,,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,java-ii,object-oriented-programming,,,
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,slides,object_oriented_programming,,,
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,javascript-i,conditionals,,,


In [80]:
df.path


0                                                        /
1                                                  java-ii
2                      java-ii/object-oriented-programming
3                       slides/object_oriented_programming
4                                javascript-i/conditionals
                                ...                       
900218                                jquery/personal-site
900219                                   jquery/mapbox-api
900220                             jquery/ajax/weather-map
900221    anomaly-detection/discrete-probabilistic-methods
900222                                   jquery/mapbox-api
Name: path, Length: 900223, dtype: object

0

# Answers to Questions

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

PHP Full Stack Web Development

In [53]:
df[df.program_id == 1].path.value_counts().head(15)

/                                                                            1681
index.html                                                                   1011
javascript-i                                                                  736
html-css                                                                      542
spring                                                                        501
java-iii                                                                      479
java-ii                                                                       454
java-i                                                                        444
javascript-ii                                                                 429
appendix                                                                      409
jquery                                                                        344
mysql                                                                         284
content/html-css

In [47]:
# "javascript-i/introduction/working-with-data-types-operators-and-variables" is the most viewed lesson
# The paths above appears to be modules, not lessons

Java Full Stack Web Development

In [55]:
df[df.program_id == 2].path.value_counts().head(15)

/                                                                            35814
javascript-i                                                                 17457
toc                                                                          17428
search/search_index.json                                                     15212
java-iii                                                                     12683
html-css                                                                     12569
java-ii                                                                      11719
spring                                                                       11376
jquery                                                                       10693
mysql                                                                        10318
java-i                                                                       10016
javascript-ii                                                                 9861
appe

In [56]:
# javascript-i/introduction/working-with-data-types-operators-and-variables is the most viewed lesson for this program
# the paths above it appear to be modules

Data Science Program

In [57]:
df[df.program_id == 3].path.value_counts().head(15)

/                                                    8358
search/search_index.json                             2203
classification/overview                              1785
1-fundamentals/modern-data-scientist.jpg             1655
1-fundamentals/AI-ML-DL-timeline.jpg                 1651
1-fundamentals/1.1-intro-to-data-science             1633
classification/scale_features_or_not.svg             1590
fundamentals/AI-ML-DL-timeline.jpg                   1443
fundamentals/modern-data-scientist.jpg               1438
sql/mysql-overview                                   1424
fundamentals/intro-to-data-science                   1413
6-regression/1-overview                              1124
anomaly-detection/AnomalyDetectionCartoon.jpeg        829
anomaly-detection/overview                            804
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     754
Name: path, dtype: int64

In [58]:
# 'classification/overview' is the most viewed lesson for the data science program

Front End Web Dev

In [59]:
df[df.program_id == 4].path.value_counts().head(15)

content/html-css                               2
content/html-css/introduction.html             1
content/html-css/gitbook/images/favicon.ico    1
/                                              1
Name: path, dtype: int64

In [62]:
(df.program_id == 4).sum()

5

In [63]:
# The data only contains 5 rows pertaining to this program. content/html-css being the most viewed lesson

2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?

In [65]:
df.groupby('cohort_id').path.value_counts()
# groups by cohort id and shows unique path value counts

cohort_id  path                                       
1.0        /                                              626
           javascript-i                                   294
           html-css                                       215
           javascript-ii                                  204
           spring                                         192
                                                         ... 
139.0      java-iii/servlets                                1
           javascript-i/bom-and-dom/dom                     1
           javascript-i/objects                             1
           javascript-i/objects/math                        1
           jquery/essential-methods/attributes-and-css      1
Name: path, Length: 13565, dtype: int64

In [67]:
df[df.cohort_id == 1].path.value_counts().head()
# testing code for cohort_id = 1

/                626
javascript-i     294
html-css         215
javascript-ii    204
spring           192
Name: path, dtype: int64

In [83]:
for i in df['cohort_id'].unique():
    print(f"Cohort ID: {i}")
    print(df[df.cohort_id == i].path.value_counts().head())
    
#prints out cohort_id and top 5 value counts for path. 

Cohort ID: 8.0
/           210
java-iii     57
appendix     55
java-i       46
java-ii      46
Name: path, dtype: int64
Cohort ID: 22.0
/               1828
spring           744
java-iii         732
mysql            638
javascript-i     615
Name: path, dtype: int64
Cohort ID: 21.0
/                                  630
spring                             364
spring/fundamentals/views          207
javascript-i                       202
spring/fundamentals/controllers    190
Name: path, dtype: int64
Cohort ID: 1.0
/                626
javascript-i     294
html-css         215
javascript-ii    204
spring           192
Name: path, dtype: int64
Cohort ID: 16.0
spring        65
index.html    63
/             37
java-iii      29
mysql         24
Name: path, dtype: int64
Cohort ID: 18.0
/                149
javascript-i      69
javascript-ii     58
html-css          56
spring            56
Name: path, dtype: int64
Cohort ID: 19.0
/                                                   151
content/l

In [84]:
# cohort_id = 28 had 1817 visits to javascript-i
# cohort_id = 24 had 887 visits to javascript-i as well

In [87]:
df['name'][df.cohort_id == 28].head(1)
# cohort_id = 28 is staff so I will diregard the results

166284    Staff
Name: name, dtype: object

In [88]:
df['name'][df.cohort_id == 24].head(1)

60650    Voyageurs
Name: name, dtype: object

Voyageurs cohort had 887 visits to javascript-i

3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [100]:
min_user = 1000 #1000 path requests
u = 0 # will hold user_id

for i in df.user_id.unique(): #look through list unique user_ids
    if len(df.path[df.user_id == i]) < min_user: #if length is less than # of path requests
        min_user = len(df.path[df.user_id == i]) # min_user now holds that number of path requests
        u = i # holds user_id
        

        

In [102]:
min_user, u

(1, 66)

In [106]:
l = []
for i in df.user_id.unique(): #looks at list of unique user_ids
    if len(df.path[df.user_id == i]) <= min_user: # if length of list is equal to or less than one print user_id
        print(f"user_id: {i}")
        l.append(i)
        
#list of user_ids with 1 or less path requests

user_id: 66
user_id: 163
user_id: 165
user_id: 212
user_id: 592
user_id: 593
user_id: 619
user_id: 649
user_id: 652
user_id: 879
user_id: 918
user_id: 940
user_id: 952


In [120]:
df[df.user_id == 66]
# logged on once post graduation

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
1592,2018-01-29,15:48:48,/,66,13.0,67.198.116.192,Kings,#kings,2016-05-23,2016-09-15,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,,,,,,0,"[, ]"


In [122]:
df[df.user_id == 163]
# logged on once post graduation

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
36235,2018-03-29,22:40:57,/,163,19.0,192.171.117.234,Quincy,#quincy,2017-06-05,2017-09-22,2017-06-05 20:12:10,2017-06-05 20:12:10,1.0,,,,,,0,"[, ]"


In [124]:
df[df.user_id == 165]
# logged on once post graduation

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
36372,2018-03-30,21:37:04,index.html,165,16.0,24.243.3.200,Niagara,#niagara,2016-10-26,2017-03-09,2016-10-26 02:34:05,2016-10-26 14:30:19,2.0,index.html,,,,,0,[index.html]


In [125]:
df[df.user_id == 212]
# same

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
73251,2018-06-19,09:21:08,students/units/75/sub_units/268,212,1.0,170.248.173.247,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,students,units,75,sub_units,268,0,"[students, units, 75, sub_units, 268]"


In [126]:
df[df.user_id == 592]
#staff

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
451443,2020-02-07,12:17:45,/,592,28.0,97.105.19.58,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,,,,,,0,"[, ]"


In [127]:
df[df.user_id == 593]
#same

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
458459,2020-02-17,11:45:21,index.html,593,14.0,167.24.104.150,Lassen,#lassen,2016-07-18,2016-11-10,2016-07-18 19:06:27,2016-07-18 19:06:27,1.0,index.html,,,,,0,[index.html]


In [128]:
df[df.user_id == 619]
#opened the curriculum once while enrolled

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
491139,2020-03-23,14:10:17,/,619,57.0,97.105.19.58,Ganymede,#ganymede,2020-03-23,2020-08-20,2020-03-23 17:52:16,2020-03-23 17:52:16,2.0,,,,,,0,"[, ]"


In [129]:
df[df.user_id == 649]
# post-graduation

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
535518,2020-05-06,19:06:32,/,649,5.0,70.125.150.41,Everglades,#everglades,2014-11-18,2015-02-24,2016-06-14 19:52:26,2016-06-14 19:52:26,1.0,,,,,,0,"[, ]"


In [130]:
df[df.user_id == 652]
#staff

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
545172,2020-05-18,09:56:10,loguout,652,28.0,71.150.217.33,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,2.0,loguout,,,,,0,[loguout]


In [131]:
df[df.user_id == 879]
#logged in once during class

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
799236,2021-01-26,12:21:18,/,879,135.0,136.50.50.187,Marco,#marco,2021-01-25,2021-07-19,2021-01-20 21:31:11,2021-01-20 21:31:11,2.0,,,,,,0,"[, ]"


In [132]:
df[df.user_id == 918]
# logged in once on first day of class

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
847372,2021-03-15,16:59:56,/,918,138.0,71.221.46.112,Neptune,#neptune,2021-03-15,2021-09-03,2021-03-15 19:57:09,2021-03-15 19:57:09,2.0,,,,,,0,"[, ]"


In [133]:
df[df.user_id == 940]
# logged in once on first day of class

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
847404,2021-03-15,17:00:37,/,940,138.0,70.121.129.79,Neptune,#neptune,2021-03-15,2021-09-03,2021-03-15 19:57:09,2021-03-15 19:57:09,2.0,,,,,,0,"[, ]"


In [134]:
df[df.user_id == 952]
# post graduation

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
882087,2021-04-08,16:29:45,/,952,17.0,70.125.158.82,Olympic,#olympic,2017-02-06,2017-05-25,2017-02-06 17:49:10,2017-02-06 17:49:10,1.0,,,,,,0,"[, ]"


4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?

In [139]:
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

In [140]:
df = df.set_index('datetime').sort_index()

In [150]:
df.resample('1Min').user_id.count().sort_values().tail()

datetime
2019-08-02 14:02:00     65
2020-04-05 17:41:00     83
2018-06-02 15:05:00     84
2019-12-19 23:58:00    125
2019-03-03 22:52:00    170
Name: user_id, dtype: int64

In [154]:
df.loc['2019-03-03 22:52:00':'2019-03-03 22:53:00'].user_id.unique()
# users 341 had 170 requests in under a minute, ip = 204.44.112.76

array([341, 54], dtype=object)

In [156]:
pd. set_option('display.max_rows', None)

In [157]:
df.loc['2019-03-03 22:52:00':'2019-03-03 22:53:00']
# user_id = 341, ip = 204.44.112.76, cohort = Zion

Unnamed: 0_level_0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-03-03 22:52:05,2019-03-03,22:52:05,html-css,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,html-css,,,,,0,[html-css]
2019-03-03 22:52:06,2019-03-03,22:52:06,javascript-i,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,javascript-i,,,,,0,[javascript-i]
2019-03-03 22:52:06,2019-03-03,22:52:06,java-i,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,java-i,,,,,0,[java-i]
2019-03-03 22:52:06,2019-03-03,22:52:06,java-ii,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,java-ii,,,,,0,[java-ii]
2019-03-03 22:52:06,2019-03-03,22:52:06,javascript-ii,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,javascript-ii,,,,,0,[javascript-ii]
2019-03-03 22:52:06,2019-03-03,22:52:06,jquery,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,jquery,,,,,0,[jquery]
2019-03-03 22:52:06,2019-03-03,22:52:06,mysql,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,mysql,,,,,0,[mysql]
2019-03-03 22:52:06,2019-03-03,22:52:06,java-iii,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,java-iii,,,,,0,[java-iii]
2019-03-03 22:52:06,2019-03-03,22:52:06,spring,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,spring,,,,,0,[spring]
2019-03-03 22:52:06,2019-03-03,22:52:06,appendix,341,29.0,204.44.112.76,Zion,#zion,2019-01-22,2019-06-04,2019-01-20 23:18:57,2019-01-20 23:18:57,2.0,appendix,,,,,0,[appendix]


In [158]:
df.loc['2019-12-19 23:58:00':'2019-12-19 23:59:00']
# user 526, ip = 172.124.70.146, cohort = Europa
# accessed multiple lessons in under a minute

Unnamed: 0_level_0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-12-19 23:58:00,2019-12-19,23:58:00,javascript-ii/npm,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,javascript-ii,npm,,,,0,"[javascript-ii, npm]"
2019-12-19 23:58:00,2019-12-19,23:58:00,javascript-ii/modules,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,javascript-ii,modules,,,,0,"[javascript-ii, modules]"
2019-12-19 23:58:00,2019-12-19,23:58:00,javascript-ii/ajax-api,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,javascript-ii,ajax-api,,,,0,"[javascript-ii, ajax-api]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-i/introduction-to-java,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-i,introduction-to-java,,,,0,"[java-i, introduction-to-java]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-i/syntax-types-and-variables,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-i,syntax-types-and-variables,,,,0,"[java-i, syntax-types-and-variables]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-i/console-io,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-i,console-io,,,,0,"[java-i, console-io]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-i/control-statements-and-loops,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-i,control-statements-and-loops,,,,0,"[java-i, control-statements-and-loops]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-ii/object-oriented-programming,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-ii,object-oriented-programming,,,,0,"[java-ii, object-oriented-programming]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-i/strings,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-i,strings,,,,0,"[java-i, strings]"
2019-12-19 23:58:00,2019-12-19,23:58:00,java-i/methods,526,52.0,172.124.70.146,Europa,#europa,2019-11-04,2020-04-17,2019-11-04 18:27:07,2019-11-04 18:27:07,2.0,java-i,methods,,,,0,"[java-i, methods]"


In [159]:
df.loc['2018-06-02 15:05:00':'2018-06-02 15:06:00']
# user 128, ip = 108.65.244.91, cohort = Teddy

Unnamed: 0_level_0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2018-06-02 15:05:00,2018-06-02,15:05:00,content/html-css/css-i,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-i,,,0,"[content, html-css, css-i]"
2018-06-02 15:05:01,2018-06-02,15:05:01,content/html-css/css-i/introduction.html,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-i,introduction.html,,0,"[content, html-css, css-i, introduction.html]"
2018-06-02 15:05:03,2018-06-02,15:05:03,content/html-css/css-i/selectors-and-propertie...,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-i,selectors-and-properties.html,,0,"[content, html-css, css-i, selectors-and-prope..."
2018-06-02 15:05:05,2018-06-02,15:05:05,content/html-css/css-i/box-model.html,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-i,box-model.html,,0,"[content, html-css, css-i, box-model.html]"
2018-06-02 15:05:07,2018-06-02,15:05:07,content/html-css/css-i/positioning.html,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-i,positioning.html,,0,"[content, html-css, css-i, positioning.html]"
2018-06-02 15:05:08,2018-06-02,15:05:08,content/html-css/css-ii,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-ii,,,0,"[content, html-css, css-ii]"
2018-06-02 15:05:10,2018-06-02,15:05:10,content/html-css/css-ii/media-queries.html,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-ii,media-queries.html,,0,"[content, html-css, css-ii, media-queries.html]"
2018-06-02 15:05:12,2018-06-02,15:05:12,content/html-css/css-ii/grids.html,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-ii,grids.html,,0,"[content, html-css, css-ii, grids.html]"
2018-06-02 15:05:13,2018-06-02,15:05:13,content/html-css/css-ii/bootstrap-introduction...,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-ii,bootstrap-introduction.html,,0,"[content, html-css, css-ii, bootstrap-introduc..."
2018-06-02 15:05:13,2018-06-02,15:05:13,content/html-css/css-ii/bootstrap-grid-system....,138,22.0,108.65.244.91,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2.0,content,html-css,css-ii,bootstrap-grid-system.html,,0,"[content, html-css, css-ii, bootstrap-grid-sys..."


In [160]:
df.loc['2020-04-05 17:41:00':'2020-04-05 17:42:00']
# user 570, ip 172.124.67.93, cohort Fortuna

Unnamed: 0_level_0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-04-05 17:41:04,2020-04-05,17:41:04,appendix/slides,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,appendix,slides,,,,0,"[appendix, slides]"
2020-04-05 17:41:04,2020-04-05,17:41:04,appendix/slides,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,appendix,slides,,,,0,"[appendix, slides]"
2020-04-05 17:41:07,2020-04-05,17:41:07,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:08,2020-04-05,17:41:08,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:09,2020-04-05,17:41:09,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:09,2020-04-05,17:41:09,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:09,2020-04-05,17:41:09,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:09,2020-04-05,17:41:09,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:09,2020-04-05,17:41:09,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"
2020-04-05 17:41:09,2020-04-05,17:41:09,slides/arrays,570,53.0,172.124.67.93,Fortuna,#fortuna,2020-01-13,2020-06-23,2020-01-13 21:17:08,2020-01-13 21:17:08,2.0,slides,arrays,,,,0,"[slides, arrays]"


In [161]:
df.loc['2019-08-02 14:02:00':'2019-08-02 14:03:00']
# user 448, ip 97.105.19.58, cohort Ceres, over 20 requests in a minute

Unnamed: 0_level_0,date,time,path,user_id,cohort_id,ip,name,slack,start_date,end_date,created_at,updated_at,program_id,path_1,path_2,path_3,path_4,path_5,lesson,subpath
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-08-02 14:02:02,2019-08-02,14:02:02,javascript-i/introduction/primitive-types,455,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,introduction,primitive-types,,,0,"[javascript-i, introduction, primitive-types]"
2019-08-02 14:02:02,2019-08-02,14:02:02,javascript-i/introduction,435,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,introduction,,,,0,"[javascript-i, introduction]"
2019-08-02 14:02:05,2019-08-02,14:02:05,javascript-i/introduction/primitive-types,458,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,introduction,primitive-types,,,0,"[javascript-i, introduction, primitive-types]"
2019-08-02 14:02:13,2019-08-02,14:02:13,javascript-i,448,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,,,,,0,[javascript-i]
2019-08-02 14:02:21,2019-08-02,14:02:21,jquery,448,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,jquery,,,,,0,[jquery]
2019-08-02 14:02:22,2019-08-02,14:02:22,javascript-i,448,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,,,,,0,[javascript-i]
2019-08-02 14:02:25,2019-08-02,14:02:25,javascript-i/introduction/operators,458,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,introduction,operators,,,0,"[javascript-i, introduction, operators]"
2019-08-02 14:02:25,2019-08-02,14:02:25,jquery/introduction,448,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,jquery,introduction,,,,0,"[jquery, introduction]"
2019-08-02 14:02:25,2019-08-02,14:02:25,javascript-i/introduction/primitive-types,458,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,introduction,primitive-types,,,0,"[javascript-i, introduction, primitive-types]"
2019-08-02 14:02:29,2019-08-02,14:02:29,javascript-i/introduction,458,33.0,97.105.19.58,Ceres,#ceres,2019-07-15,2019-12-11,2019-07-15 16:57:21,2019-07-15 16:57:21,2.0,javascript-i,introduction,,,,0,"[javascript-i, introduction]"


5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

In [170]:
pd. set_option('display.max_rows', 100)

In [173]:
path_values = [df[df.program_id == 3].path.value_counts()]

In [174]:
path_values
# java is present under paths for some ds students

[/                                           8358
 search/search_index.json                    2203
 classification/overview                     1785
 1-fundamentals/modern-data-scientist.jpg    1655
 1-fundamentals/AI-ML-DL-timeline.jpg        1651
                                             ... 
 Index.html                                     1
 decision-trees                                 1
 spring/extra-features/error-pages              1
 javascript-i/conditionals                      1
 3-sql                                          1
 Name: path, Length: 682, dtype: int64]

  df.tail(100)[df.program_id == 3].path.value_counts()


ValueError: cannot reindex from a duplicate axis