In [7]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns
import acquire
from acquire import get_codeup_data
from acquire import get_cohort_data
from prepare import prep_codeup_data
from prepare import prep_cohort_data

# Goal of this project is to answer a set of questions regarding the Codeup dataset that will be delivered to Maggie via email, which she will use as part of a presentation in a board meeting. 

-------------------------------------------------------------

# Acquire

**First thing is to acquire the data from the acquire.py file**

In [2]:
df = get_codeup_data()

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


**Acquiring the cohort dataset in order to join with codeup dataset, matching the cohort name with the id**

In [4]:
cohort = get_cohort_data()

In [5]:
cohort.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,cohort_id,name,start_date,end_date,program_id,
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,


# Prepare

**Cleaning the cohort data by renaming columns and changing the cohort id to an integer**

In [8]:
cohort = prep_cohort_data(cohort)

In [9]:
cohort.head()

Unnamed: 0,cohort_id,name,start_date,end_date
1,1,Arches,2014-02-04,2014-04-22
2,2,Badlands,2014-06-04,2014-08-22
3,3,Carlsbad,2014-09-04,2014-11-05
4,4,Denali,2014-10-20,2015-01-18
5,5,Everglades,2014-11-18,2015-02-24


**Running the main prep function to clean up the primary dataset. This will rename the columns, merge the two dataframes together, change the date to datetime and set the date as the index**

In [10]:
df = prep_codeup_data(df, cohort)

In [11]:
df.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-26 09:55:03,/,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 09:56:02,java-ii,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 09:56:06,slides/object_oriented_programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 10:40:15,javascript-i/functions,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06


**Going to split the data into two datasets, data science and web development**

In [14]:
data_science = df[(df.name == 'Curie') | (df.name == 'Bayes') | (df.name == 'Ada') | (df.name == 'Darden')]

In [17]:
data_science.shape

(68016, 7)

In [18]:
web_dev = df[(df.name != 'Curie') & (df.name != 'Bayes') & (df.name != 'Ada') & (df.name != 'Darden')]

In [19]:
web_dev.shape

(606602, 7)

# Explore / Answer the email questions

 # 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [20]:
#Create a dataframe for each of the data science cohorts(there are 3 of them)
bayes = data_science[data_science.name == 'Bayes']

In [21]:
bayes.page_viewed.value_counts().head(15)

/                                                    1842
1-fundamentals/modern-data-scientist.jpg              626
1-fundamentals/AI-ML-DL-timeline.jpg                  624
1-fundamentals/1.1-intro-to-data-science              615
search/search_index.json                              551
6-regression/1-overview                               521
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     386
10-anomaly-detection/1-overview                       383
6-regression/5.0-evaluate                             333
5-stats/3-probability-distributions                   320
5-stats/4.2-compare-means                             315
appendix/cli-git-overview                             311
6-regression/7.0-model                                310
6-regression/4.0-explore                              267
6-regression/3.0-split-and-scale                      260
Name: page_viewed, dtype: int64

In [22]:
darden = data_science[data_science.name == 'Darden']

In [23]:
darden.page_viewed.value_counts().head(15)

/                                           2041
classification/overview                      759
classification/scale_features_or_not.svg     590
sql/mysql-overview                           513
1-fundamentals/AI-ML-DL-timeline.jpg         470
1-fundamentals/modern-data-scientist.jpg     470
1-fundamentals/1.1-intro-to-data-science     460
stats/compare-means                          338
classification/logistic-regression           334
classification/prep                          321
search/search_index.json                     300
1-fundamentals/DataToAction_v2.jpg           284
classification/explore                       282
classification/evaluation                    280
1-fundamentals/1.2-data-science-pipeline     271
Name: page_viewed, dtype: int64

In [24]:
curie = data_science[data_science.name == 'Curie']

In [25]:
curie.page_viewed.value_counts().head(15)

/                                                    1523
6-regression/1-overview                               595
search/search_index.json                              480
1-fundamentals/modern-data-scientist.jpg              467
1-fundamentals/AI-ML-DL-timeline.jpg                  465
1-fundamentals/1.1-intro-to-data-science              461
3-sql/1-mysql-overview                                441
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     345
10-anomaly-detection/1-overview                       345
4-python/8.4.3-dataframes                             260
4-python/8.4.4-advanced-dataframes                    246
4-python/3-data-types-and-variables                   234
4-python/5-functions                                  203
5-stats/4.2-compare-means                             197
5-stats/2-simulation                                  193
Name: page_viewed, dtype: int64

**For the data science cohort, fundamentals/modern-data-scientist was consistently the most viewed webpage.**

In [None]:
# Going to take a sample of a few web dev cohorts in order to see a pattern of what the most viewed page is.

In [40]:
web_dev.name.value_counts().head()

Staff        60315
Ceres        40168
Zion         37548
Fortuna      36047
Voyageurs    35624
Name: name, dtype: int64

In [30]:
ceres = web_dev[web_dev.name == 'Ceres']

In [31]:
ceres.page_viewed.value_counts().head(10)

/                           1620
search/search_index.json    1376
javascript-i                 977
toc                          909
html-css                     753
java-iii                     674
java-ii                      667
jquery                       632
mysql                        617
spring                       546
Name: page_viewed, dtype: int64

In [32]:
zion = web_dev[web_dev.name == 'Zion']

In [33]:
zion.page_viewed.value_counts().head(10)

/                           1756
toc                         1457
javascript-i                 868
java-iii                     742
search/search_index.json     689
spring                       650
html-css                     649
javascript-ii                637
java-ii                      613
mysql                        598
Name: page_viewed, dtype: int64

In [34]:
fortuna = web_dev[web_dev.name == 'Fortuna']

In [35]:
fortuna.page_viewed.value_counts().head(10)

/                           1962
toc                         1273
search/search_index.json     989
java-iii                     767
javascript-i                 756
java-ii                      637
spring                       616
html-css                     578
mysql                        571
java-i                       538
Name: page_viewed, dtype: int64

In [36]:
voyageurs = web_dev[web_dev.name == 'Voyageurs']

In [37]:
voyageurs.page_viewed.value_counts().head(10)

/                2098
javascript-i      884
java-iii          770
java-ii           754
mysql             663
spring            650
java-i            639
javascript-ii     584
jquery            583
html-css          528
Name: page_viewed, dtype: int64

**Javascript-I and Java-III are the most accessed pages for web dev.**

-------------------------------------------------------------


 # 2. Is there a cohort that referred to a lesson significantly more that other cohorts seemed to gloss over?

In [42]:
bayes.page_viewed.value_counts().head(20)

/                                                    1842
1-fundamentals/modern-data-scientist.jpg              626
1-fundamentals/AI-ML-DL-timeline.jpg                  624
1-fundamentals/1.1-intro-to-data-science              615
search/search_index.json                              551
6-regression/1-overview                               521
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     386
10-anomaly-detection/1-overview                       383
6-regression/5.0-evaluate                             333
5-stats/3-probability-distributions                   320
5-stats/4.2-compare-means                             315
appendix/cli-git-overview                             311
6-regression/7.0-model                                310
6-regression/4.0-explore                              267
6-regression/3.0-split-and-scale                      260
7-classification/3-prep                               256
4-python/7.4.3-dataframes                             251
7-classificati

In [43]:
curie.page_viewed.value_counts().head(20)

/                                                    1523
6-regression/1-overview                               595
search/search_index.json                              480
1-fundamentals/modern-data-scientist.jpg              467
1-fundamentals/AI-ML-DL-timeline.jpg                  465
1-fundamentals/1.1-intro-to-data-science              461
3-sql/1-mysql-overview                                441
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     345
10-anomaly-detection/1-overview                       345
4-python/8.4.3-dataframes                             260
4-python/8.4.4-advanced-dataframes                    246
4-python/3-data-types-and-variables                   234
4-python/5-functions                                  203
5-stats/4.2-compare-means                             197
5-stats/2-simulation                                  193
appendix/cli-git-overview                             190
3-sql/7-functions                                     185
7-classificati

In [44]:
darden.page_viewed.value_counts().head(20)

/                                           2041
classification/overview                      759
classification/scale_features_or_not.svg     590
sql/mysql-overview                           513
1-fundamentals/AI-ML-DL-timeline.jpg         470
1-fundamentals/modern-data-scientist.jpg     470
1-fundamentals/1.1-intro-to-data-science     460
stats/compare-means                          338
classification/logistic-regression           334
classification/prep                          321
search/search_index.json                     300
1-fundamentals/DataToAction_v2.jpg           284
classification/explore                       282
classification/evaluation                    280
1-fundamentals/1.2-data-science-pipeline     271
classification/project                       252
classification/acquire                       252
stats/probability-distributions              246
python/data-types-and-variables              235
stats/correlation                            234
Name: page_viewed, d

**For data science, Curie accessed the Python modules much more frequently than the other cohorts.**

In [46]:
web_dev = web_dev[web_dev.page_viewed != '/']

In [62]:
pd.set_option("max_rows",2000)

In [66]:
web_2020 = web_dev[(web_dev.index.year < 2020) & (web_dev.index.year > 2018)]


In [68]:
web_2020 = web_dev[web_dev.name != 'Staff']

In [69]:
pd.crosstab(web_2020.page_viewed, web_2020.name)

name,Andromeda,Apex,Apollo,Arches,Badlands,Bash,Betelgeuse,Ceres,Deimos,Denali,...,Pinnacles,Quincy,Sequoia,Teddy,Ulysses,Voyageurs,Wrangell,Xanadu,Yosemite,Zion
page_viewed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.git,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
.gitignore,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
.well-known/assetlinks.json,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00_index,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
01_intro,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
02_listing_files,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
03_file_paths,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
04_navigating_the_filesystem,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
05_creating_files_and_directories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
0_Classification_Algorithms,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Zion accessed javascript-II (612 compared to 449 for next closest), (721 to 593) for java_III.**

# 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [71]:
darden.user_id.value_counts() < 15

685    False
698    False
689    False
699    False
681    False
692    False
688    False
691    False
682    False
678    False
696    False
684    False
680    False
268    False
687    False
686    False
695    False
690    False
694    False
739    False
693    False
683    False
781    False
783    False
780    False
785    False
697     True
679     True
Name: user_id, dtype: bool

In [72]:
user1 = darden[darden.user_id == 697]

In [73]:
user1

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-13 15:20:27,/,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:20:48,3-sql/1-mysql-overview,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:20:50,1-fundamentals/1.1-intro-to-data-science,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:20:50,1-fundamentals/modern-data-scientist.jpg,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:20:50,1-fundamentals/AI-ML-DL-timeline.jpg,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:20:59,1-fundamentals/1.2-data-science-pipeline,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:20:59,1-fundamentals/DataToAction_v2.jpg,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:21:01,1-fundamentals/1.1-intro-to-data-science,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:21:02,1-fundamentals/AI-ML-DL-timeline.jpg,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12
2020-07-13 15:21:02,1-fundamentals/modern-data-scientist.jpg,697,59,136.50.70.27,Darden,2020-07-13,2021-01-12


In [75]:
user2 = darden[darden.user_id == 679]

In [76]:
user2

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-13 14:37:22,/,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 14:39:21,13-advanced-topics/1-tidy-data,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 14:39:36,1-fundamentals/1.1-intro-to-data-science,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 14:39:37,1-fundamentals/AI-ML-DL-timeline.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 14:39:37,1-fundamentals/modern-data-scientist.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 15:49:31,1-fundamentals/1.1-intro-to-data-science,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 15:49:32,1-fundamentals/modern-data-scientist.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-13 15:49:32,1-fundamentals/AI-ML-DL-timeline.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-14 08:05:15,1-fundamentals/1.1-intro-to-data-science,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12
2020-07-14 08:05:15,1-fundamentals/AI-ML-DL-timeline.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12


**UserId 697 and 679 only participated in class for a couple days**

In [77]:
darden.user_id.value_counts() < 100

685    False
698    False
689    False
699    False
681    False
692    False
688    False
691    False
682    False
678    False
696    False
684    False
680    False
268    False
687    False
686    False
695    False
690    False
694    False
739    False
693    False
683    False
781     True
783     True
780     True
785     True
697     True
679     True
Name: user_id, dtype: bool

In [78]:
user3 = darden[darden.user_id == 785]

In [79]:
user3.shape

(31, 7)

In [80]:
user3

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-26 14:35:01,timeseries/acquire,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:13:52,working-with-time-series-data,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:13:55,working-with-time-series-data,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:13:57,working-with-time-series-data,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:14:02,working-with-time-series-data,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:14:09,/,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:14:14,sql/mysql-overview,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:14:14,classification/overview,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:14:14,classification/scale_features_or_not.svg,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12
2020-10-27 09:14:20,timeseries/overview,785,59,72.181.127.233,Darden,2020-07-13,2021-01-12


In [81]:
darden.user_id.value_counts()

685    2542
698    1695
689    1544
699    1420
681    1270
692    1261
688    1248
691    1222
682    1184
678     937
696     858
684     853
680     847
268     781
687     732
686     725
695     702
690     663
694     567
739     522
693     429
683     168
781      94
783      54
780      52
785      31
697      13
679      11
Name: user_id, dtype: int64

In [None]:
#28 student ids. Possibly a few students were assigned a second id.

In [83]:
user4 = darden[darden.user_id == 780]

In [84]:
user4

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-22 14:20:52,/,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 14:21:19,timeseries/overview,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 15:35:22,timeseries/acquire,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:40:09,regression/acquire-and-prep,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:41:01,regression/split-and-scale,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:42:11,regression/explore,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:43:55,regression/evaluate,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:44:54,regression/feature-engineering,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:47:04,regression/model,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12
2020-10-22 18:47:40,regression/project,780,59,99.132.128.255,Darden,2020-07-13,2021-01-12


In [85]:
curie.user_id.value_counts() < 20

581    False
576    False
590    False
584    False
580    False
582    False
579    False
585    False
586    False
589    False
617    False
591    False
578    False
588    False
616    False
575    False
587    False
583    False
577    False
746     True
787     True
Name: user_id, dtype: bool

In [86]:
user5 = curie[curie.user_id == 746]

In [87]:
user5

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-09-10 10:50:28,/,746,55,173.175.100.201,Curie,2020-02-03,2020-07-07


In [88]:
user6 = curie[curie.user_id == 787]

In [89]:
user6

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-29 22:08:03,appendix/interview_questions_students,787,55,99.126.113.140,Curie,2020-02-03,2020-07-07


In [91]:
curie.user_id.value_counts() < 200

581    False
576    False
590    False
584    False
580    False
582    False
579    False
585    False
586    False
589    False
617    False
591    False
578    False
588    False
616    False
575    False
587    False
583    False
577    False
746     True
787     True
Name: user_id, dtype: bool

In [92]:
bayes.user_id.value_counts() < 100

485    False
475    False
476    False
479    False
478    False
482    False
471    False
469    False
466    False
473    False
481    False
358    False
484    False
480    False
483    False
472    False
468    False
474    False
467    False
470    False
477    False
487     True
650     True
Name: user_id, dtype: bool

In [93]:
user7 = bayes[bayes.user_id == 487]

In [94]:
user7

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-08-30 12:01:29,/,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 12:02:11,appendix/cli/1-intro,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 12:11:10,appendix/cli/2-listing-files,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 13:37:40,appendix/cli/0-overview,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 13:37:58,appendix/cli-git-overview,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 14:12:22,appendix/cli/3-file-paths,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 14:19:45,appendix/cli/4-navigating-the-filesystem,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 14:32:22,appendix/cli/5-creating-files-and-directories,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 14:38:57,appendix/cli/4-navigating-the-filesystem,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-30 14:59:46,appendix/cli/5-creating-files-and-directories,487,34,97.105.19.58,Bayes,2019-08-19,2020-01-30


In [95]:
user8 = bayes[bayes.user_id == 650]

In [96]:
user8

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-07 10:35:25,/,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-05-07 10:35:52,appendix/professional-development/virtual-meet...,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-06-25 10:18:26,/,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-06-25 10:19:01,appendix/cli-git-overview,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-06-25 10:20:00,appendix/ds-environment-setup,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-07-03 15:13:38,/,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-07-03 15:15:21,11-nlp/3-acquire,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-07-06 11:41:32,appendix/ds-environment-setup,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-08-03 11:18:31,/,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30
2020-08-09 13:17:01,/,650,34,70.123.225.30,Bayes,2019-08-19,2020-01-30


**Within data science, most of the users that rarely accessed the cirriculum appeared to not have kept up with the program. There are a few users in Darden besides the two students that left within the first week that accessed the cirriculum less than 100 times, possibly due to having two userids because it says there are 28 students, which is higher than our enrollment.**

In [97]:
(web_dev.user_id.value_counts() < 5).tail(30)

279    False
95     False
141    False
92     False
84     False
388    False
104    False
107    False
786    False
115    False
401    False
82     False
71      True
461     True
246     True
152     True
85      True
399     True
216     True
97      True
81      True
348     True
169     True
177     True
652     True
745     True
593     True
212     True
574     True
165     True
Name: user_id, dtype: bool

In [98]:
user = web_dev[web_dev.user_id == 651]

In [99]:
user

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [100]:
user = web_dev[web_dev.user_id == 212]

In [101]:
user

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-06-19 09:21:08,students/units/75/sub_units/268,212,1,170.248.173.247,Arches,2014-02-04,2014-04-22


In [102]:
user = web_dev[web_dev.user_id == 593]

In [103]:
user

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-17 11:45:21,index.html,593,14,167.24.104.150,Lassen,2016-07-18,2016-11-10


In [104]:
user = web_dev[web_dev.user_id == 163]

In [105]:
user

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


**Web dev students that didn't access the cirriculum appear to have dropped out of the program.**

#  4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses? Any odd user-agents? 

In [106]:
# Create a dataframe that just includes unique ip addresses.

In [112]:
ips = darden.ip.unique()

In [113]:
ips = pd.DataFrame(ips)

In [114]:
ips.shape

(155, 1)

In [116]:
ips.head()

Unnamed: 0,0
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249


In [117]:
#Rename column
ips.rename(columns={0: 'ip'}, inplace=True)

In [118]:
# Hitting an api have will provide the location of each ip address.


In [119]:
import requests
d = []
for index, row in ips.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d.append(
        {
            'ip': row['ip'],
            'city': data['city'],
            'region': data['region_name']
            
        }
    )

pd.DataFrame(d)

Unnamed: 0,ip,city,region
0,76.201.20.193,Austin,Texas
1,24.28.146.155,San Antonio,Texas
2,136.50.56.155,San Antonio,Texas
3,108.239.188.205,San Antonio,Texas
4,68.54.110.249,Burnsville,Minnesota
5,173.174.165.12,San Antonio,Texas
6,70.120.16.59,San Antonio,Texas
7,99.76.233.212,San Antonio,Texas
8,72.177.148.181,San Antonio,Texas
9,99.132.128.255,San Antonio,Texas


In [127]:
d = pd.DataFrame(d)

In [128]:
d.groupby('region')[['ip']].agg('count').sort_values(by='ip',ascending=False)

Unnamed: 0_level_0,ip
region,Unnamed: 1_level_1
Texas,150
Minnesota,2
Arizona,1
Nebraska,1
North Carolina,1


**A couple of unusual ips, one in Nebraska and one in Arizona**

In [120]:
darden[darden.ip == '99.203.40.221']

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-22 09:19:16,clustering/explore,690,59,99.203.40.221,Darden,2020-07-13,2021-01-12
2020-10-22 09:19:45,regression/model,690,59,99.203.40.221,Darden,2020-07-13,2021-01-12
2020-10-22 09:38:00,classification/decision-trees,690,59,99.203.40.221,Darden,2020-07-13,2021-01-12


In [121]:
darden[darden.user_id == 690]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-13 14:43:14,/,690,59,67.11.40.170,Darden,2020-07-13,2021-01-12
2020-07-14 08:47:42,1-fundamentals/1.1-intro-to-data-science,690,59,184.226.101.48,Darden,2020-07-13,2021-01-12
2020-07-14 08:47:43,1-fundamentals/modern-data-scientist.jpg,690,59,184.226.101.48,Darden,2020-07-13,2021-01-12
2020-07-14 08:47:43,1-fundamentals/AI-ML-DL-timeline.jpg,690,59,184.226.101.48,Darden,2020-07-13,2021-01-12
2020-07-14 19:15:39,1-fundamentals/1.1-intro-to-data-science,690,59,184.203.177.106,Darden,2020-07-13,2021-01-12
2020-07-14 19:15:39,1-fundamentals/modern-data-scientist.jpg,690,59,184.203.177.106,Darden,2020-07-13,2021-01-12
2020-07-14 19:15:40,1-fundamentals/AI-ML-DL-timeline.jpg,690,59,184.203.177.106,Darden,2020-07-13,2021-01-12
2020-07-15 08:39:29,1-fundamentals/1.1-intro-to-data-science,690,59,67.11.40.170,Darden,2020-07-13,2021-01-12
2020-07-15 08:39:29,1-fundamentals/modern-data-scientist.jpg,690,59,67.11.40.170,Darden,2020-07-13,2021-01-12
2020-07-15 08:39:30,1-fundamentals/AI-ML-DL-timeline.jpg,690,59,67.11.40.170,Darden,2020-07-13,2021-01-12


In [122]:
#Appears to be a regular student who happened to use a separate ip on an occasion.

In [123]:
darden[darden.ip == '72.206.103.198']

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-09-05 21:21:07,/,682,59,72.206.103.198,Darden,2020-07-13,2021-01-12
2020-09-05 21:44:31,python/overview,682,59,72.206.103.198,Darden,2020-07-13,2021-01-12
2020-09-05 21:50:21,python/ds-libraries-overview,682,59,72.206.103.198,Darden,2020-07-13,2021-01-12
2020-09-05 21:50:35,python/intro-to-matplotlib,682,59,72.206.103.198,Darden,2020-07-13,2021-01-12


In [124]:
darden[darden.user_id == 682]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-13 14:39:43,/,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 14:40:19,1-fundamentals/1.1-intro-to-data-science,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 14:40:20,1-fundamentals/modern-data-scientist.jpg,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 14:40:20,1-fundamentals/AI-ML-DL-timeline.jpg,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 14:40:55,1-fundamentals/3-vocabulary,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 14:41:45,1-fundamentals/3-vocabulary,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 14:41:47,1-fundamentals/3-vocabulary,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 15:19:36,1-fundamentals/1.1-intro-to-data-science,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 15:19:36,1-fundamentals/AI-ML-DL-timeline.jpg,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12
2020-07-13 15:19:36,1-fundamentals/modern-data-scientist.jpg,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12


In [129]:
#This also appears to be a regular student. Going to repeat this process with all of data science

In [130]:
ds_ip = data_science.ip.unique()

In [131]:
ds_ip = pd.DataFrame(ds_ip)

In [132]:
ds_ip.rename(columns={0: 'ip'}, inplace=True)

In [133]:
import requests
d = []
for index, row in ds_ip.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d.append(
        {
            'ip': row['ip'],
            'city': data['city'],
            'region': data['region_name']
            
        }
    )

d = pd.DataFrame(d)

In [134]:
d.groupby('region')[['ip']].agg('count').sort_values(by='ip',ascending=False)

Unnamed: 0_level_0,ip
region,Unnamed: 1_level_1
Texas,543
,14
Ontario,6
California,4
Jalisco,3
Massachusetts,2
Queensland,2
Florida,2
North Carolina,2
Arizona,2


**Data science overall has several international ip addresses, which is noteworthy.**

In [135]:
d[d.region == 'Queensland']

Unnamed: 0,ip,city,region
99,45.248.77.99,Brisbane,Queensland
106,103.137.12.164,Brisbane,Queensland


In [136]:
data_science[data_science.ip == '103.137.12.164']

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-12 10:13:03,6-regression/1-overview,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:13:23,7-classification/6.2-decision-trees,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:47,10-anomaly-detection/1-overview,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:48,10-anomaly-detection/AnomalyDetectionCartoon.jpeg,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:53,11-nlp/project,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:54,11-nlp/github_repo_language.gif,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:56,11-nlp/6-model,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 11:05:10,11-nlp/4-prepare,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 11:20:50,11-nlp/6-model,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30


In [137]:
#These page views were made in 2019, while class was in person.

In [138]:
perp1 = data_science[data_science.user_id == 469]

In [139]:
perp1.ip.value_counts()

97.105.19.58       749
67.11.115.125      192
196.247.56.62       96
162.219.176.244     46
185.145.38.235      41
68.206.101.245      38
172.98.66.16        24
196.196.192.52      18
89.187.175.105      15
104.200.138.33      13
173.232.243.3       11
103.137.12.164       9
184.75.208.254       9
104.254.95.84        8
185.153.179.81       7
45.248.77.99         6
129.115.195.45       6
184.75.223.44        5
172.98.66.4          4
89.187.175.48        1
Name: ip, dtype: int64

In [140]:
# Perp1 from the Bayes cohort used multiple ip addresses, possibly using a VPN?

In [141]:
d[d.region == 'North Rhine-Westphalia']

Unnamed: 0,ip,city,region
98,185.145.38.235,Cologne,North Rhine-Westphalia


In [142]:
data_science[data_science.ip == '185.145.38.235']

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-08 11:14:46,1-fundamentals/1.1-intro-to-data-science,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:14:46,1-fundamentals/modern-data-scientist.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:14:47,1-fundamentals/AI-ML-DL-timeline.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:57:39,1-fundamentals/1.2-data-science-pipeline,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:57:39,1-fundamentals/DataToAction_v2.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:51,1-fundamentals/1.1-intro-to-data-science,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:52,1-fundamentals/modern-data-scientist.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:52,1-fundamentals/AI-ML-DL-timeline.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:08:51,1-fundamentals/3-vocabulary,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:14:52,10-anomaly-detection/1-overview,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30


In [143]:
#Same student id and the perp who used an ip from Queensland.

In [144]:
#Merging the geo-location onto the main data science dataframe
result = pd.merge(data_science, d, on='ip', how='left')

In [145]:
result.head()

Unnamed: 0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date,city,region
0,/,466,34,97.105.19.58,Bayes,2019-08-19,2020-01-30,Fredericksburg,Texas
1,/,467,34,97.105.19.58,Bayes,2019-08-19,2020-01-30,Fredericksburg,Texas
2,/,468,34,97.105.19.58,Bayes,2019-08-19,2020-01-30,Fredericksburg,Texas
3,/,469,34,97.105.19.58,Bayes,2019-08-19,2020-01-30,Fredericksburg,Texas
4,/,470,34,97.105.19.58,Bayes,2019-08-19,2020-01-30,Fredericksburg,Texas



# 5. At some point in the last year, ability for students and alumni to cross-access curriculum (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before? 

In [147]:
# Looking at each individual ds cohort and searching for any lessons not in the ds curriculum.
curie.page_viewed.unique()

array(['login', '/', '4-python/1-overview',
       '1-fundamentals/1.1-intro-to-data-science',
       '1-fundamentals/modern-data-scientist.jpg',
       '1-fundamentals/AI-ML-DL-timeline.jpg',
       '4-python/2-introduction-to-python', '3-sql/1-mysql-overview',
       'appendix/cli-git-overview',
       '1-fundamentals/1.2-data-science-pipeline',
       '1-fundamentals/DataToAction_v2.jpg',
       '1-fundamentals/1.3-pipeline-demo',
       '1-fundamentals/2.1-excel-overview',
       '1-fundamentals/2.2-excel-functions',
       '1-fundamentals/2.3-visualization-with-excel',
       '1-fundamentals/2.4-more-excel-features',
       '2-storytelling/1-overview', '2-storytelling/3-tableau',
       '2-storytelling/project', '1-fundamentals/3-vocabulary',
       '5-stats/4.2-compare-means', '2-storytelling/2.1-understand',
       '2-storytelling/chart-keywords', '2-storytelling/bad-charts',
       '2-storytelling/misleading1_fox.jpg',
       '2-storytelling/misleading1_baseball.jpg',
       '2

In [148]:
darden.page_viewed.unique()

array(['/', '13-advanced-topics/1-tidy-data',
       '1-fundamentals/1.1-intro-to-data-science',
       '1-fundamentals/AI-ML-DL-timeline.jpg',
       '1-fundamentals/modern-data-scientist.jpg',
       '1-fundamentals/3-vocabulary', '3-sql/1-mysql-overview',
       '6-regression/1-overview', '10-anomaly-detection/1-overview',
       '10-anomaly-detection/AnomalyDetectionCartoon.jpeg',
       '3-sql/database-design', '1-fundamentals/1.3-pipeline-demo',
       '1-fundamentals/1.2-data-science-pipeline',
       '1-fundamentals/DataToAction_v2.jpg', '2-storytelling/3-tableau',
       '2-storytelling/3.3-creating-charts',
       '4-python/8.4.1-pandas-overview', '4-python/4-control-structures',
       '1-fundamentals/2.1-spreadsheets-overview', '4-python/5-functions',
       '4-python/6-imports', '4-python/7-working-with-files',
       '4-python/8.1-ds-libraries-overview', 'modern-data-scientist.jpg',
       'AI-ML-DL-timeline.jpg', '1-fundamentals',
       '1-fundamentals/2.2-functions',
 

In [149]:
bayes.page_viewed.unique()

array(['/', '3-sql/1-mysql-overview', '2-storytelling/bad-charts',
       '2-storytelling/misleading1_baseball.jpg',
       '2-storytelling/misleading1_fox.jpg',
       '2-storytelling/misleading3_deaths.jpg',
       'appendix/cli-git-overview',
       '1-fundamentals/1.1-intro-to-data-science',
       '1-fundamentals/modern-data-scientist.jpg',
       '1-fundamentals/AI-ML-DL-timeline.jpg',
       '1-fundamentals/1.2-data-science-pipeline',
       '1-fundamentals/DataToAction_v2.jpg', 'search/search_index.json',
       '13-advanced-topics/3.7-styling-webpages',
       '1-fundamentals/1.3-pipeline-demo',
       '1-fundamentals/2.1-excel-overview', '3-vocabulary.md',
       '6-regression/1-overview', '10-anomaly-detection/1-overview',
       '10-anomaly-detection/AnomalyDetectionCartoon.jpeg',
       '11-nlp/1-overview', '6-regression/2-regression-excel',
       '6-regression/3.1-acquire-and-prep',
       '6-regression/3.2-split-and-scale', '6-regression/3.3-explore',
       '6-regressi

**Bayes accessed some of the Java and Javascript curriculum**

In [154]:
#Creating a filtered df for 2020 web dev page views not including staff.
web_dev_20 = web_dev[(web_dev.index.year > 2019) & (web_dev['name'] != 'Staff')]

In [151]:
web_dev_20.name.unique()

array(['Hampton', 'Teddy', 'Sequoia', 'Arches', 'Niagara', 'Pinnacles',
       'Quincy', 'Kings', 'Lassen', 'Joshua', 'Olympic', 'Ulysses', 'Ike',
       'Voyageurs', 'Wrangell', 'Xanadu', 'Yosemite', 'Zion', 'Andromeda',
       'Betelgeuse', 'Ceres', 'Deimos', 'Europa', 'Fortuna', 'Apex',
       'Ganymede', 'Hyperion', 'Bash', 'Jupiter'], dtype=object)

In [153]:
web_dev_20[web_dev_20.page_viewed.str.contains('python')]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 21:23:47,4-python/7.2-intro-to-matplotlib,18,22,45.20.117.182,Teddy,2018-01-08,2018-05-17
2020-01-01 21:24:04,4-python/7.3-intro-to-numpy,18,22,45.20.117.182,Teddy,2018-01-08,2018-05-17
2020-01-13 15:03:01,4-python/1-overview,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-01-13 15:03:29,4-python/2-introduction-to-python,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-09-13 18:51:46,python/control-structures,627,57,72.179.164.139,Ganymede,2020-03-23,2020-08-20
2020-11-01 02:16:30,python/overview,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:17:00,python/introduction-to-python,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:17:30,python/data-types-and-variables,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:18:00,python/control-structures,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:18:30,python/functions,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21


In [155]:
web_dev_20[web_dev_20.page_viewed.str.contains('stats')]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-01 02:31:00,stats/overview,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:31:30,stats/simulation,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:32:00,stats/probability-distributions,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:32:30,stats/hypothesis-testing-overview,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:33:00,stats/compare-means,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:33:30,stats/correlation,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:34:00,stats/compare-group-membership,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:34:30,stats/more-statistical-testing-examples,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21


In [156]:
web_dev_20[web_dev_20.page_viewed.str.contains('data-science')]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-13 15:00:39,1-fundamentals/1.1-intro-to-data-science,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-01-13 15:02:29,1-fundamentals/1.1-intro-to-data-science,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-01-13 15:18:55,1-fundamentals/1.1-intro-to-data-science,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-01-13 15:18:58,1-fundamentals/1.1-intro-to-data-science,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-01-13 15:19:08,1-fundamentals/1.2-data-science-pipeline,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-11-01 02:00:30,fundamentals/intro-to-data-science,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:01:00,fundamentals/data-science-pipeline,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21


**Student id #730 is primarily the one accessing the data science curriculum from web dev**

#  6. What topics are grads continuing to reference after graduation and into their jobs (for each program)? 

In [157]:
bayes.head(10)

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-08-20 09:39:58,/,466,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:39:59,/,467,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:39:59,/,468,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:02,/,469,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:08,/,470,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:15,/,471,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:15,/,472,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:17,/,473,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:18,/,474,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:19,/,475,34,97.105.19.58,Bayes,2019-08-19,2020-01-30


In [160]:
#Create a dataframe of Bayes cohort page views after graduation
bayes_after = bayes.sort_index(ascending=False).head(3000)

In [161]:
bayes_after.index.min()

Timestamp('2020-02-07 17:41:37')

In [162]:
bayes_after.page_viewed.value_counts().head(20)

/                                                    334
search/search_index.json                             139
1-fundamentals/modern-data-scientist.jpg              78
1-fundamentals/1.1-intro-to-data-science              78
1-fundamentals/AI-ML-DL-timeline.jpg                  78
6-regression/1-overview                               63
10-anomaly-detection/1-overview                       36
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     36
3-sql/1-mysql-overview                                34
7-classification/3-prep                               32
12-distributed-ml/3-spark-api                         31
6-regression/7.0-model                                30
7-classification/6.1-logistic-regression              30
7-classification/4-explore                            29
6-regression/2.0-acquire-and-prep                     28
7-classification/6.2-decision-trees                   28
6-regression/5.0-evaluate                             28
5-stats/4.2-compare-means      

In [163]:
curie_after = curie.sort_index(ascending=False).head(2000)

In [164]:
curie_after.index.min()

Timestamp('2020-07-22 15:25:15')

In [165]:
curie_after.page_viewed.value_counts().head(20)

/                                                 297
sql/mysql-overview                                 99
search/search_index.json                           92
classification/overview                            91
fundamentals/AI-ML-DL-timeline.jpg                 59
fundamentals/modern-data-scientist.jpg             58
fundamentals/intro-to-data-science                 56
classification/scale_features_or_not.svg           46
sql/database-design                                40
anomaly-detection/AnomalyDetectionCartoon.jpeg     36
anomaly-detection/overview                         34
timeseries/prep                                    24
timeseries/acquire                                 23
4-python/5-functions                               21
classification/prep                                21
sql/temporary-tables                               20
sql/databases                                      19
sql/functions                                      19
python/dataframes           

**For data science it looks like Fundamentals, Anomaly Detection, and Classification are some of the most common pages accessed.**

In [166]:
#Create a df to compare
web20 = web_dev[web_dev.end_date.str.contains('2020')]

In [167]:
web20.name.value_counts()

Fortuna     34085
Ganymede    30829
Apex        30426
Deimos      28369
Hyperion    27109
Europa      26295
Name: name, dtype: int64

In [168]:
deimos = web20[web20.name == 'Deimos']

In [169]:
deimos_after = deimos.sort_index(ascending=False).head(4000)

In [184]:
deimos_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-02 11:55:27,javascript-i/functions,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:53:21,javascript-i/javascript-with-html,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:53:17,javascript-i/introduction/working-with-data-ty...,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:52:52,javascript-i/javascript-with-html,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:52:45,javascript-i/introduction/working-with-data-ty...,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27


In [185]:
# Make sure the df is post grad
deimos_after.index.min()

Timestamp('2020-03-17 23:04:40')

In [171]:
deimos_after.page_viewed.value_counts().head(20)

search/search_index.json           204
javascript-i                       142
spring                             135
html-css                           133
toc                                119
appendix                           114
java-ii                            106
mysql                              105
javascript-ii                       93
java-iii                            90
jquery                              89
java-i                              74
spring/setup                        60
mysql/basic-statements              55
mysql/users                         55
mysql/sample-database               45
mysql/databases                     41
mysql/introduction                  38
spring/fundamentals/controllers     38
mysql/intellij                      37
Name: page_viewed, dtype: int64

In [181]:
europa = web20[web20.name == 'Europa']

In [182]:
europa_after = europa.sort_index(ascending=False).head(1500)

In [186]:
europa_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-30 21:05:14,javascript-ii/RESTful-api,538,52,108.196.159.214,Europa,2019-11-04,2020-04-17
2020-10-30 21:05:12,javascript-ii,538,52,108.196.159.214,Europa,2019-11-04,2020-04-17
2020-10-29 17:06:50,java-iii/jsp-and-jstl,525,52,72.179.177.9,Europa,2019-11-04,2020-04-17
2020-10-29 17:04:59,java-iii,525,52,72.179.177.9,Europa,2019-11-04,2020-04-17
2020-10-28 13:32:36,mysql/basic-statements,525,52,72.179.177.9,Europa,2019-11-04,2020-04-17


In [180]:
europa_after.index.min()

Timestamp('2020-04-22 11:21:28')

In [183]:
europa_after.page_viewed.value_counts().head(20)

appendix                                                       76
toc                                                            63
appendix/professional-development/mock-behavioral-questions    45
html-css/css-i/selectors-and-properties                        43
javascript-i                                                   40
spring                                                         35
spring/setup                                                   34
javascript-ii                                                  33
java-i                                                         33
appendix/professional-development/t-block-resume               30
spring/fundamentals/views                                      30
html-css                                                       28
search/search_index.json                                       27
spring/fundamentals/controllers                                27
spring/fundamentals/repositories                               22
java-ii   

In [187]:
apex = web20[web20.name == 'Apex']

In [188]:
apex_after = apex.sort_index(ascending=False).head(1500)

In [189]:
apex_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-02 12:12:08,java-i/introduction-to-java,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-11-02 12:12:05,java-i,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-11-01 19:26:05,capstone/sw-project-planning,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-10-31 22:16:29,java-ii/file-io,612,56,172.14.176.83,Apex,2020-02-24,2020-07-29
2020-10-31 22:16:27,java-ii,612,56,172.14.176.83,Apex,2020-02-24,2020-07-29


In [190]:
apex_after.page_viewed.value_counts().head(20)

search/search_index.json                   98
spring                                     80
java-ii                                    60
java-i                                     49
html-css                                   42
java-iii                                   42
spring/fundamentals/controllers            41
mysql                                      33
spring/fundamentals/repositories           32
javascript-i                               30
java-ii/object-oriented-programming        30
spring/fundamentals/views                  28
spring/setup                               26
toc                                        24
java-ii/arrays                             23
appendix                                   23
java-ii/interfaces-and-abstract-classes    23
java-i/syntax-types-and-variables          22
spring/fundamentals/form-model-binding     21
java-ii/inheritance-and-polymorphism       20
Name: page_viewed, dtype: int64

In [191]:
hyperion = web20[web20.name == 'Hyperion']

In [192]:
hyperion_after = hyperion.sort_index(ascending=False).head(1500)

In [193]:
apex_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-02 12:12:08,java-i/introduction-to-java,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-11-02 12:12:05,java-i,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-11-01 19:26:05,capstone/sw-project-planning,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-10-31 22:16:29,java-ii/file-io,612,56,172.14.176.83,Apex,2020-02-24,2020-07-29
2020-10-31 22:16:27,java-ii,612,56,172.14.176.83,Apex,2020-02-24,2020-07-29


In [194]:
apex_after.index.min()

Timestamp('2020-08-19 11:57:42')

In [195]:
hyperion_after.page_viewed.value_counts().head(20)

spring                                                   142
spring/fundamentals/security/authentication               92
spring/fundamentals/relationships                         72
capstone/workbook                                         66
spring/fundamentals/repositories                          58
spring/fundamentals/views                                 51
mysql                                                     46
java-iii                                                  46
capstone                                                  42
spring/fundamentals/services                              42
spring/fundamentals/security                              41
spring/fundamentals/controllers                           38
spring/fundamentals/form-model-binding                    35
capstone/sw-project-planning                              34
spring/setup                                              30
toc                                                       30
spring/fundamentals/inte

**From taking a sample of web dev cohorts, it looks like Spring, Java, and Javascript were accessed frequently after graduation.**