In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses? Any odd user-agents?

In [2]:
import requests

In [3]:
df = pd.read_csv('curriculum.txt',
                engine='python',
                 header=None,
                 index_col=False,
                 sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                 na_values='"-"',)

In [4]:
df.columns = ['date', 'time', 'page_viewed', 'user_id', 'cohort_id', 'ip']

In [5]:
cohort = pd.read_csv('cohort_name.csv')

In [6]:
cohort.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,cohort_id,name,start_date,end_date,program_id,
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,


In [7]:
cohort.columns = cohort.iloc[0]

In [8]:
cohort = cohort.iloc[1:]

In [9]:
cohort.head()

Unnamed: 0,NaN,cohort_id,name,start_date,end_date,program_id,NaN.1
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,
5,,5,Everglades,2014-11-18,2015-02-24,1,


In [10]:
cohort = cohort[['cohort_id', 'name', 'start_date', 'end_date']]

In [11]:
cohort.cohort_id = cohort.cohort_id.astype('int')


In [12]:
result = pd.merge(df, cohort, on='cohort_id')

In [13]:
result.cohort_id = result.cohort_id.astype('int')
result['date'] = result.date + " " + result.time
result.drop(columns=('time'), inplace=True)
result.date = pd.to_datetime(result.date)
result = result.set_index('date')

In [14]:
data_science = result[(result.name == 'Curie') | (result.name == 'Bayes') | (result.name == 'Ada') | (result.name == 'Darden')]

In [15]:
web_dev = result[(result.name != 'Curie') & (result.name != 'Bayes') & (result.name != 'Ada') & (result.name != 'Darden')]

In [16]:
bayes = data_science[data_science.name == 'Bayes']

In [17]:
darden = data_science[data_science.name == 'Darden']

In [18]:
curie = data_science[data_science.name == 'Curie']

In [19]:
# for index, row in darden.iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[5:6])
#     print(res[5:6])

In [20]:
ips = darden.ip.unique()

In [21]:
ips.shape

(155,)

In [22]:
ips = pd.DataFrame(ips)

In [23]:
ips

Unnamed: 0,0
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249
...,...
150,172.58.110.195
151,172.58.67.160
152,107.13.184.11
153,72.128.139.130


In [24]:
ips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       155 non-null    object
dtypes: object(1)
memory usage: 1.3+ KB


In [25]:
ips.rename(columns={0: 'ip'}, inplace=True)

In [26]:
ips

Unnamed: 0,ip
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249
...,...
150,172.58.110.195
151,172.58.67.160
152,107.13.184.11
153,72.128.139.130


In [27]:

# for index, row in ips.iloc[:45].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

function for a single value, with response data. (.apply)

In [28]:
darden.shape

(22425, 7)

In [29]:
# for index, row in ips.iloc[46:90].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [30]:
# for index, row in ips.iloc[91:130].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [31]:
# for index, row in ips.iloc[131:180].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [32]:
# for index, row in ips.iloc[181:220].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [33]:
darden.shape

(22425, 7)

In [34]:
d = []

In [35]:
import requests

for index, row in ips.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d.append(
        {
            'ip': row['ip'],
            'city': data['city'],
            'region': data['region_name']
            
        }
    )

pd.DataFrame(d)

Unnamed: 0,ip,city,region
0,76.201.20.193,Austin,Texas
1,24.28.146.155,San Antonio,Texas
2,136.50.56.155,San Antonio,Texas
3,108.239.188.205,San Antonio,Texas
4,68.54.110.249,Burnsville,Minnesota
...,...,...,...
150,172.58.110.195,Dallas,Texas
151,172.58.67.160,Irving,Texas
152,107.13.184.11,Raleigh,North Carolina
153,72.128.139.130,San Antonio,Texas


In [36]:
data.keys()

dict_keys(['ip', 'country_code', 'country_name', 'region_code', 'region_name', 'city', 'zip_code', 'time_zone', 'latitude', 'longitude', 'metro_code'])

In [37]:
d = pd.DataFrame(d)

In [38]:
d

Unnamed: 0,ip,city,region
0,76.201.20.193,Austin,Texas
1,24.28.146.155,San Antonio,Texas
2,136.50.56.155,San Antonio,Texas
3,108.239.188.205,San Antonio,Texas
4,68.54.110.249,Burnsville,Minnesota
...,...,...,...
150,172.58.110.195,Dallas,Texas
151,172.58.67.160,Irving,Texas
152,107.13.184.11,Raleigh,North Carolina
153,72.128.139.130,San Antonio,Texas


In [39]:
d.groupby('region')[['ip']].agg('count').sort_values(by='ip',ascending=False)

Unnamed: 0_level_0,ip
region,Unnamed: 1_level_1
Texas,150
Minnesota,2
Arizona,1
Nebraska,1
North Carolina,1


In [40]:
result = pd.merge(darden, d, on='ip', how='left')

In [41]:
result.head(20)

Unnamed: 0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date,city,region
0,/,678,59,76.201.20.193,Darden,2020-07-13,2021-01-12,Austin,Texas
1,/,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio,Texas
2,/,680,59,136.50.56.155,Darden,2020-07-13,2021-01-12,San Antonio,Texas
3,13-advanced-topics/1-tidy-data,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio,Texas
4,1-fundamentals/1.1-intro-to-data-science,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio,Texas
5,1-fundamentals/AI-ML-DL-timeline.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio,Texas
6,1-fundamentals/modern-data-scientist.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio,Texas
7,/,681,59,108.239.188.205,Darden,2020-07-13,2021-01-12,San Antonio,Texas
8,/,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12,Burnsville,Minnesota
9,/,683,59,173.174.165.12,Darden,2020-07-13,2021-01-12,San Antonio,Texas


In [42]:
ds_ip = data_science.ip.unique()

In [43]:
ds_ip.shape

(594,)

In [44]:
ds_ip = pd.DataFrame(ds_ip)

In [45]:
ds_ip.head()

Unnamed: 0,0
0,97.105.19.58
1,67.11.115.125
2,172.8.173.224
3,173.175.104.33
4,12.197.196.242


In [46]:
ds_ip.rename(columns={0: 'ip'}, inplace=True)

In [47]:
d = []

In [48]:
import requests

for index, row in ds_ip.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d.append(
        {
            'ip': row['ip'],
            'city': data['city'],
            'region': data['region_name']
            
        }
    )

pd.DataFrame(d)

Unnamed: 0,ip,city,region
0,97.105.19.58,Fredericksburg,Texas
1,67.11.115.125,San Antonio,Texas
2,172.8.173.224,San Antonio,Texas
3,173.175.104.33,Pharr,Texas
4,12.197.196.242,San Antonio,Texas
...,...,...,...
589,172.58.110.195,Dallas,Texas
590,172.58.67.160,Irving,Texas
591,107.13.184.11,Raleigh,North Carolina
592,72.128.139.130,San Antonio,Texas


In [49]:
d = pd.DataFrame(d)

In [50]:
d.head()

Unnamed: 0,ip,city,region
0,97.105.19.58,Fredericksburg,Texas
1,67.11.115.125,San Antonio,Texas
2,172.8.173.224,San Antonio,Texas
3,173.175.104.33,Pharr,Texas
4,12.197.196.242,San Antonio,Texas


In [51]:
d.groupby('region')[['ip']].agg('count').sort_values(by='ip',ascending=False)

Unnamed: 0_level_0,ip
region,Unnamed: 1_level_1
Texas,543
,14
Ontario,6
California,4
Jalisco,3
Massachusetts,2
Queensland,2
Florida,2
North Carolina,2
Arizona,2


In [52]:
d[d.region == 'Queensland']

Unnamed: 0,ip,city,region
99,45.248.77.99,Brisbane,Queensland
106,103.137.12.164,Brisbane,Queensland


In [53]:
data_science.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 68017 entries, 2019-08-20 09:39:58 to 2020-11-02 15:52:23
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page_viewed  68016 non-null  object
 1   user_id      68017 non-null  int64 
 2   cohort_id    68017 non-null  int64 
 3   ip           68017 non-null  object
 4   name         68017 non-null  object
 5   start_date   68017 non-null  object
 6   end_date     68017 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.2+ MB


In [54]:
data_science[data_science.ip == '103.137.12.164']

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-12 10:13:03,6-regression/1-overview,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:13:23,7-classification/6.2-decision-trees,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:47,10-anomaly-detection/1-overview,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:48,10-anomaly-detection/AnomalyDetectionCartoon.jpeg,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:53,11-nlp/project,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:54,11-nlp/github_repo_language.gif,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:56,11-nlp/6-model,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 11:05:10,11-nlp/4-prepare,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 11:20:50,11-nlp/6-model,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30


In [55]:
perp1 = data_science[data_science.user_id == 469]

In [56]:
perp1.ip.value_counts()

97.105.19.58       749
67.11.115.125      192
196.247.56.62       96
162.219.176.244     46
185.145.38.235      41
68.206.101.245      38
172.98.66.16        24
196.196.192.52      18
89.187.175.105      15
104.200.138.33      13
173.232.243.3       11
184.75.208.254       9
103.137.12.164       9
104.254.95.84        8
185.153.179.81       7
129.115.195.45       6
45.248.77.99         6
184.75.223.44        5
172.98.66.4          4
89.187.175.48        1
Name: ip, dtype: int64

In [57]:
d[d.region == 'North Rhine-Westphalia']

Unnamed: 0,ip,city,region
98,185.145.38.235,Cologne,North Rhine-Westphalia


In [58]:
data_science[data_science.ip == '185.145.38.235']

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-08 11:14:46,1-fundamentals/1.1-intro-to-data-science,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:14:46,1-fundamentals/modern-data-scientist.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:14:47,1-fundamentals/AI-ML-DL-timeline.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:57:39,1-fundamentals/1.2-data-science-pipeline,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:57:39,1-fundamentals/DataToAction_v2.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:51,1-fundamentals/1.1-intro-to-data-science,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:52,1-fundamentals/modern-data-scientist.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:52,1-fundamentals/AI-ML-DL-timeline.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:08:51,1-fundamentals/3-vocabulary,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:14:52,10-anomaly-detection/1-overview,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30


In [59]:
d2 = []

In [60]:
import requests

for index, row in ds_ip.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d2.append(
        {
            'ip': row['ip'],
            'city': data['city'],
            'region': data['region_name'],
            'country': data['country_name']
            
        }
    )

pd.DataFrame(d2)

Unnamed: 0,ip,city,region,country
0,97.105.19.58,Fredericksburg,Texas,United States
1,67.11.115.125,San Antonio,Texas,United States
2,172.8.173.224,San Antonio,Texas,United States
3,173.175.104.33,Pharr,Texas,United States
4,12.197.196.242,San Antonio,Texas,United States
...,...,...,...,...
589,172.58.110.195,Dallas,Texas,United States
590,172.58.67.160,Irving,Texas,United States
591,107.13.184.11,Raleigh,North Carolina,United States
592,72.128.139.130,San Antonio,Texas,United States


In [61]:
d2 = pd.DataFrame(d2)

In [62]:
d2.head()

Unnamed: 0,ip,city,region,country
0,97.105.19.58,Fredericksburg,Texas,United States
1,67.11.115.125,San Antonio,Texas,United States
2,172.8.173.224,San Antonio,Texas,United States
3,173.175.104.33,Pharr,Texas,United States
4,12.197.196.242,San Antonio,Texas,United States


In [63]:
d2.country.value_counts()

United States    576
Canada             9
Mexico             4
Australia          2
China              1
Germany            1
Ireland            1
Name: country, dtype: int64

# At some point in the last year, ability for students and alumni to cross-access curriculum (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

In [64]:
curie.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-03 15:39:34,login,575,55,97.105.19.58,Curie,2020-02-03,2020-07-07
2020-02-03 15:39:35,/,576,55,97.105.19.58,Curie,2020-02-03,2020-07-07
2020-02-03 15:39:37,/,577,55,97.105.19.58,Curie,2020-02-03,2020-07-07
2020-02-03 15:39:37,login,575,55,97.105.19.58,Curie,2020-02-03,2020-07-07
2020-02-03 15:39:43,/,578,55,97.105.19.58,Curie,2020-02-03,2020-07-07


In [65]:
curie.page_viewed.unique()

array(['login', '/', '4-python/1-overview',
       '1-fundamentals/1.1-intro-to-data-science',
       '1-fundamentals/modern-data-scientist.jpg',
       '1-fundamentals/AI-ML-DL-timeline.jpg',
       '4-python/2-introduction-to-python', '3-sql/1-mysql-overview',
       'appendix/cli-git-overview',
       '1-fundamentals/1.2-data-science-pipeline',
       '1-fundamentals/DataToAction_v2.jpg',
       '1-fundamentals/1.3-pipeline-demo',
       '1-fundamentals/2.1-excel-overview',
       '1-fundamentals/2.2-excel-functions',
       '1-fundamentals/2.3-visualization-with-excel',
       '1-fundamentals/2.4-more-excel-features',
       '2-storytelling/1-overview', '2-storytelling/3-tableau',
       '2-storytelling/project', '1-fundamentals/3-vocabulary',
       '5-stats/4.2-compare-means', '2-storytelling/2.1-understand',
       '2-storytelling/chart-keywords', '2-storytelling/bad-charts',
       '2-storytelling/misleading1_fox.jpg',
       '2-storytelling/misleading1_baseball.jpg',
       '2

In [66]:
darden.page_viewed.unique()

array(['/', '13-advanced-topics/1-tidy-data',
       '1-fundamentals/1.1-intro-to-data-science',
       '1-fundamentals/AI-ML-DL-timeline.jpg',
       '1-fundamentals/modern-data-scientist.jpg',
       '1-fundamentals/3-vocabulary', '3-sql/1-mysql-overview',
       '6-regression/1-overview', '10-anomaly-detection/1-overview',
       '10-anomaly-detection/AnomalyDetectionCartoon.jpeg',
       '3-sql/database-design', '1-fundamentals/1.3-pipeline-demo',
       '1-fundamentals/1.2-data-science-pipeline',
       '1-fundamentals/DataToAction_v2.jpg', '2-storytelling/3-tableau',
       '2-storytelling/3.3-creating-charts',
       '4-python/8.4.1-pandas-overview', '4-python/4-control-structures',
       '1-fundamentals/2.1-spreadsheets-overview', '4-python/5-functions',
       '4-python/6-imports', '4-python/7-working-with-files',
       '4-python/8.1-ds-libraries-overview', 'modern-data-scientist.jpg',
       'AI-ML-DL-timeline.jpg', '1-fundamentals',
       '1-fundamentals/2.2-functions',
 

In [67]:
bayes.page_viewed.unique()

array(['/', '3-sql/1-mysql-overview', '2-storytelling/bad-charts',
       '2-storytelling/misleading1_baseball.jpg',
       '2-storytelling/misleading1_fox.jpg',
       '2-storytelling/misleading3_deaths.jpg',
       'appendix/cli-git-overview',
       '1-fundamentals/1.1-intro-to-data-science',
       '1-fundamentals/modern-data-scientist.jpg',
       '1-fundamentals/AI-ML-DL-timeline.jpg',
       '1-fundamentals/1.2-data-science-pipeline',
       '1-fundamentals/DataToAction_v2.jpg', 'search/search_index.json',
       '13-advanced-topics/3.7-styling-webpages',
       '1-fundamentals/1.3-pipeline-demo',
       '1-fundamentals/2.1-excel-overview', '3-vocabulary.md',
       '6-regression/1-overview', '10-anomaly-detection/1-overview',
       '10-anomaly-detection/AnomalyDetectionCartoon.jpeg',
       '11-nlp/1-overview', '6-regression/2-regression-excel',
       '6-regression/3.1-acquire-and-prep',
       '6-regression/3.2-split-and-scale', '6-regression/3.3-explore',
       '6-regressi

In [68]:
bayes.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-08-20 09:39:58,/,466,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:39:59,/,467,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:39:59,/,468,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:02,/,469,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:08,/,470,34,97.105.19.58,Bayes,2019-08-19,2020-01-30


**Bayes cohort did access web dev curriculum, but the other data science cohorts did not**

In [69]:
web_dev.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-26 09:55:03,/,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 09:56:02,java-ii,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 09:56:06,slides/object_oriented_programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06
2018-01-26 10:40:15,javascript-i/functions,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06


In [70]:
web_dev_20 = web_dev[(web_dev.index.year > 2019) & (web_dev['name'] != 'Staff')]

In [71]:
web_dev_20.name.unique()

array(['Hampton', 'Teddy', 'Sequoia', 'Arches', 'Niagara', 'Pinnacles',
       'Quincy', 'Kings', 'Lassen', 'Joshua', 'Olympic', 'Ulysses', 'Ike',
       'Voyageurs', 'Wrangell', 'Xanadu', 'Yosemite', 'Zion', 'Andromeda',
       'Betelgeuse', 'Ceres', 'Deimos', 'Europa', 'Fortuna', 'Apex',
       'Ganymede', 'Everglades', 'Hyperion', 'Bash', 'Jupiter'],
      dtype=object)

In [72]:
web_dev_20.page_viewed.unique()

array(['/', 'content/html-css',
       'content/html-css/gitbook/images/favicon.ico',
       'content/html-css/introduction.html',
       'content/html-css/elements.html',
       'content/examples/html/gitbook/images/favicon.ico',
       'content/examples/examples/html/gitbook/images/favicon.ico',
       'content/examples/examples/html', 'content/examples/html',
       'content/html-css/forms.html', 'content/javascript_ii',
       'content/javascript_ii/gitbook/images/favicon.ico',
       'content/javascript',
       'content/javascript/gitbook/images/favicon.ico',
       'content/javascript/introduction-to-javascript.html',
       'content/javascript/primitive-types.html',
       'content/javascript/javascript-with-html.html',
       'content/javascript/conditionals.html',
       'content/javascript/arrays',
       'content/javascript/arrays/gitbook/images/favicon.ico',
       'content/javascript/arrays/iterating.html',
       'content/javascript/functions.html',
       'content/javas

In [73]:
web_dev_20.name

date
2020-04-09 12:56:24    Hampton
2020-04-09 15:35:20    Hampton
2020-04-09 15:35:20    Hampton
2020-04-09 15:35:35    Hampton
2020-04-09 16:06:28    Hampton
                        ...   
2020-11-02 16:48:13    Jupiter
2020-11-02 16:48:17    Jupiter
2020-11-02 16:48:18    Jupiter
2020-11-02 16:48:28    Jupiter
2020-11-02 16:48:47    Jupiter
Name: name, Length: 194169, dtype: object

In [74]:
pd.options.display.max_seq_items = 2000


In [75]:
web_dev_20.page_viewed.unique()

array(['/', 'content/html-css',
       'content/html-css/gitbook/images/favicon.ico',
       'content/html-css/introduction.html',
       'content/html-css/elements.html',
       'content/examples/html/gitbook/images/favicon.ico',
       'content/examples/examples/html/gitbook/images/favicon.ico',
       'content/examples/examples/html', 'content/examples/html',
       'content/html-css/forms.html', 'content/javascript_ii',
       'content/javascript_ii/gitbook/images/favicon.ico',
       'content/javascript',
       'content/javascript/gitbook/images/favicon.ico',
       'content/javascript/introduction-to-javascript.html',
       'content/javascript/primitive-types.html',
       'content/javascript/javascript-with-html.html',
       'content/javascript/conditionals.html',
       'content/javascript/arrays',
       'content/javascript/arrays/gitbook/images/favicon.ico',
       'content/javascript/arrays/iterating.html',
       'content/javascript/functions.html',
       'content/javas

In [76]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [77]:
web_dev_20.page_viewed.unique()

array(['/', 'content/html-css',
       'content/html-css/gitbook/images/favicon.ico',
       'content/html-css/introduction.html',
       'content/html-css/elements.html',
       'content/examples/html/gitbook/images/favicon.ico',
       'content/examples/examples/html/gitbook/images/favicon.ico',
       'content/examples/examples/html', 'content/examples/html',
       'content/html-css/forms.html', 'content/javascript_ii',
       'content/javascript_ii/gitbook/images/favicon.ico',
       'content/javascript',
       'content/javascript/gitbook/images/favicon.ico',
       'content/javascript/introduction-to-javascript.html',
       'content/javascript/primitive-types.html',
       'content/javascript/javascript-with-html.html',
       'content/javascript/conditionals.html',
       'content/javascript/arrays',
       'content/javascript/arrays/gitbook/images/favicon.ico',
       'content/javascript/arrays/iterating.html',
       'content/javascript/functions.html',
       'content/javas

In [78]:
web_dev_20.head(100)

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-09 12:56:24,/,646,8,173.174.197.146,Hampton,2015-09-22,2016-02-06
2020-04-09 15:35:20,content/html-css,646,8,173.174.197.146,Hampton,2015-09-22,2016-02-06
2020-04-09 15:35:20,content/html-css/gitbook/images/favicon.ico,646,8,173.174.197.146,Hampton,2015-09-22,2016-02-06
2020-04-09 15:35:35,content/html-css/introduction.html,646,8,173.174.197.146,Hampton,2015-09-22,2016-02-06
2020-04-09 16:06:28,content/html-css/elements.html,646,8,173.174.197.146,Hampton,2015-09-22,2016-02-06
...,...,...,...,...,...,...,...
2020-02-10 19:11:07,javascript-ii/modules,20,22,104.190.242.242,Teddy,2018-01-08,2018-05-17
2020-02-10 19:22:00,java-i,20,22,104.190.242.242,Teddy,2018-01-08,2018-05-17
2020-02-10 19:22:12,java-ii,20,22,104.190.242.242,Teddy,2018-01-08,2018-05-17
2020-02-10 19:22:21,java-ii/inheritance-and-polymorphism,20,22,104.190.242.242,Teddy,2018-01-08,2018-05-17


In [79]:
web_dev_20[web_dev_20.page_viewed.str.contains('python')]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 21:23:47,4-python/7.2-intro-to-matplotlib,18,22,45.20.117.182,Teddy,2018-01-08,2018-05-17
2020-01-01 21:24:04,4-python/7.3-intro-to-numpy,18,22,45.20.117.182,Teddy,2018-01-08,2018-05-17
2020-01-13 15:03:01,4-python/1-overview,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-01-13 15:03:29,4-python/2-introduction-to-python,410,32,172.58.107.0,Betelgeuse,2019-05-28,2019-10-08
2020-09-13 18:51:46,python/control-structures,627,57,72.179.164.139,Ganymede,2020-03-23,2020-08-20
2020-11-01 02:16:30,python/overview,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:17:00,python/introduction-to-python,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:17:30,python/data-types-and-variables,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:18:00,python/control-structures,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21
2020-11-01 02:18:30,python/functions,730,61,68.203.188.247,Bash,2020-07-20,2021-01-21


In [80]:
perp = web_dev_20[web_dev_20.user_id == 730]

In [81]:
perp.page_viewed.value_counts()

/                                         26
javascript-i                              17
jquery                                    16
html-css/css-ii/bootstrap-introduction    13
search/search_index.json                  13
                                          ..
html-css/css-i/positioning                 1
fundamentals/cli/intro                     1
html-css/introduction                      1
regression/explore                         1
clustering/overview                        1
Name: page_viewed, Length: 281, dtype: int64

# What topics are grads continuing to reference after graduation and into their jobs (for each program)?

In [82]:
bayes.head(10)

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-08-20 09:39:58,/,466,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:39:59,/,467,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:39:59,/,468,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:02,/,469,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:08,/,470,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:15,/,471,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:15,/,472,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:17,/,473,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:18,/,474,34,97.105.19.58,Bayes,2019-08-19,2020-01-30
2019-08-20 09:40:19,/,475,34,97.105.19.58,Bayes,2019-08-19,2020-01-30


In [83]:
bayes_after = bayes.sort_index(ascending=False).head(3000)

In [84]:
bayes_after.shape

(3000, 7)

In [85]:
bayes_after.index.min()

Timestamp('2020-02-07 17:41:37')

In [86]:
bayes_after.page_viewed.value_counts().head(20)

/                                                    334
search/search_index.json                             139
1-fundamentals/1.1-intro-to-data-science              78
1-fundamentals/AI-ML-DL-timeline.jpg                  78
1-fundamentals/modern-data-scientist.jpg              78
6-regression/1-overview                               63
10-anomaly-detection/AnomalyDetectionCartoon.jpeg     36
10-anomaly-detection/1-overview                       36
3-sql/1-mysql-overview                                34
7-classification/3-prep                               32
12-distributed-ml/3-spark-api                         31
7-classification/6.1-logistic-regression              30
6-regression/7.0-model                                30
7-classification/4-explore                            29
7-classification/6.2-decision-trees                   28
6-regression/2.0-acquire-and-prep                     28
6-regression/5.0-evaluate                             28
5-stats/4.2-compare-means      

In [87]:
curie_after = curie.sort_index(ascending=False).head(2000)

In [88]:
curie_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-02 15:57:29,fundamentals/git,616,55,70.114.9.241,Curie,2020-02-03,2020-07-07
2020-11-02 15:57:25,search/search_index.json,616,55,70.114.9.241,Curie,2020-02-03,2020-07-07
2020-11-02 15:57:22,/,616,55,70.114.9.241,Curie,2020-02-03,2020-07-07
2020-11-02 15:33:27,advanced-topics/cross-validation,581,55,70.112.179.142,Curie,2020-02-03,2020-07-07
2020-11-02 15:33:20,search/search_index.json,581,55,70.112.179.142,Curie,2020-02-03,2020-07-07


In [89]:
curie_after.index.min()

Timestamp('2020-07-22 15:25:15')

In [90]:
curie_after.page_viewed.value_counts().head(20)

/                                                 297
sql/mysql-overview                                 99
search/search_index.json                           92
classification/overview                            91
fundamentals/AI-ML-DL-timeline.jpg                 59
fundamentals/modern-data-scientist.jpg             58
fundamentals/intro-to-data-science                 56
classification/scale_features_or_not.svg           46
sql/database-design                                40
anomaly-detection/AnomalyDetectionCartoon.jpeg     36
anomaly-detection/overview                         34
timeseries/prep                                    24
timeseries/acquire                                 23
classification/prep                                21
4-python/5-functions                               21
sql/temporary-tables                               20
python/dataframes                                  19
sql/functions                                      19
sql/databases               

In [91]:
web20 = web_dev[web_dev.end_date.str.contains('2020')]

In [92]:
web20.name.value_counts()

Fortuna     36047
Ganymede    32447
Apex        31670
Deimos      29688
Hyperion    28354
Europa      27518
Name: name, dtype: int64

In [93]:
deimos = web20[web20.name == 'Deimos']

In [94]:
deimos_after = deimos.sort_index(ascending=False).head(4000)

In [95]:
deimos_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-02 11:55:27,javascript-i/functions,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:53:21,javascript-i/javascript-with-html,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:53:17,javascript-i/introduction/working-with-data-ty...,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:52:52,javascript-i/javascript-with-html,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27
2020-11-02 11:52:45,javascript-i/introduction/working-with-data-ty...,495,51,72.191.58.18,Deimos,2019-09-16,2020-02-27


In [96]:
deimos_after.index.min()

Timestamp('2020-03-28 21:48:34')

In [97]:
deimos_after.page_viewed.value_counts().head(20)

/                           358
search/search_index.json    188
javascript-i                129
html-css                    127
spring                      122
toc                         112
appendix                    112
mysql                       102
java-ii                      90
javascript-ii                84
jquery                       82
java-iii                     80
java-i                       57
mysql/users                  55
mysql/basic-statements       51
spring/setup                 50
mysql/sample-database        45
mysql/databases              40
mysql/introduction           37
mysql/intellij               37
Name: page_viewed, dtype: int64

In [98]:
europa = web20[web20.name == 'Europa']

In [99]:
europa_after = europa.sort_index(ascending=False).head(1500)

In [100]:
europa_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-10-31 21:57:29,/,533,52,38.70.11.18,Europa,2019-11-04,2020-04-17
2020-10-30 21:05:14,javascript-ii/RESTful-api,538,52,108.196.159.214,Europa,2019-11-04,2020-04-17
2020-10-30 21:05:12,javascript-ii,538,52,108.196.159.214,Europa,2019-11-04,2020-04-17
2020-10-30 21:05:09,/,538,52,108.196.159.214,Europa,2019-11-04,2020-04-17
2020-10-29 17:06:50,java-iii/jsp-and-jstl,525,52,72.179.177.9,Europa,2019-11-04,2020-04-17


In [101]:
europa_after.index.min()

Timestamp('2020-04-28 11:02:04')

In [102]:
europa_after.page_viewed.value_counts().head(20)

/                                                              193
appendix                                                        72
toc                                                             60
appendix/professional-development/mock-behavioral-questions     45
html-css/css-i/selectors-and-properties                         40
javascript-i                                                    37
javascript-ii                                                   32
java-i                                                          32
appendix/professional-development/t-block-resume                30
html-css                                                        28
search/search_index.json                                        25
spring                                                          25
java-ii                                                         22
appendix/professional-development/cover-letter                  18
html-css/forms                                                

In [103]:
apex = web20[web20.name == 'Apex']

In [104]:
apex_after = apex.sort_index(ascending=False).head(1500)

In [105]:
apex_after.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-11-02 12:12:08,java-i/introduction-to-java,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-11-02 12:12:05,java-i,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-11-01 19:26:05,capstone/sw-project-planning,608,56,76.218.3.43,Apex,2020-02-24,2020-07-29
2020-10-31 22:16:29,java-ii/file-io,612,56,172.14.176.83,Apex,2020-02-24,2020-07-29
2020-10-31 22:16:27,java-ii,612,56,172.14.176.83,Apex,2020-02-24,2020-07-29


In [106]:
apex_after.index.min()

Timestamp('2020-08-20 13:40:49')

In [107]:
apex_after.page_viewed.value_counts().head(10)

search/search_index.json            92
spring                              78
/                                   77
java-ii                             52
java-i                              44
spring/fundamentals/controllers     41
java-iii                            41
html-css                            40
spring/fundamentals/repositories    32
mysql                               32
Name: page_viewed, dtype: int64

In [108]:
web_ip = web_dev.ip.unique()

In [109]:
web_ip.shape

(3522,)

In [110]:
web_ip = pd.DataFrame(web_ip)

In [111]:
web_ip.rename(columns={0: 'ip'}, inplace=True)

In [112]:
d3 = []

In [113]:
# for index, row in web_ip.iterrows():
#     location = row['ip']
#     url = "https://free-geo-ip.p.rapidapi.com/json/" + location

#     headers = {
#         'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
#         'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
#         }

#     response = requests.request("GET", url, headers=headers)

#     data = response.json()
#     d3.append(
#         {
#             'ip': row['ip'],
#             'city': data['city'],
#             'region': data['region_name']
            
#         }
#     )

# d3 = pd.DataFrame(d3)

In [114]:
europa_ip = europa.ip.unique()

In [115]:
europa_ip = pd.DataFrame(europa_ip)

In [116]:
europa_ip.rename(columns={0: 'ip'}, inplace=True)

In [117]:
for index, row in europa_ip.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d3.append(
        {
            'ip': row['ip'],
            'city': data['city'],
            'region': data['region_name']
            
        }
    )

d3 = pd.DataFrame(d3)

In [119]:
d3.region.value_counts()

Texas          209
                87
Illinois         6
Florida          3
New Jersey       2
Virginia         1
Mississippi      1
Name: region, dtype: int64