In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

Is there any suspicious activity, such as users, machines, etc. accessing the curriculum who shouldn’t be? Does it appear that any web scraping is happening? Are there any suspicious IP addresses? Any odd user agents?

In [2]:
import requests

In [3]:
# Load the raw access log (no header row in the file).
# The separator regex splits on a whitespace character only when it is followed
# by an even number of double quotes up to the end of the line and is not
# inside a [...] pair, so quoted and bracketed fields stay in one piece.
# The literal token "-" (including its quotes) is read as missing.
df = pd.read_csv(
    'curriculum.txt',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    header=None,
    index_col=False,
    na_values='"-"',
)

In [4]:
# Label the six positional columns of the headerless access log.
df.columns = [
    'date',
    'time',
    'page_viewed',
    'user_id',
    'cohort_id',
    'ip',
]

In [5]:
# Read the cohort lookup table as-is. With the default header handling the real
# column names (cohort_id, name, start_date, ...) end up in data row 0 and are
# promoted to the header in the following cells.
cohort = pd.read_csv(filepath_or_buffer='cohort_name.csv')

In [6]:
cohort.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,cohort_id,name,start_date,end_date,program_id,
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,


In [7]:
# Row 0 holds the real column names; use it as the header.
header_row = cohort.iloc[0]
cohort.columns = header_row

In [8]:
# Drop the first row, which duplicated the header, and keep everything below it.
cohort = cohort.iloc[1:, :]

In [9]:
cohort.head()

Unnamed: 0,NaN,cohort_id,name,start_date,end_date,program_id,NaN.1
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,
5,,5,Everglades,2014-11-18,2015-02-24,1,


In [10]:
# Keep only the identifying columns; this discards the unnamed (NaN) columns
# and program_id.
cohort = cohort.loc[:, ['cohort_id', 'name', 'start_date', 'end_date']]

In [11]:
# Cast cohort_id to int so it can be used as the join key.
# `assign` builds a new frame instead of setting an attribute on a frame that
# came out of a column selection; the attribute form triggers pandas'
# SettingWithCopyWarning, which the global warnings filter in this notebook
# would hide.
cohort = cohort.assign(cohort_id=cohort['cohort_id'].astype('int'))


In [12]:
# Attach cohort name and dates to every log row. This is an inner join (the
# pandas default), so log rows whose cohort_id has no match in `cohort` are
# dropped.
result = df.merge(cohort, on='cohort_id')

In [13]:
# Tidy the merged log in one chain:
#   * cohort_id becomes an integer,
#   * the separate date and time strings are combined into one timestamp,
#   * the now-redundant time column is dropped,
#   * the timestamp becomes the index.
result = (
    result
    .assign(
        cohort_id=result['cohort_id'].astype('int'),
        date=pd.to_datetime(result['date'] + " " + result['time']),
    )
    .drop(columns='time')
    .set_index('date')
)

In [14]:
# Log rows whose cohort name is one of the four cohorts grouped as data science.
data_science = result[result['name'].isin(['Curie', 'Bayes', 'Ada', 'Darden'])]

In [15]:
# Every log row whose cohort name is NOT Curie, Bayes, Ada or Darden.
web_dev = result[~result['name'].isin(['Curie', 'Bayes', 'Ada', 'Darden'])]

In [16]:
# Log rows for the Bayes cohort only.
bayes = data_science.loc[data_science['name'].eq('Bayes')]

In [17]:
# Log rows for the Darden cohort only.
darden = data_science.loc[data_science['name'].eq('Darden')]

In [18]:
# Log rows for the Curie cohort only.
curie = data_science.loc[data_science['name'].eq('Curie')]

In [19]:
# Disabled first attempt at geolocating IPs through ip-api.com, kept for reference.
# NOTE(review): this loops over every row of `darden`, so it would issue one
# request per log line rather than one per unique IP address.
# NOTE(review): `location` is first the IP string and is then rebound to a list.
# for index, row in darden.iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[5:6])
#     print(res[5:6])

In [20]:
# Distinct IP addresses seen in the Darden cohort's log rows (NumPy array).
ips = darden['ip'].unique()

In [21]:
ips.shape

(155,)

In [22]:
# Wrap the array of unique IPs in a one-column DataFrame (the column is labelled 0).
ips = pd.DataFrame(data=ips)

In [23]:
ips

Unnamed: 0,0
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249
...,...
150,172.58.110.195
151,172.58.67.160
152,107.13.184.11
153,72.128.139.130


In [24]:
ips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       155 non-null    object
dtypes: object(1)
memory usage: 1.3+ KB


In [25]:
# Give the single column a meaningful label instead of the default 0.
ips = ips.rename(columns={0: 'ip'})

In [26]:
ips

Unnamed: 0,ip
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249
...,...
150,172.58.110.195
151,172.58.67.160
152,107.13.184.11
153,72.128.139.130


In [27]:

# Disabled batch lookup against ip-api.com for the first slice of `ips`.
# NOTE(review): `iloc[:45]` covers positions 0-44 only; the stop is exclusive.
# Presumably batched to stay under an ip-api.com rate limit - TODO confirm.
# for index, row in ips.iloc[:45].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

TODO: write a function that looks up a single IP address and returns the response data, then run it over the column with `.apply`.

In [28]:
darden.shape

(22425, 7)

In [29]:
# Disabled batch lookup against ip-api.com for positions 46-89 of `ips`.
# NOTE(review): a batch that ends at `iloc[:45]` stops at position 44, so
# starting this one at 46 leaves position 45 unqueried.
# for index, row in ips.iloc[46:90].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [30]:
# Disabled batch lookup against ip-api.com for positions 91-129 of `ips`.
# NOTE(review): a batch that ends at `iloc[46:90]` stops at position 89, so
# starting this one at 91 leaves position 90 unqueried.
# for index, row in ips.iloc[91:130].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [31]:
# Disabled batch lookup against ip-api.com for positions 131 onward of `ips`.
# NOTE(review): starting at 131 leaves position 130 unqueried. `ips` has 155
# rows, so the stop of 180 simply runs to the end of the frame.
# for index, row in ips.iloc[131:180].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [32]:
# Disabled batch lookup against ip-api.com for positions 181-219 of `ips`.
# NOTE(review): `ips` has only 155 rows, so this slice is empty and the loop
# body would never run.
# for index, row in ips.iloc[181:220].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [33]:
darden.shape

(22425, 7)

In [None]:
# Empty accumulator for the per-IP geolocation records collected below.
d = list()

In [None]:
import os

import requests

# Geolocate every unique Darden IP through the free-geo-ip RapidAPI endpoint
# and append one {'ip', 'city', 'region'} record per address to the list `d`.
#
# The RapidAPI key is read from the RAPIDAPI_KEY environment variable rather
# than being hardcoded in the notebook (raises KeyError if it is not set).
# The key that used to be embedded here should be treated as leaked and rotated.
headers = {
    'x-rapidapi-key': os.environ['RAPIDAPI_KEY'],
    'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
}

for location in ips['ip']:
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    # A timeout keeps one stalled lookup from hanging the whole cell.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail on HTTP errors (bad key, rate limiting, ...) with a clear message
    # instead of a KeyError on the error payload further down.
    response.raise_for_status()

    # `data` is left in the namespace on purpose: a later cell inspects
    # data.keys() to see which fields the API returns.
    data = response.json()
    d.append(
        {
            'ip': location,
            'city': data['city'],
            'region': data['region_name']
        }
    )

pd.DataFrame(d)

In [57]:
data.keys()

dict_keys(['ip', 'country_code', 'country_name', 'region_code', 'region_name', 'city', 'zip_code', 'time_zone', 'latitude', 'longitude', 'metro_code'])

In [None]:
# Rebind `d` from the list of lookup records to a DataFrame
# (one row per IP: ip, city, region).
d = pd.DataFrame(data=d)

In [None]:
d

In [None]:
# Number of IPs per region, most common region first.
d.groupby('region')[['ip']].count().sort_values('ip', ascending=False)

In [None]:
# Attach the city/region looked up for each IP to every Darden log row.
#
# Merging on a column discards the left frame's index, which would throw away
# the timestamp index of `darden`. The index is therefore moved into a 'date'
# column for the merge and restored afterwards.
#
# validate='many_to_one' raises a MergeError if `d` contains the same IP more
# than once (for example, the lookup cell was run twice without resetting `d`).
# Without it, such duplicates would silently multiply log rows.
#
# NOTE(review): this rebinds `result`, which previously held the full merged log.
result = (
    darden
    .reset_index()
    .merge(d, on='ip', how='left', validate='many_to_one')
    .set_index('date')
)

In [None]:
result.head(20)

In [34]:
# Distinct IP addresses across all data-science log rows (NumPy array).
ds_ip = data_science['ip'].unique()

In [35]:
ds_ip.shape

(594,)

In [36]:
# Wrap the array of unique IPs in a one-column DataFrame (the column is labelled 0).
ds_ip = pd.DataFrame(data=ds_ip)

In [37]:
ds_ip.head()

Unnamed: 0,0
0,97.105.19.58
1,67.11.115.125
2,172.8.173.224
3,173.175.104.33
4,12.197.196.242


In [38]:
# Give the single column a meaningful label instead of the default 0.
ds_ip = ds_ip.rename(columns={0: 'ip'})

In [40]:
# Reset the accumulator before collecting records for the data-science IPs.
d = list()

In [41]:
import os

import requests

# Geolocate every unique data-science IP through the free-geo-ip RapidAPI
# endpoint and append one {'ip', 'city', 'region'} record per address to `d`.
#
# The RapidAPI key is read from the RAPIDAPI_KEY environment variable rather
# than being hardcoded in the notebook (raises KeyError if it is not set).
# The key that used to be embedded here should be treated as leaked and rotated.
headers = {
    'x-rapidapi-key': os.environ['RAPIDAPI_KEY'],
    'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
}

for location in ds_ip['ip']:
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    # A timeout keeps one stalled lookup from hanging the whole cell.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail on HTTP errors (bad key, rate limiting, ...) with a clear message
    # instead of a KeyError on the error payload further down.
    response.raise_for_status()

    # `data` stays in the namespace; another cell inspects data.keys().
    data = response.json()
    d.append(
        {
            'ip': location,
            'city': data['city'],
            'region': data['region_name']
        }
    )

pd.DataFrame(d)

Unnamed: 0,ip,city,region
0,97.105.19.58,Fredericksburg,Texas
1,67.11.115.125,San Antonio,Texas
2,172.8.173.224,San Antonio,Texas
3,173.175.104.33,Pharr,Texas
4,12.197.196.242,San Antonio,Texas
...,...,...,...
589,172.58.110.195,Dallas,Texas
590,172.58.67.160,Irving,Texas
591,107.13.184.11,Raleigh,North Carolina
592,72.128.139.130,San Antonio,Texas


In [43]:
# Rebind `d` from the list of lookup records to a DataFrame
# (one row per IP: ip, city, region).
d = pd.DataFrame(data=d)

In [44]:
d.head()

Unnamed: 0,ip,city,region
0,97.105.19.58,Fredericksburg,Texas
1,67.11.115.125,San Antonio,Texas
2,172.8.173.224,San Antonio,Texas
3,173.175.104.33,Pharr,Texas
4,12.197.196.242,San Antonio,Texas


In [45]:
# Number of IPs per region, most common region first.
d.groupby('region')[['ip']].count().sort_values('ip', ascending=False)

Unnamed: 0_level_0,ip
region,Unnamed: 1_level_1
Texas,543
,14
Ontario,6
California,4
Jalisco,3
Massachusetts,2
Queensland,2
Florida,2
North Carolina,2
Arizona,2


In [46]:
# IPs that geolocate to Queensland.
d.loc[d['region'].eq('Queensland')]

Unnamed: 0,ip,city,region
99,45.248.77.99,Brisbane,Queensland
106,103.137.12.164,Brisbane,Queensland


In [47]:
data_science.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 68017 entries, 2019-08-20 09:39:58 to 2020-11-02 15:52:23
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   page_viewed  68016 non-null  object
 1   user_id      68017 non-null  int64 
 2   cohort_id    68017 non-null  int64 
 3   ip           68017 non-null  object
 4   name         68017 non-null  object
 5   start_date   68017 non-null  object
 6   end_date     68017 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.2+ MB


In [48]:
# All data-science log rows that came from one of the Queensland IPs.
data_science.loc[data_science['ip'].eq('103.137.12.164')]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-12 10:13:03,6-regression/1-overview,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:13:23,7-classification/6.2-decision-trees,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:47,10-anomaly-detection/1-overview,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:48,10-anomaly-detection/AnomalyDetectionCartoon.jpeg,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:53,11-nlp/project,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:54,11-nlp/github_repo_language.gif,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 10:24:56,11-nlp/6-model,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 11:05:10,11-nlp/4-prepare,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30
2019-12-12 11:20:50,11-nlp/6-model,469,34,103.137.12.164,Bayes,2019-08-19,2020-01-30


In [53]:
# Every data-science log row for user 469, the account seen on 103.137.12.164.
perp1 = data_science.loc[data_science['user_id'].eq(469)]

In [54]:
# How many requests user 469 made from each IP address.
perp1['ip'].value_counts()

97.105.19.58       749
67.11.115.125      192
196.247.56.62       96
162.219.176.244     46
185.145.38.235      41
68.206.101.245      38
172.98.66.16        24
196.196.192.52      18
89.187.175.105      15
104.200.138.33      13
173.232.243.3       11
103.137.12.164       9
184.75.208.254       9
104.254.95.84        8
185.153.179.81       7
45.248.77.99         6
129.115.195.45       6
184.75.223.44        5
172.98.66.4          4
89.187.175.48        1
Name: ip, dtype: int64

In [55]:
# IPs that geolocate to North Rhine-Westphalia.
d.loc[d['region'].eq('North Rhine-Westphalia')]

Unnamed: 0,ip,city,region
98,185.145.38.235,Cologne,North Rhine-Westphalia


In [56]:
# All data-science log rows that came from the Cologne IP.
data_science.loc[data_science['ip'].eq('185.145.38.235')]

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-12-08 11:14:46,1-fundamentals/1.1-intro-to-data-science,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:14:46,1-fundamentals/modern-data-scientist.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:14:47,1-fundamentals/AI-ML-DL-timeline.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:57:39,1-fundamentals/1.2-data-science-pipeline,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 11:57:39,1-fundamentals/DataToAction_v2.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:51,1-fundamentals/1.1-intro-to-data-science,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:52,1-fundamentals/modern-data-scientist.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:02:52,1-fundamentals/AI-ML-DL-timeline.jpg,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:08:51,1-fundamentals/3-vocabulary,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30
2019-12-08 12:14:52,10-anomaly-detection/1-overview,469,34,185.145.38.235,Bayes,2019-08-19,2020-01-30


In [59]:
# Empty accumulator for geolocation records that also carry the country.
d2 = list()

In [60]:
import os

import requests

# Geolocate every unique data-science IP through the free-geo-ip RapidAPI
# endpoint and append one {'ip', 'city', 'region', 'country'} record per
# address to `d2`.
# NOTE(review): this repeats the lookups already made for `d`, only to add the
# country field.
#
# The RapidAPI key is read from the RAPIDAPI_KEY environment variable rather
# than being hardcoded in the notebook (raises KeyError if it is not set).
# The key that used to be embedded here should be treated as leaked and rotated.
headers = {
    'x-rapidapi-key': os.environ['RAPIDAPI_KEY'],
    'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
}

for location in ds_ip['ip']:
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    # A timeout keeps one stalled lookup from hanging the whole cell.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail on HTTP errors (bad key, rate limiting, ...) with a clear message
    # instead of a KeyError on the error payload further down.
    response.raise_for_status()

    # `data` stays in the namespace; another cell inspects data.keys().
    data = response.json()
    d2.append(
        {
            'ip': location,
            'city': data['city'],
            'region': data['region_name'],
            'country': data['country_name']
        }
    )

pd.DataFrame(d2)

Unnamed: 0,ip,city,region,country
0,97.105.19.58,Fredericksburg,Texas,United States
1,67.11.115.125,San Antonio,Texas,United States
2,172.8.173.224,San Antonio,Texas,United States
3,173.175.104.33,Pharr,Texas,United States
4,12.197.196.242,San Antonio,Texas,United States
...,...,...,...,...
589,172.58.110.195,Dallas,Texas,United States
590,172.58.67.160,Irving,Texas,United States
591,107.13.184.11,Raleigh,North Carolina,United States
592,72.128.139.130,San Antonio,Texas,United States


In [61]:
# Rebind `d2` from the list of lookup records to a DataFrame
# (one row per IP: ip, city, region, country).
d2 = pd.DataFrame(data=d2)

In [62]:
d2.head()

Unnamed: 0,ip,city,region,country
0,97.105.19.58,Fredericksburg,Texas,United States
1,67.11.115.125,San Antonio,Texas,United States
2,172.8.173.224,San Antonio,Texas,United States
3,173.175.104.33,Pharr,Texas,United States
4,12.197.196.242,San Antonio,Texas,United States


In [63]:
# Number of distinct data-science IPs per country.
d2['country'].value_counts()

United States    576
Canada             9
Mexico             4
Australia          2
Germany            1
Ireland            1
China              1
Name: country, dtype: int64