In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses? Any odd user-agents?

In [2]:
import requests

In [3]:
df = pd.read_csv('curriculum.txt',
                engine='python',
                 header=None,
                 index_col=False,
                 sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                 na_values='"-"',)

In [4]:
df.columns = ['date', 'time', 'page_viewed', 'user_id', 'cohort_id', 'ip']

In [5]:
cohort = pd.read_csv('cohort_name.csv')

In [6]:
cohort.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,cohort_id,name,start_date,end_date,program_id,
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,


In [7]:
cohort.columns = cohort.iloc[0]

In [8]:
cohort = cohort.iloc[1:]

In [9]:
cohort.head()

Unnamed: 0,NaN,cohort_id,name,start_date,end_date,program_id,NaN.1
1,,1,Arches,2014-02-04,2014-04-22,1,
2,,2,Badlands,2014-06-04,2014-08-22,1,
3,,3,Carlsbad,2014-09-04,2014-11-05,1,
4,,4,Denali,2014-10-20,2015-01-18,1,
5,,5,Everglades,2014-11-18,2015-02-24,1,


In [10]:
cohort = cohort[['cohort_id', 'name', 'start_date', 'end_date']]

In [11]:
cohort.cohort_id = cohort.cohort_id.astype('int')


In [12]:
result = pd.merge(df, cohort, on='cohort_id')

In [13]:
result.cohort_id = result.cohort_id.astype('int')
result['date'] = result.date + " " + result.time
result.drop(columns=('time'), inplace=True)
result.date = pd.to_datetime(result.date)
result = result.set_index('date')

In [14]:
data_science = result[(result.name == 'Curie') | (result.name == 'Bayes') | (result.name == 'Ada') | (result.name == 'Darden')]

In [15]:
web_dev = result[(result.name != 'Curie') & (result.name != 'Bayes') & (result.name != 'Ada') & (result.name != 'Darden')]

In [16]:
bayes = data_science[data_science.name == 'Bayes']

In [17]:
darden = data_science[data_science.name == 'Darden']

In [18]:
curie = data_science[data_science.name == 'Curie']

In [19]:
# for index, row in darden.iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[5:6])
#     print(res[5:6])

In [20]:
ips = darden.ip.unique()

In [21]:
ips.shape

(155,)

In [22]:
ips = pd.DataFrame(ips)

In [23]:
ips

Unnamed: 0,0
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249
...,...
150,172.58.110.195
151,172.58.67.160
152,107.13.184.11
153,72.128.139.130


In [24]:
ips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       155 non-null    object
dtypes: object(1)
memory usage: 1.3+ KB


In [25]:
ips.rename(columns={0: 'ip'}, inplace=True)

In [26]:
ips

Unnamed: 0,ip
0,76.201.20.193
1,24.28.146.155
2,136.50.56.155
3,108.239.188.205
4,68.54.110.249
...,...
150,172.58.110.195
151,172.58.67.160
152,107.13.184.11
153,72.128.139.130


In [28]:

# for index, row in ips.iloc[:45].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

function for a single value, with response data. (.apply)

In [None]:
darden.shape

In [None]:
# for index, row in ips.iloc[46:90].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [None]:
# for index, row in ips.iloc[91:130].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [None]:
# for index, row in ips.iloc[131:180].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [None]:
# for index, row in ips.iloc[181:220].iterrows():
#     location = row['ip']
#     url = 'http://ip-api.com/csv/' + location
#     response = requests.get(url)
#     res = response.text.split(',')
#     location = []
#     location.append(res[4:6])
#     print(row['ip'],res[4:6])

In [None]:
darden.shape

In [30]:
d = []

In [32]:
import requests

for index, row in ips.iterrows():
    location = row['ip']
    url = "https://free-geo-ip.p.rapidapi.com/json/" + location

    headers = {
        'x-rapidapi-key': "e971744d3fmshe67b5c357e7ec4ap1aa107jsn4a07ff9a4175",
        'x-rapidapi-host': "free-geo-ip.p.rapidapi.com"
        }

    response = requests.request("GET", url, headers=headers)

    data = response.json()
    d.append(
        {
            'ip': row['ip'],
            'city': data['city']
            
        }
    )

pd.DataFrame(d)

Unnamed: 0,ip,city
0,76.201.20.193,Austin
1,24.28.146.155,San Antonio
2,136.50.56.155,San Antonio
3,108.239.188.205,San Antonio
4,68.54.110.249,Burnsville
...,...,...
150,172.58.110.195,Dallas
151,172.58.67.160,Irving
152,107.13.184.11,Raleigh
153,72.128.139.130,San Antonio


In [33]:
d

[{'ip': '76.201.20.193', 'city': 'Austin'},
 {'ip': '24.28.146.155', 'city': 'San Antonio'},
 {'ip': '136.50.56.155', 'city': 'San Antonio'},
 {'ip': '108.239.188.205', 'city': 'San Antonio'},
 {'ip': '68.54.110.249', 'city': 'Burnsville'},
 {'ip': '173.174.165.12', 'city': 'San Antonio'},
 {'ip': '70.120.16.59', 'city': 'San Antonio'},
 {'ip': '99.76.233.212', 'city': 'San Antonio'},
 {'ip': '72.177.148.181', 'city': 'San Antonio'},
 {'ip': '99.132.128.255', 'city': 'San Antonio'},
 {'ip': '70.125.137.220', 'city': 'San Antonio'},
 {'ip': '72.181.127.233', 'city': 'San Antonio'},
 {'ip': '67.11.40.170', 'city': 'San Antonio'},
 {'ip': '66.69.83.140', 'city': 'Boerne'},
 {'ip': '96.8.130.201', 'city': 'San Antonio'},
 {'ip': '136.50.3.247', 'city': 'San Antonio'},
 {'ip': '96.8.130.186', 'city': 'San Antonio'},
 {'ip': '162.197.239.66', 'city': 'San Antonio'},
 {'ip': '173.174.200.226', 'city': 'San Antonio'},
 {'ip': '67.11.85.236', 'city': 'San Antonio'},
 {'ip': '136.50.70.27', 'cit

In [35]:
d = pd.DataFrame(d)

In [36]:
d

Unnamed: 0,ip,city
0,76.201.20.193,Austin
1,24.28.146.155,San Antonio
2,136.50.56.155,San Antonio
3,108.239.188.205,San Antonio
4,68.54.110.249,Burnsville
...,...,...
150,172.58.110.195,Dallas
151,172.58.67.160,Irving
152,107.13.184.11,Raleigh
153,72.128.139.130,San Antonio


In [41]:
d.groupby('city')[['ip']].agg('count').sort_values(by='ip',ascending=False)

Unnamed: 0_level_0,ip
city,Unnamed: 1_level_1
Houston,77
San Antonio,47
Dallas,6
Austin,3
Cypress,2
Irving,2
,1
Marshall,1
Royse City,1
Raleigh,1


In [46]:
result = pd.merge(darden, d, on='ip', how='left')

In [47]:
result.head(20)

Unnamed: 0,page_viewed,user_id,cohort_id,ip,name,start_date,end_date,city
0,/,678,59,76.201.20.193,Darden,2020-07-13,2021-01-12,Austin
1,/,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio
2,/,680,59,136.50.56.155,Darden,2020-07-13,2021-01-12,San Antonio
3,13-advanced-topics/1-tidy-data,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio
4,1-fundamentals/1.1-intro-to-data-science,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio
5,1-fundamentals/AI-ML-DL-timeline.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio
6,1-fundamentals/modern-data-scientist.jpg,679,59,24.28.146.155,Darden,2020-07-13,2021-01-12,San Antonio
7,/,681,59,108.239.188.205,Darden,2020-07-13,2021-01-12,San Antonio
8,/,682,59,68.54.110.249,Darden,2020-07-13,2021-01-12,Burnsville
9,/,683,59,173.174.165.12,Darden,2020-07-13,2021-01-12,San Antonio
