In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

In [2]:
# This is to make sure matplotlib doesn't throw the following error:
# The next line fixes "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

In [7]:
# colnames=['ip', 'timestamp', 'request_method', 'status', 'size',
#           'destination', 'request_agent']
# df_orig = pd.read_csv('anonymized-curriculum-access.txt',          
#                  engine='python',
#                  header=None,
#                  index_col=False,
#                  names=colnames,
#                  sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
#                  na_values='"-"',
#                  usecols=[0, 3, 4, 5, 6, 7, 8]
# )

In [21]:
df = pd.read_csv('curriculum.txt',
                engine='python',
                 header=None,
                 index_col=False,
                 sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                 na_values='"-"',)




In [22]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [23]:
df.columns = ['date', 'time', 'page_viewed', 'user_id', 'cohort_id', 'ip']

In [24]:
df.head()

Unnamed: 0,date,time,page_viewed,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719459 entries, 0 to 719458
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   date         719459 non-null  object 
 1   time         719459 non-null  object 
 2   page_viewed  719458 non-null  object 
 3   user_id      719459 non-null  int64  
 4   cohort_id    674619 non-null  float64
 5   ip           719459 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 32.9+ MB


In [26]:
df.dropna(inplace=True)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 674618 entries, 0 to 719458
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   date         674618 non-null  object 
 1   time         674618 non-null  object 
 2   page_viewed  674618 non-null  object 
 3   user_id      674618 non-null  int64  
 4   cohort_id    674618 non-null  float64
 5   ip           674618 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 36.0+ MB


In [28]:
df.cohort_id = df.cohort_id.astype('int')


In [29]:
df['date'] = df.date + df.time

In [30]:
df.head()

Unnamed: 0,date,time,page_viewed,user_id,cohort_id,ip
0,2018-01-2609:55:03,09:55:03,/,1,8,97.105.19.61
1,2018-01-2609:56:02,09:56:02,java-ii,1,8,97.105.19.61
2,2018-01-2609:56:05,09:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61
3,2018-01-2609:56:06,09:56:06,slides/object_oriented_programming,1,8,97.105.19.61
4,2018-01-2609:56:24,09:56:24,javascript-i/conditionals,2,22,97.105.19.61


In [31]:
df.drop(columns=('time'), inplace=True)

In [32]:
df.head()

Unnamed: 0,date,page_viewed,user_id,cohort_id,ip
0,2018-01-2609:55:03,/,1,8,97.105.19.61
1,2018-01-2609:56:02,java-ii,1,8,97.105.19.61
2,2018-01-2609:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61
3,2018-01-2609:56:06,slides/object_oriented_programming,1,8,97.105.19.61
4,2018-01-2609:56:24,javascript-i/conditionals,2,22,97.105.19.61


In [33]:
df['page_viewed'].value_counts()

/                                                                                                                                                                                                                                                                          36074
javascript-i                                                                                                                                                                                                                                                               13912
search/search_index.json                                                                                                                                                                                                                                                   13602
toc                                                                                                                                                                                  