In [1]:
# from __future__ import division
import itertools

# To get rid of those blocks of red warnings
import warnings
warnings.filterwarnings("ignore")

# Standard Imports
import numpy as np
from scipy import stats
import pandas as pd
from math import sqrt
import os
from scipy.stats import spearmanr
from sklearn import metrics
from random import randint

# Vis Imports
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import plotly.express as px
from pandas.plotting import register_matplotlib_converters
from mpl_toolkits.mplot3d import Axes3D

# Modeling Imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import sklearn.preprocessing
import statsmodels.api as sm
from sklearn.cluster import DBSCAN

# Custom Module Imports
import env

In [10]:
def acquire():
    """
    This function acquires the data from the codeup database
    and save it into a dataframe.
    """
    url = f'mysql+pymysql://{env.user}:{env.password}@{env.host}/curriculum_logs'
    query = '''
    SELECT *
    FROM logs
    LEFT JOIN cohorts ON logs.cohort_id=cohorts.id;
    '''
    df = pd.read_sql(query, url)
    return df

In [11]:
df = acquire()

In [12]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0


In [13]:
df.shape

(900223, 15)

In [14]:
df.isnull().sum()

date               0
time               0
path               1
user_id            0
cohort_id      52893
ip                 0
id             52893
name           52893
slack          52893
start_date     52893
end_date       52893
created_at     52893
updated_at     52893
deleted_at    900223
program_id     52893
dtype: int64

In [15]:
df.cohort_id.min()

1.0

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900223 entries, 0 to 900222
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   path        900222 non-null  object 
 3   user_id     900223 non-null  int64  
 4   cohort_id   847330 non-null  float64
 5   ip          900223 non-null  object 
 6   id          847330 non-null  float64
 7   name        847330 non-null  object 
 8   slack       847330 non-null  object 
 9   start_date  847330 non-null  object 
 10  end_date    847330 non-null  object 
 11  created_at  847330 non-null  object 
 12  updated_at  847330 non-null  object 
 13  deleted_at  0 non-null       object 
 14  program_id  847330 non-null  float64
dtypes: float64(3), int64(1), object(11)
memory usage: 103.0+ MB


In [21]:
df = df.drop(columns=['deleted_at', 'updated_at', 'created_at', 'slack'])

In [22]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,2018-01-08,2018-05-17,2.0


In [23]:
df.isnull().sum()

date              0
time              0
path              1
user_id           0
cohort_id     52893
ip                0
id            52893
name          52893
start_date    52893
end_date      52893
program_id    52893
dtype: int64

In [24]:
df['cohort_id'] = df['cohort_id'].fillna(0.0)

In [25]:
df.isnull().sum()

date              0
time              0
path              1
user_id           0
cohort_id         0
ip                0
id            52893
name          52893
start_date    52893
end_date      52893
program_id    52893
dtype: int64

In [109]:
df.cohort_id.max()

139.0

## 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [37]:
df.path.value_counts()

/                                                               50313
search/search_index.json                                        19519
javascript-i                                                    18983
toc                                                             18297
java-iii                                                        13733
                                                                ...  
javascript/loops                                                    1
content/control-structures-ii                                       1
javascript-ii/promisesdfghjkjhgfs                                   1
app                                                                 1
appendix/professional-development/post-interview-review-form        1
Name: path, Length: 2313, dtype: int64

In [73]:
page_by_cohort = df.groupby(['cohort_id'])['path'].value_counts()

In [74]:
page_by_cohort = pd.DataFrame(page_by_cohort)

In [75]:
page_by_cohort

Unnamed: 0_level_0,Unnamed: 1_level_0,path
cohort_id,path,Unnamed: 2_level_1
0.0,/,4459
0.0,search/search_index.json,1985
0.0,javascript-i,780
0.0,toc,706
0.0,spring,641
...,...,...
139.0,java-iii/servlets,1
139.0,javascript-i/bom-and-dom/dom,1
139.0,javascript-i/objects,1
139.0,javascript-i/objects/math,1


In [77]:
page_by_cohort.columns=['path_value_count']

In [118]:
page_by_cohort = page_by_cohort.reset_index()

In [122]:
page_by_cohort = page_by_cohort[page_by_cohort['path'] != '/']

In [126]:
page_by_cohort

Unnamed: 0,cohort_id,path,path_value_count
1,0.0,search/search_index.json,1985
2,0.0,javascript-i,780
3,0.0,toc,706
4,0.0,spring,641
5,0.0,java-iii,567
...,...,...,...
14672,139.0,java-iii/servlets,1
14673,139.0,javascript-i/bom-and-dom/dom,1
14674,139.0,javascript-i/objects,1
14675,139.0,javascript-i/objects/math,1


In [123]:
max_page_by_cohort = page_by_cohort.groupby('cohort_id').max()

In [125]:
max_page_by_cohort

Unnamed: 0_level_0,path,path_value_count
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,web-design/ux/purpose,1985
1.0,uploads/5762c2946250b.jpg,294
2.0,toc,6
4.0,prework/versioning/github,1
6.0,spring/setup,10
7.0,toc,29
8.0,uploads/58a217a705bde.jpg,57
9.0,content/html-css/introduction.html,2
11.0,toc,12
12.0,toc,19


In [128]:
page_by_cohort = page_by_cohort.set_index('cohort_id')

In [129]:
max_page_by_cohort_df = pd.DataFrame()
for x in max_page_by_cohort.index:
    df1 = page_by_cohort.loc[x][page_by_cohort.loc[x]['path_value_count'] == (max_page_by_cohort.loc[x]['path_value_count'])]
    df1['cohort_id'] = x
    max_page_by_cohort_df = pd.concat([max_page_by_cohort_df, df1])
max_page_by_cohort_df

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,search/search_index.json,1985,0.0
1.0,javascript-i,294,1.0
2.0,content/php_ii/command-line,6,2.0
4.0,mkdocs/search_index.json,1,4.0
4.0,prework/databases,1,4.0
4.0,prework/versioning/github,1,4.0
6.0,javascript-ii/es6,10,6.0
7.0,content/html-css,29,7.0
8.0,java-iii,57,8.0
9.0,content/html-css,2,9.0


In [115]:
df[df['cohort_id'] == 137.0]

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date,program_id
847090,2021-03-15,15:22:48,/,896,137.0,162.205.226.39,137.0,Florence,2021-03-15,2021-09-03,3.0
847092,2021-03-15,15:23:17,/,897,137.0,72.181.115.96,137.0,Florence,2021-03-15,2021-09-03,3.0
847094,2021-03-15,15:23:33,/,898,137.0,208.84.155.68,137.0,Florence,2021-03-15,2021-09-03,3.0
847095,2021-03-15,15:23:48,/,899,137.0,173.173.110.122,137.0,Florence,2021-03-15,2021-09-03,3.0
847096,2021-03-15,15:24:05,fundamentals/intro-to-data-science,898,137.0,208.84.155.68,137.0,Florence,2021-03-15,2021-09-03,3.0
...,...,...,...,...,...,...,...,...,...,...,...
899841,2021-04-21,12:10:26,python/advanced-dataframes,898,137.0,99.147.232.101,137.0,Florence,2021-03-15,2021-09-03,3.0
899888,2021-04-21,12:47:06,/,903,137.0,70.117.8.141,137.0,Florence,2021-03-15,2021-09-03,3.0
899889,2021-04-21,12:47:09,fundamentals/git,903,137.0,70.117.8.141,137.0,Florence,2021-03-15,2021-09-03,3.0
899901,2021-04-21,12:55:54,python/dataframes,908,137.0,68.206.160.11,137.0,Florence,2021-03-15,2021-09-03,3.0


In [130]:
df[df['path'] == 'toc']

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date,program_id
85409,2018-07-13,09:13:05,toc,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
85413,2018-07-13,09:13:52,toc,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
85686,2018-07-13,14:53:38,toc,134,23.0,97.105.19.61,23.0,Ulysses,2018-03-05,2018-07-19,2.0
85722,2018-07-13,15:57:29,toc,203,24.0,97.105.19.61,24.0,Voyageurs,2018-05-29,2018-10-11,2.0
85746,2018-07-13,17:35:58,toc,1,8.0,72.177.226.58,8.0,Hampton,2015-09-22,2016-02-06,1.0
...,...,...,...,...,...,...,...,...,...,...,...
900041,2021-04-21,14:41:08,toc,939,138.0,174.197.2.198,138.0,Neptune,2021-03-15,2021-09-03,2.0
900043,2021-04-21,14:42:21,toc,495,51.0,72.191.50.129,51.0,Deimos,2019-09-16,2020-02-27,2.0
900076,2021-04-21,14:57:59,toc,863,135.0,173.174.149.192,135.0,Marco,2021-01-25,2021-07-19,2.0
900126,2021-04-21,15:41:55,toc,887,135.0,107.77.169.13,135.0,Marco,2021-01-25,2021-07-19,2.0


## 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?

In [131]:
min_page_by_cohort = page_by_cohort.groupby('cohort_id').min()

In [132]:
min_page_by_cohort

Unnamed: 0_level_0,path,path_value_count
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,00_,1
1.0,0_Classification_Algorithms,1
2.0,content/git,1
4.0,mkdocs/search_index.json,1
6.0,11._DistributedML,1
7.0,1-fundamentals/1.1-intro-to-data-science,1
8.0,0_Classification_Algorithms,1
9.0,content/html-css,1
11.0,appendix,1
12.0,appendix/git/intellij-intro,1


In [133]:
min_page_by_cohort_df = pd.DataFrame()
for x in min_page_by_cohort.index:
    df2 = page_by_cohort.loc[x][page_by_cohort.loc[x]['path_value_count'] == (min_page_by_cohort.loc[x]['path_value_count'])]
    df2['cohort_id'] = x
    min_page_by_cohort_df = pd.concat([min_page_by_cohort_df, df2])
min_page_by_cohort_df

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1-fundamentals/2.1-excel-overview,1,0.0
0.0,1-fundamentals/2.1-intro-to-excel,1,0.0
0.0,1-fundamentals/3-vocabulary,1,0.0
0.0,10-anomaly-detection/5-detecting-with-clustering,1,0.0
0.0,10-anomaly-detection/exercises,1,0.0
...,...,...,...
139.0,java-iii/servlets,1,139.0
139.0,javascript-i/bom-and-dom/dom,1,139.0
139.0,javascript-i/objects,1,139.0
139.0,javascript-i/objects/math,1,139.0


In [139]:
min_page_by_cohort_df.path.value_counts()

examples/postwork/reports.yml                                     9
content/php_iii/classes-and-objects-i                             9
content/javascript_ii/gitbook/images/favicon.ico                  9
content/php_ii/control-structures-i/gitbook/images/favicon.ico    8
appendix/git/merge-conflict-demo                                  8
                                                                 ..
app                                                               1
appendix/spring/seeder                                            1
css-ii/bootstrap-grid-system                                      1
css-ii/bootstrap-introduction                                     1
java-iii/jsp-and-jstl                                             1
Name: path, Length: 1626, dtype: int64

In [140]:
max_page_by_cohort_df.path.value_counts()

toc                                         10
javascript-i                                 9
search/search_index.json                     6
content/html-css                             3
spring                                       3
index.html                                   2
mkdocs/search_index.json                     2
javascript-ii/es6                            1
prework/versioning/github                    1
java-iii                                     1
html-css                                     1
prework/databases                            1
java-i                                       1
content/laravel/intro                        1
content/php_ii/command-line                  1
1-fundamentals/modern-data-scientist.jpg     1
6-regression/1-overview                      1
classification/overview                      1
classification/scale_features_or_not.svg     1
fundamentals/modern-data-scientist.jpg       1
javascript-i/introduction/operators          1
Name: path, d

In [149]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'toc']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15.0,toc,1,15.0


In [150]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'javascript-i']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,javascript-i,1,2.0
12.0,javascript-i,1,12.0


In [151]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'search/search_index.json']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11.0,search/search_index.json,1,11.0
15.0,search/search_index.json,1,15.0


In [152]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'content/html-css']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,content/html-css,1,2.0
8.0,content/html-css,1,8.0
26.0,content/html-css,1,26.0


In [153]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'spring']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
55.0,spring,1,55.0


In [154]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'index.html']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,index.html,1,1.0
8.0,index.html,1,8.0
15.0,index.html,1,15.0
25.0,index.html,1,25.0
33.0,index.html,1,33.0
34.0,index.html,1,34.0


In [155]:
min_page_by_cohort_df[min_page_by_cohort_df.path == 'mkdocs/search_index.json']

Unnamed: 0_level_0,path,path_value_count,cohort_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4.0,mkdocs/search_index.json,1,4.0
15.0,mkdocs/search_index.json,1,15.0
17.0,mkdocs/search_index.json,1,17.0
34.0,mkdocs/search_index.json,1,34.0


## 3. Which lessons are least accessed? 


In [163]:
df.head(1)

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0


In [264]:
least_lessons = df.root_path_2.value_counts().sort_values()

In [265]:
least_lessons = pd.DataFrame(least_lessons)

In [266]:
least_lessons

Unnamed: 0,root_path_2
javascript-i/dom,1
6-regression/3-univariate-regression-in-excel,1
6-regression/ordinary_least_squares.jpeg,1
6-regression/2-intro-to-regression,1
cohorts/%7Bid%7D,1
...,...
html-css/css-ii,21002
javascript-i/introduction,25163
html-css/css-i,32875
spring/fundamentals,36271


In [267]:
least_lessons[least_lessons['root_path_2'] == 1]

Unnamed: 0,root_path_2
javascript-i/dom,1
6-regression/3-univariate-regression-in-excel,1
6-regression/ordinary_least_squares.jpeg,1
6-regression/2-intro-to-regression,1
cohorts/%7Bid%7D,1
...,...
further-reading/html-css,1
bom-and-dom/dom,1
appendix/www.opensecrets.org,1
appendix/css-practice,1


In [179]:
df[df['cohort_id'] == 28.0]['path'].value_counts().sort_values()

appendix/professional-development/post-interview-review-form       1
content/php_iii/php-with-html                                      1
overview                                                           1
5.02_Prep                                                          1
content/php_iii/php-with-html/gitbook/images/favicon.ico           1
                                                                ... 
search/search_index.json                                        1349
java-iii                                                        1393
spring                                                          1403
javascript-i                                                    1817
/                                                               6340
Name: path, Length: 1404, dtype: int64

In [269]:
df_without_staff = df[df['cohort_id'] != 28.0]

In [270]:
least_lessons_without_staff = df_without_staff.root_path_2.value_counts().sort_values()

In [271]:
least_lessons_without_staff = pd.DataFrame(least_lessons_without_staff)

In [274]:
least_lessons_without_staff[least_lessons_without_staff['root_path_2'] > 25]

Unnamed: 0,root_path_2
advanced-topics/developing-data-products,27
appendix/git-teamwork,27
4-python/pandas-time-series,27
10-anomaly-detection/2-continuous-probabilistic-methods.ipynb.md,28
python/project,28
...,...
html-css/css-ii,20033
javascript-i/introduction,24179
html-css/css-i,30676
spring/fundamentals,32809


## 4. What topics are grads continuing to reference after graduation and into their jobs (for each program)?

In [185]:
df.dtypes

date           object
time           object
path           object
user_id         int64
cohort_id     float64
ip             object
id            float64
name           object
start_date     object
end_date       object
program_id    float64
dtype: object

In [186]:
df.date = pd.to_datetime(df.date)
df.start_date = pd.to_datetime(df.start_date)
df.end_date = pd.to_datetime(df.end_date)

In [187]:
df.dtypes

date          datetime64[ns]
time                  object
path                  object
user_id                int64
cohort_id            float64
ip                    object
id                   float64
name                  object
start_date    datetime64[ns]
end_date      datetime64[ns]
program_id           float64
dtype: object

In [189]:
df.shape

(900223, 11)

In [190]:
after_grad_df = df[df.end_date < df.date]

In [192]:
after_grad_df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0
30,2018-01-26,10:14:47,/,11,1.0,97.105.19.61,1.0,Arches,2014-02-04,2014-04-22,1.0


In [198]:
after_grad_df = after_grad_df[after_grad_df['path'] != '/']

In [199]:
most_lessons_after_grad = after_grad_df.path.value_counts().sort_values()

In [200]:
most_lessons_after_grad = pd.DataFrame(most_lessons_after_grad)

In [205]:
most_lessons_after_grad.nlargest(n=5, columns='path')

Unnamed: 0,path
javascript-i,4965
spring,4262
search/search_index.json,4174
html-css,3678
java-iii,3537


## 5. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [238]:
str_split = df.path.str.split('/', expand=True)

In [241]:
str_split = str_split.drop(columns=[2,3,4,5,6,7])


In [248]:
str_split = str_split.dropna(axis=0)

In [253]:
root = str_split[0] + '/' + str_split[1]

In [255]:
root = pd.DataFrame(root)

In [257]:
df = pd.merge(df, root, how='left', left_index=True, right_index=True)

In [261]:
df.rename(columns={0:'root_path_2'}, inplace=True)

In [262]:
df

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,start_date,end_date,program_id,root_path,root_path_2
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0,,/
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0,java-ii,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0,java-ii,java-ii/object-oriented-programming
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,2015-09-22,2016-02-06,1.0,slides,slides/object_oriented_programming
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,2018-01-08,2018-05-17,2.0,javascript-i,javascript-i/conditionals
...,...,...,...,...,...,...,...,...,...,...,...,...,...
900218,2021-04-21,16:41:51,jquery/personal-site,64,28.0,71.150.217.33,28.0,Staff,2014-02-04,2014-02-04,2.0,jquery,jquery/personal-site
900219,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,71.150.217.33,28.0,Staff,2014-02-04,2014-02-04,2.0,jquery,jquery/mapbox-api
900220,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,71.150.217.33,28.0,Staff,2014-02-04,2014-02-04,2.0,jquery,jquery/ajax
900221,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86,28.0,Staff,2014-02-04,2014-02-04,2.0,anomaly-detection,anomaly-detection/discrete-probabilistic-methods


In [263]:
df['root_path_2'].value_counts()

/                            50313
spring/fundamentals          36271
html-css/css-i               32875
javascript-i/introduction    25163
html-css/css-ii              21002
                             ...  
user/913                         1
6-regression/8-Project           1
ui/typography                    1
relationships/indexes            1
javascript-i/dom                 1
Name: root_path_2, Length: 1018, dtype: int64

In [275]:
df.date.min()

Timestamp('2018-01-26 00:00:00')

In [276]:
df.date.max()

Timestamp('2021-04-21 00:00:00')