In [1]:
import pandas as pd
import numpy as np
from env import get_db_url
import os

The txt file we received has no column headers, so we manually added a header row to the file, based on the columns of the SQL table 'logs' in 'curriculum_logs'.

In [2]:
# Load the raw access log; the file is parsed as single-space-delimited text.
df = pd.read_csv('anonymized-curriculum-access.txt', delimiter=' ')
df.info()  # column dtypes and non-null counts
df.head()  # last expression of the cell: preview of the first rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900223 entries, 0 to 900222
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date       900223 non-null  object 
 1   time       900223 non-null  object 
 2   path       900222 non-null  object 
 3   user_id    900223 non-null  int64  
 4   cohort_id  847330 non-null  float64
 5   ip         900223 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 41.2+ MB


Unnamed: 0,Date,time,path,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [3]:
def get_curriculum_data(filename='curriculum_access.csv'):
    '''
    Acquires the curriculum access logs, with cohort names, as a DataFrame.

    If `filename` already exists it is treated as a local CSV cache and read
    back (its first column is used as the index). Otherwise the SQL query
    below is run against the connection returned by
    get_db_url('curriculum_logs'), the result is written to `filename` so
    later calls can skip the query, and the queried frame is returned.

    Parameters
    ----------
    filename : str, default 'curriculum_access.csv'
        Path of the CSV cache to read from or, on a cache miss, write to.

    Returns
    -------
    pandas.DataFrame
        Columns date, time, path, user_id, ip and name. `name` is the cohort
        name; because of the LEFT JOIN it is null for log rows whose
        cohort_id has no match in `cohorts`.
    '''
    if os.path.isfile(filename):
        # Cache hit: no database round-trip needed.
        return pd.read_csv(filename, index_col=0)
    else:
        # Cache miss: LEFT JOIN keeps every log row, even those without a cohort.
        df = pd.read_sql(
            '''
            SELECT 
               logs.date,
               logs.time,
               logs.path,
               logs.user_id,
               logs.ip,
               cohorts.name
            FROM
                logs
            LEFT JOIN
                cohorts ON logs.cohort_id = cohorts.id;
            ''',
            get_db_url('curriculum_logs')
        )

        # Persist the result (index included) so the next call hits the cache;
        # the cached read above uses index_col=0 to restore that index.
        df.to_csv(filename)

        return df

In [4]:
# Pull the logs through the SQL-backed loader so they can be compared with the txt file.
df2 = get_curriculum_data()
df2

Unnamed: 0,date,time,path,user_id,ip,name
0,2018-01-26,09:55:03,/,1,97.105.19.61,Hampton
1,2018-01-26,09:56:02,java-ii,1,97.105.19.61,Hampton
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,97.105.19.61,Hampton
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,97.105.19.61,Hampton
4,2018-01-26,09:56:24,javascript-i/conditionals,2,97.105.19.61,Teddy
...,...,...,...,...,...,...
900218,2021-04-21,16:41:51,jquery/personal-site,64,71.150.217.33,Staff
900219,2021-04-21,16:42:02,jquery/mapbox-api,64,71.150.217.33,Staff
900220,2021-04-21,16:42:09,jquery/ajax/weather-map,64,71.150.217.33,Staff
900221,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,24.160.137.86,Staff


#### Check every row to ensure the txt file matches the result of the SQL query.

In [21]:
# Number of rows whose date differs between the txt file and the SQL result.
(df['Date'] != df2['date']).sum()

0

In [22]:
# Number of rows whose time differs between the txt file and the SQL result.
(df['time'] != df2['time']).sum()

0

In [23]:
# Number of rows whose path differs; a NaN on either side always counts as a difference.
(df['path'] != df2['path']).sum()

1

There is one path that appears to be different:

In [24]:
# Show the txt-file rows whose path does not compare equal to the SQL result.
df.loc[df['path'] != df2['path']]

Unnamed: 0,Date,time,path,user_id,cohort_id,ip
506305,2020-04-08,09:25:18,,586,55.0,72.177.240.51


In [28]:
# Row 506305 is the single path mismatch found above; show it from both frames side by side.
(df.iloc[506305], df2.iloc[506305])

(Date            2020-04-08
 time              09:25:18
 path                   NaN
 user_id                586
 cohort_id             55.0
 ip           72.177.240.51
 Name: 506305, dtype: object,
 date          2020-04-08
 time            09:25:18
 path                 NaN
 user_id              586
 ip         72.177.240.51
 name               Curie
 Name: 506305, dtype: object)

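*This entry has NaN for `path` in both dataframes; it is flagged only because NaN never compares equal to NaN, so it is not a real difference. The other fields the two rows share are identical.  Recommend dropping this row in cleanup.*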

In [29]:
# Number of rows whose user_id differs between the txt file and the SQL result.
(df['user_id'] != df2['user_id']).sum()

0

In [30]:
# Number of rows whose ip differs between the txt file and the SQL result.
(df['ip'] != df2['ip']).sum()

0

### Looks like the SQL query is good for acquire!

In [31]:
import acquire

In [32]:
# Rebinds `df` (until now the txt-file frame) to the frame returned by the acquire module's loader.
df = acquire.get_curriculum_data()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900223 entries, 0 to 900222
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   date     900223 non-null  object
 1   time     900223 non-null  object
 2   path     900222 non-null  object
 3   user_id  900223 non-null  int64 
 4   ip       900223 non-null  object
 5   name     847330 non-null  object
dtypes: int64(1), object(5)
memory usage: 41.2+ MB


Unnamed: 0,date,time,path,user_id,ip,name
0,2018-01-26,09:55:03,/,1,97.105.19.61,Hampton
1,2018-01-26,09:56:02,java-ii,1,97.105.19.61,Hampton
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,97.105.19.61,Hampton
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,97.105.19.61,Hampton
4,2018-01-26,09:56:24,javascript-i/conditionals,2,97.105.19.61,Teddy


In [49]:
# Rows with a null `name`; bracket access avoids relying on attribute-style column lookup.
df[df['name'].isna()]

Unnamed: 0,date,time,path,user_id,ip,name
411,2018-01-26,16:46:16,/,48,97.105.19.61,
412,2018-01-26,16:46:24,spring/extra-features/form-validation,48,97.105.19.61,
425,2018-01-26,17:54:24,/,48,97.105.19.61,
435,2018-01-26,18:32:03,/,48,97.105.19.61,
436,2018-01-26,18:32:17,mysql/relationships/joins,48,97.105.19.61,
...,...,...,...,...,...,...
899897,2021-04-21,12:49:00,javascript-ii,717,136.50.102.126,
899898,2021-04-21,12:49:02,javascript-ii/es6,717,136.50.102.126,
899899,2021-04-21,12:51:27,javascript-ii/map-filter-reduce,717,136.50.102.126,
899900,2021-04-21,12:52:37,javascript-ii/promises,717,136.50.102.126,


In [47]:
# Per-user count of log rows that have a null `name`.
(
    df[df['name'].isna()]
    .groupby('user_id')['user_id']
    .count()
)

user_id
48     1581
54      146
58      462
59       42
61      816
       ... 
744      69
782     370
810     709
814     187
815      46
Name: user_id, Length: 78, dtype: int64