In [1]:
#imports
import pandas as pd
import numpy as np
from env import get_db_url
import os
import acquire
from sklearn import metrics

In [2]:
#using acquire code and file to call in curriculum_log data:
df = acquire.get_curriculum_data()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900223 entries, 0 to 900222
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   path        900222 non-null  object 
 3   user_id     900223 non-null  int64  
 4   ip          900223 non-null  object 
 5   name        847330 non-null  object 
 6   start_date  847330 non-null  object 
 7   end_date    847330 non-null  object 
 8   program_id  847330 non-null  float64
dtypes: float64(1), int64(1), object(7)
memory usage: 68.7+ MB


Unnamed: 0,date,time,path,user_id,ip,name,start_date,end_date,program_id
0,2018-01-26,09:55:03,/,1,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
1,2018-01-26,09:56:02,java-ii,1,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,97.105.19.61,Teddy,2018-01-08,2018-05-17,2.0


In [3]:
df.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            900213, 900214, 900215, 900216, 900217, 900218, 900219, 900220,
            900221, 900222],
           dtype='int64', length=900223)

In [4]:
df.columns

Index(['date', 'time', 'path', 'user_id', 'ip', 'name', 'start_date',
       'end_date', 'program_id'],
      dtype='object')

In [5]:
#looking at the stats numbers for the dataset:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,900223.0,458.825707,249.296767,1.0,269.0,475.0,660.0,981.0
program_id,847330.0,2.086004,0.388231,1.0,2.0,2.0,2.0,4.0


### Take aways:
- `date`/ `time` / `start_date` and `end_date` needs to be run through `pd.to_datetime`
- `date` could also become new index
- it could be helpful to create a `program_name` column
- setting `ip` as float will allow boxplots and other charts for anomaly checks.

In [6]:
#passing df through pd.to_datetime for date and time format/dtype for all date/time columns
#df.date = pd.to_datetime(df.date)
#df.start_date = pd.to_datetime(df.start_date)
#df.end_date = pd.to_datetime(df.end_date)
#setting date as the index
#df = df.set_index('date')

In [8]:
def date_dtypes_index(df):
    #passing df through pd.to_datetime for date and time format/dtype for all date/time columns
    df.date = pd.to_datetime(df.date)
    df.start_date = pd.to_datetime(df.start_date)
    df = df[df.end_date <= pd.to_datetime("today")]
    df.time = df.dt.time
    #setting date as the index
    df = df.set_index('date')
    return df

In [9]:
df = date_dtypes_index(df)

TypeError: '<=' not supported between instances of 'str' and 'Timestamp'

In [None]:
#checking dtypes:
df.dtypes

In [None]:
#checking new index:
df.index

In [None]:
#adding in datetime columns to find stats values:
df.describe(datetime_is_numeric=True)

_______________________________________________

### Looking at nulls and anomalies:

In [None]:
#out of the 90,223 rows and 9 columns, what nulls are in the data?
df.isnull().sum()

#### Checking anomalies on ip addresses 

In [None]:
#looking at ip column for outliers:
df.ip.value_counts()

In [None]:
#defining a function to create a temp df for anomaly counts/frequencies:
def value_counts_and_frequencies(s: pd.Series, dropna=True) -> pd.DataFrame:
    return pd.merge(
    s.value_counts(dropna=False).rename('count'),
    s.value_counts(dropna=False, normalize=True).rename('proba'),
    left_index=True,
    right_index=True,
    )

ip_df = value_counts_and_frequencies(df.ip)
ip_df.head()

In [None]:
#looking at ip addresses that have the lowest frequencies occuring
df[df.ip == '172.58.109.140']

In [None]:
#looking at ip addresses that have the lowest frequencies occuring
df[df.ip == '99.203.213.59']

In [None]:
#looking at ip addresses that have the lowest frequencies occuring
df[df.ip == '72.181.124.244']

#### interesting find/note about some of the nulls:

In [None]:
#This last user_id had nulls. When following it, this shows that possible nulls might be when students are
#going through their prework phase at Codeup...good to know for another time...
df[df.user_id == 88]

In [None]:
#possible webscraper
df[df.user_id == 48]

In [None]:

df[df.name == 'Olympic']

#### Next steps: 
- Turning `ip` into int?? (UPDATE: not needed. can be explored w/o ...verified by Ryan O.
- Handling nulls

### What is needed?
- Most frequent lessons logged into (per program)
    - Lesson counts
    - Program id/name
- Most/least frequent lessons logged into (per cohort)
    - Lesson log counts
    - name (cohort)
- Active students that do not engage with curriculum?
    - user_id frequency counts
    - current dates of cohorts
    - ip addresses?
- Suspicious activity and any webscrapping happening?
    - time (looking at odd hours of day)
    - ip addresses and no cohorts attached
    - user_id?
- 2019 dataframe showing shutoff of cross program curriculum access
    - program id and path page counts (?)
    - date (only 2019)
- Topics (path) alumni are still using?
    - path frequencies
    - anything > end_date of cohorts
- Least accessed lessons (overall)
    - path frequencies
- Anything else??


## Looking at Nulls:

In [None]:
#checking nulls:
df.isnull().sum()

In [None]:
#finding any missing names of cohorts (any possible scrapers?)
df[df.isnull().any(axis=1)]

In [None]:
#looking through mid-sections of data
df[df.isnull().any(axis=1)][100:500]

In [None]:
#looking through mid-sections of data
df[df.isnull().any(axis=1)][505:800]

In [None]:
#who dis? 
df[df.user_id == ]

In [None]:
#who dis? (this is showing a staff being a staff prior to student? possible?)
df[df.user_id == 53]

In [None]:
#finding any missing names of cohorts (any possible scrapers?)
df[df.isnull().any(axis=1)].tail()

In [None]:
#who dis?
df[df.user_id == 717]

In [None]:
#possibly webscraper...pages are being accessed only one day by this unknown user every minute
df[df.ip == '136.50.102.126']

In [None]:
df.program_id.value_counts()

In [None]:
df[df.path == '/'].value_counts()

In [None]:
df2 = df.groupby(['user_id'])['name'].count()
df2.head()

In [None]:
df(df.program_id == 4)

In [None]:
df.path.value_counts()[55:100
                      ]