In [1]:
#imports libraries
import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
from datetime import timedelta
from datetime import datetime
import matplotlib.pyplot as plt

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#imports for acquire and prep
from env import get_db_url
import os
import acquire
import wrangle

## Team Anomaly Detection Project:

This repo holds my part of the exploration: first answering question #3, and then using our team's wrangle.py file to explore further for other anomalies and/or takeaways that might have been overlooked.

#### Pulling in the dataframe with the acquire file, and then making simple changes to help answer question 3.

In [2]:
# load the original access-log dataset through the team's acquire module (minimal cleaning at this stage)
df = acquire.get_access_data()

In [3]:
#calling in wrangle.py functions (minus outliers/imputers) to change dtypes and add/drop columns:
# NOTE(review): wrangle.add_columns is defined in the team's wrangle.py, not in this notebook;
# which columns it adds or drops cannot be confirmed from here.
df = wrangle.add_columns(df)


In [4]:
# Build one access timestamp from the separate date/time strings and convert the
# cohort start/end dates to datetimes.
df['accessed'] = pd.to_datetime(df['date'] + ' ' + df['time'])
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
df['hour'] = df['accessed'].dt.hour
# the raw string columns are no longer needed once `accessed` exists
df = df.drop(columns=['date', 'time'])
# Keep only rows whose cohort end_date is not in the future. The explicit .copy()
# makes the filtered result an independent frame, so the column assignments below
# do not write to a slice of the unfiltered data (SettingWithCopy).
df = df[df['end_date'] <= pd.to_datetime("today")].copy()
df['Date'] = df['accessed'].dt.date
df['Time'] = df['accessed'].dt.time
# use the access timestamp as the index
df = df.set_index('accessed')

In [5]:
# drop program 4.0 (the newest program): it only shows 5 students, which might be a data submission error
not_program_4 = df['program_id'] != 4.0
df = df[not_program_4]

In [7]:
df.head(2)

Unnamed: 0_level_0,path,user_id,ip,name,start_date,end_date,program_id,program_type,hour,Date,Time
accessed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-26 09:55:03,/,1,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0,Web Development,9,2018-01-26,09:55:03
2018-01-26 09:56:02,java-ii,1,97.105.19.61,Hampton,2015-09-22,2016-02-06,1.0,Web Development,9,2018-01-26,09:56:02


### 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
Active students that do not engage with curriculum?

user_id frequency counts
current dates of cohorts
ip addresses?

In [None]:
# `df.acquire` raised AttributeError: the frame has no `acquire` column or attribute.
# Show a structural overview of the frame (columns, dtypes, non-null counts) instead.
df.info()

In [None]:
df.head()

In [None]:
# Open questions: are user_ids being recycled?
# Was there another website for cohorts in 2014?
df.loc[df['user_id'] == 64]

In [None]:
df.index

In [None]:
df.columns

In [None]:
# summary statistics for the dataset, one row per column
df.describe().transpose()

### Take aways:
- `date` / `time` / `start_date` and `end_date` need to be run through `pd.to_datetime`
- `date` could also become new index
- it could be helpful to create a `program_name` column
- setting `ip` as float will allow boxplots and other charts for anomaly checks.

In [None]:
def date_dtypes_index(df):
    """Return a copy of ``df`` with datetime columns and an access-time index.

    The ``date`` and ``time`` string columns are joined into one ``access_date``
    timestamp, ``start_date`` and ``end_date`` are converted with
    ``pd.to_datetime``, rows whose ``end_date`` is later than the current
    date/time are removed, and ``access_date`` becomes the index.

    The input frame is left untouched (the previous version added and rewrote
    columns on the caller's frame before returning a filtered copy).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``time``, ``start_date`` and ``end_date``.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``access_date``.

    Raises
    ------
    KeyError
        If the ``date`` or ``time`` column is missing.
    """
    absent = [col for col in ('date', 'time') if col not in df.columns]
    if absent:
        raise KeyError(f"date_dtypes_index needs the raw column(s) {absent}")

    # assign() builds a new frame: existing columns keep their position,
    # access_date is appended last, and the caller's frame is not modified
    converted = df.assign(
        access_date=pd.to_datetime(df['date'] + ' ' + df['time']),
        start_date=pd.to_datetime(df['start_date']),
        end_date=pd.to_datetime(df['end_date']),
    )
    # keep only rows whose end_date is not in the future
    ended = converted[converted['end_date'] <= pd.to_datetime("today")]
    #setting date as the index
    return ended.set_index('access_date')

In [None]:
# NOTE(review): date_dtypes_index reads df['date'] and df['time'], but the earlier prep cell
# removes them with df.drop(columns=['date','time']). On a fresh top-to-bottom run this call
# would raise KeyError unless df is re-acquired first — confirm the intended cell order.
df = date_dtypes_index(df)
df.head()

In [None]:
#checking dtypes:
df.dtypes

In [None]:
#checking new index:
df.index

In [None]:
#adding in datetime columns to find stats values:
# NOTE(review): the `datetime_is_numeric` keyword was added in pandas 1.1 and removed in
# pandas 2.0 — confirm the pandas version this notebook is pinned to.
df.describe(datetime_is_numeric=True)

In [None]:
## non curriculum pages (Ray's codes)
def drop_noncurriculum(df):
    """Return the rows of ``df`` whose ``path`` is not a non-curriculum page.

    A row is removed when its path is exactly '/' or 'toc', or when the path
    contains any of 'jpeg', 'json', 'jpg', 'appendix' or 'Appendix'.
    Rows with a missing path are kept.
    """
    paths = df.path
    # exact matches: landing page and table of contents
    keep = ~paths.isin(['/', 'toc'])
    # substring matches; `!= True` keeps rows where contains() yields NaN (missing path)
    for fragment in ('jpeg', 'json', 'jpg', 'appendix', 'Appendix'):
        keep &= (paths.str.contains(fragment) != True)
    return df[keep]

In [None]:
df = drop_noncurriculum(df)
df

In [None]:
def drop_other(df):
    """Return ``df`` without program 4.0 rows and without rows whose name is 'Staff'."""
    not_program_4 = df.program_id != 4.0
    not_staff = df.name != 'Staff'
    return df[not_program_4 & not_staff]

In [None]:
df = drop_other(df)
df.head()

In [None]:
# replace the numeric program ids with their program name, in place in `program_id`
# (ids 1.0 and 2.0 both map to Web Development)
program_names = {
    1.0: 'Web Development',
    2.0: 'Web Development',
    3.0: 'Data Science',
}
for program_code, program_name in program_names.items():
    df.loc[df['program_id'] == program_code, 'program_id'] = program_name

_______________________________________________

### Looking at nulls and anomalies:

In [None]:
# out of the 90,223 rows and 9 columns: how many nulls does each column hold?
df.isna().sum()

#### Checking anomalies on ip addresses 

In [None]:
# frequency of each ip address, to spot outliers
df['ip'].value_counts()

In [None]:
#defining a function to create a temp df for anomaly counts/frequencies:
def value_counts_and_frequencies(s: pd.Series, dropna=False) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``s`` occurs.

    Parameters
    ----------
    s : pd.Series
        Values to tabulate.
    dropna : bool, default False
        When False, missing values get their own row and count toward the
        total. When True, they are excluded. The argument used to be ignored
        (missing values were always included); the default is False so calls
        that rely on the default return exactly what they did before.

    Returns
    -------
    pd.DataFrame
        Indexed by distinct value, most frequent first, with columns
        ``count`` (occurrences) and ``proba`` (share of the counted rows).
    """
    counts = s.value_counts(dropna=dropna)
    table = counts.to_frame(name='count')
    # same arithmetic as value_counts(normalize=True): each count over the total counted
    table['proba'] = counts / counts.sum()
    return table

# count/probability table for the ip column; preview the first rows
ip_df = value_counts_and_frequencies(df['ip'])
ip_df.head()

#### interesting find/note about some of the nulls:

In [None]:
# user 48: possible web scraper
df.loc[df['user_id'] == 48]

#### Next steps: 
- Turning `ip` into int?? (UPDATE: not needed; it can be explored without conversion — verified by Ryan O.)
- Handling nulls

### What is needed?
- Most frequent lessons logged into (per program)
    - Lesson counts
    - Program id/name
- Most/least frequent lessons logged into (per cohort)
    - Lesson log counts
    - name (cohort)
- Active students that do not engage with curriculum?
    - user_id frequency counts
    - current dates of cohorts
    - ip addresses?
- Suspicious activity and any web scraping happening?
    - time (looking at odd hours of day)
    - ip addresses and no cohorts attached
    - user_id?
- 2019 dataframe showing shutoff of cross program curriculum access
    - program id and path page counts (?)
    - date (only 2019)
- Topics (path) alumni are still using?
    - path frequencies
    - anything > end_date of cohorts
- Least accessed lessons (overall)
    - path frequencies
- Anything else??


## Using OG dataset for student/user_id outliers:

#### User_id examples of users that went from student to staff

In [None]:
# who is user 53?
df.loc[df['user_id'] == 53]

In [None]:
# User 88 had nulls. Following this user suggests the nulls may come from students
# going through their prework phase at Codeup — worth remembering for later.
df.loc[df['user_id'] == 88]

In [None]:
df.program_id.value_counts()

In [None]:
# log counts per user id (number of non-null `name` entries in each user's rows)
df2 = df.groupby('user_id')['name'].count()
df2

In [None]:
# path frequencies ranked 56th through 100th (positions 55..99)
df['path'].value_counts().iloc[55:100]

## 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
Active students that do not engage with curriculum? 
- user_id frequency counts
- current dates of cohorts
- ip addresses?


In [None]:
# number of logged requests per user_id (first step toward finding users with fewer than 5)
df3 = df['user_id'].value_counts()
df3

In [None]:
# One boolean per user_id: True when the user has fewer than 5 logged requests.
# The column is named explicitly: from pandas 2.0 on, Series.value_counts() names its
# result 'count' rather than 'user_id', which would break the `df3.user_id` lookup
# in the following cell.
df3 = (df['user_id'].value_counts() < 5).rename('user_id').to_frame()
df3

In [None]:
# every user_id that accessed the curriculum 4 times or fewer
df3[df3['user_id'].eq(True)]

Looking at each of these students to learn more:

In [None]:
# user 95: possibly a web scraper? start date and access dates do not make sense
df.loc[df['user_id'] == 95]

In [None]:
# user 679: maybe they quit?
df.loc[df['user_id'] == 679]

In [None]:
# user 71: access dates and start_date do not coincide
df.loc[df['user_id'] == 71]

In [None]:
# user 246: looks like a web scraper | odd sign-in times, and access/start dates do not match
df.loc[df['user_id'] == 246]

In [None]:
# user 399: the paths and quick scan times look like a web scraper
df.loc[df['user_id'] == 399]

In [None]:
# user 956: looks like a student who may have dropped out
df.loc[df['user_id'] == 956]

In [None]:
# user 539: a student who dropped?
df.loc[df['user_id'] == 539]

In [None]:
# user 216: possible web scraper?
df.loc[df['user_id'] == 216]

In [None]:
# user 177: the paths suggest web scraping (images and favicon only)
df.loc[df['user_id'] == 177]

In [None]:
# user 81: the paths suggest web scraping (html and css only)
df.loc[df['user_id'] == 81]

In [None]:
# user 169: looks like a web scraper
df.loc[df['user_id'] == 169]

In [None]:
# user 85: looks like a possible past student who dropped out or never started
df.loc[df['user_id'] == 85]

In [None]:
# user 832: a student who didn't interact with the curriculum?
df.loc[df['user_id'] == 832]

In [None]:
# user 97: web scraper
df.loc[df['user_id'] == 97]

In [None]:
# user 857: probably a student who dropped out
df.loc[df['user_id'] == 857]

In [None]:
# user 348: probably a web scraper
df.loc[df['user_id'] == 348]

In [None]:
# user 212: probably a web scraper, or maybe a past student? (is there much info from 2014?)
df.loc[df['user_id'] == 212]

In [None]:
# user 593: probably a web scraper
df.loc[df['user_id'] == 593]

In [None]:
# user 165: probably a web scraper
df.loc[df['user_id'] == 165]

#### Creating objects based on exploration of these students:

In [None]:
# known dropped students, stored as a tuple holding one DataFrame per user_id
# NOTE(review): this is a tuple of frames, not a single frame; pd.concat would be
# needed if one combined table is wanted.
df_drop_students = tuple(df[df.user_id == uid] for uid in (85, 956))

In [None]:
#info together:
df_drop_students

In [None]:
# known web scrapers, stored as a tuple holding one DataFrame per user_id
# NOTE(review): this is a tuple of frames, not a single frame.
web_scrapes = tuple(df[df.user_id == uid] for uid in (177, 399, 246))

In [None]:
# possible web scrapers, stored as a tuple holding one DataFrame per user_id (same order as before)
# NOTE(review): this is a tuple of frames, not a single frame.
possible_scraper_ids = (165, 593, 212, 348, 857, 97, 832, 85, 169, 81, 216)
poss_web_scrapes = tuple(df[df.user_id == uid] for uid in possible_scraper_ids)

In [None]:
# users who went from student to staff, stored as a tuple holding one DataFrame per user_id
# NOTE(review): this is a tuple of frames, not a single frame.
student_staff = tuple(df[df.user_id == uid] for uid in (539, 64))

In [None]:
#reminding myself of column names:
df.columns

In [None]:
# histogram of how the logged rows are distributed over user_id values (25 bins)
df['user_id'].hist(bins=25, grid=True, ec='black', figsize=(15, 10));

In [None]:
#sns is having issues with index as it is showing duplicates:
df.index.is_unique

In [None]:
#dropping index duplicates
# Keeps only the first row for each repeated index label; later rows sharing a label are discarded.
# NOTE(review): if the index is the access timestamp, separate log entries recorded in the same
# second are lost here, which lowers the per-user access counts computed afterwards — confirm
# this is acceptable, or give seaborn a reset index instead.
df=df.loc[~df.index.duplicated(), :]

In [None]:
# split the log into two frames: Data Science rows (ds) and everything else (wd)
is_data_science = df.program_id == 'Data Science'
wd = df[~is_data_science]
ds = df[is_data_science]

### Active Students with low curriculum access during each program:

In [None]:
# Ray's filter: keep only the rows logged while the student's program was running
def _rows_during_program(program_df):
    """Rows whose index timestamp lies between that row's start_date and end_date (inclusive)."""
    started = program_df.index >= program_df.start_date
    not_ended = program_df.index <= program_df.end_date
    return program_df.loc[started & not_ended]

active_wd = _rows_during_program(wd)
active_ds = _rows_during_program(ds)

#### WEB DEV:

In [None]:
# the 20 active web dev users with the fewest logged rows, lowest first
wd_rows_per_user = active_wd.groupby('user_id').size()
hardly_access_wd = wd_rows_per_user.sort_values().head(20)
hardly_access_wd

In [None]:
# bar chart of the 16 active web dev users with the fewest logged rows
user_id_count = active_wd.groupby('user_id').size().sort_values()
user_id_count = user_id_count.iloc[:16]
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally (the first positional slot is `data`)
sns.barplot(x=user_id_count.index, y=user_id_count.values, alpha=0.8)
plt.title ('Hardly Accessed Web Dev Curriculum')
plt.ylabel('Number of Occurences')
plt.xlabel('User Id', fontsize=12)
plt.show()


#### Data Science:

In [None]:
# the five active data science users with the fewest logged rows
active_ds.groupby('user_id').size().sort_values().iloc[:5]

In [None]:
# the 3 active data science users with the fewest logged rows, lowest first
ds_rows_per_user = active_ds.groupby('user_id').size()
hardly_access_ds = ds_rows_per_user.sort_values().head(3)
hardly_access_ds

In [None]:
# bar chart of the 3 data science users with the fewest access logs (under 114 each)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally (the first positional slot is `data`)
sns.barplot(x=hardly_access_ds.index, y=hardly_access_ds.values, alpha=0.8)
plt.title ('Hardly Accessed Data Science Curriculum')
plt.ylabel('Number of Occurences')
plt.xlabel('User Id', fontsize=12)
plt.show()


#### Can we look at users' access log times? (help determine any web scrapers)

In [None]:
# Add the hour of day of each access. The hour is read from the datetime index
# (built from date + time) instead of the `time` column, because the earlier prep
# cell drops `time`; this also avoids re-parsing time-only strings.
df['hour'] = df.index.hour
df.head(1)

In [None]:
#showing hours these hardly accessed students logged in at:


In [None]:
# user_id distribution coloured by program: do the programs occupy distinct user_id ranges?
sns.histplot(data=df, x='user_id', hue='program_id', bins=30)

#### Question: Are there overlaps or recycling happening with user_id?

In [None]:
# same user_id distribution with finer bins and stacked bars, to expose overlaps between programs
sns.histplot(data=df, x='user_id', hue='program_id', multiple="stack", bins=60)

In [None]:
#user example of student then staff member
#df[df.user_id ==539]

In [None]:
# run the team's full wrangle pipeline; it returns six dataframes built from the
# curriculum log, unpacked here in the order returned (this rebinds `df`)
(
    df,
    df_staff,
    df_multicohort,
    df_unimputed,
    df_non_curriculum,
    df_outliers,
) = wrangle.full_wrangle()