# Time Series Project

##### Luke Becker, Data Scientist

In [1]:
# Importing libraries and functions for use.
# from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

# splitting data:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from scipy.stats import entropy

# web-based requests
import requests
import rapid_env

# Importing the os library specifically for reading the csv once I've created the file in my working directory.
import os

import acquire
import prepare
import env
import rapid_env


# Register pandas' datetime converters with matplotlib. Without this, plotting
# Timestamp values can raise:
# "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

Imported env successfully.
credentials loaded successfully
End of acquire.py file.
Loaded all prepare functions.


# Plan

The data source is Codeup's curriculum access logs from January 2018 through early November 2020. These logs track the incoming IP addresses that access the website's curriculum and possibly other areas of the website.

#### Objectives:
1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
2. Is there a cohort that referred to a lesson significantly more often, while other cohorts seemed to gloss over it? 
3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students? 
4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses? Any odd user-agents? 
5. At some point in the last year, ability for students and alumni to cross-access curriculum (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before? 
6. What topics are grads continuing to reference after graduation and into their jobs (for each program)? 
7. Which lessons are least accessed? 
8. Anything else I should be aware of? 


### Ideas:
- Try creating continuous variables from my categorical variables.
- Filter the data down to a specific cohort and look at it that way to hopefully find something useful.
- Use as my attribute a single user id; loop through each user id. Then, *by user* find anomalies with regard to that *individual user*, not to the patterns across *all* users.
- Use this code to help create a continuous variable from categorical variables: `groupby().size() = continuous variable` (the size of each group is a numeric count).
- *Nice to have:* to help me visualize, install an ip address library to map out where each ip address is.

In [3]:
# Labels for the six fields of the header-less access log.
colnames = ['date', 'timestamp', 'web_page', 'user_id', 'cohort_id', 'ip']

# The separator is a regex: split on whitespace that is neither inside a
# double-quoted string nor inside square brackets. Regex separators like this
# one require the python parsing engine. A quoted dash ("-") marks a missing value.
df = pd.read_csv(
    'anonymized-curriculum-access.txt',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    header=None,
    names=colnames,
    usecols=[0, 1, 2, 3, 4, 5],
    index_col=False,
    na_values='"-"',
)
df

Unnamed: 0,date,timestamp,web_page,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61
...,...,...,...,...,...,...
719454,2020-11-02,16:48:13,javascript-i/coffee-project,763,62.0,107.192.148.199
719455,2020-11-02,16:48:17,javascript-i/mapbox-api,771,62.0,172.125.226.175
719456,2020-11-02,16:48:18,javascript-i/coffee-project,771,62.0,172.125.226.175
719457,2020-11-02,16:48:28,javascript-i/bom-and-dom/bom,771,62.0,172.125.226.175


In [4]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719459 entries, 0 to 719458
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   date       719459 non-null  object 
 1   timestamp  719459 non-null  object 
 2   web_page   719458 non-null  object 
 3   user_id    719459 non-null  int64  
 4   cohort_id  674619 non-null  float64
 5   ip         719459 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 32.9+ MB


In [None]:
# Join the separate date and time-of-day strings with a space and parse the
# result, so that `date_time` holds a single datetime-typed column.
df['date_time'] = pd.to_datetime(df['date'] + " " + df["timestamp"])
df.i