In [19]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import set_config
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error

In [44]:
np.random.seed(10)

# load data
url = 'https://drive.google.com/uc?export=download&id=14O91N5OlVkvdGxXNJUj5jIsV5RexhzbB'
sessions = pd.read_csv(url)
sessions_times = sessions.drop(['hashedEmail', 'original_start_time', 'original_end_time'], axis = 1)
sessions_times = sessions_times.dropna()

## wrangling
# datetime conversion
sessions_times['start_dt'] = pd.to_datetime(sessions_times['start_time'], dayfirst = True)
sessions_times['end_dt'] = pd.to_datetime(sessions_times['end_time'], dayfirst = True)

# calculating duration of sessions in minutes
sessions_times['s_duration'] = (sessions_times['end_dt'] - sessions_times['start_dt']).dt.total_seconds() / 60

# getting weekdays and hour data
sessions_times['s_weekday'] = sessions_times['start_dt'].dt.day_name()
sessions_times['s_weekday_num'] = sessions_times['start_dt'].dt.weekday
sessions_times['s_hour'] = sessions_times['start_dt'].dt.hour

# x and y
X = sessions_times[['s_weekday_num', 's_hour']]
y = sessions_times['s_duration']

# split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3)

# preprocessor and knn
preprocessor = make_column_transformer(
    (StandardScaler(), ['s_weekday_num', 's_hour']))
pipeline = make_pipeline(preprocessor, KNeighborsRegressor())

# 5-fold gridsearch
param_grid = {
    "kneighborsregressor__n_neighbors": range(1, 25, 2),
}
gridsearch = GridSearchCV(
    estimator = pipeline,
    param_grid = param_grid, 
    cv = 5, 
    scoring = 'neg_root_mean_squared_error'
)

# fit gridsearch
gridsearch.fit(X_train, y_train)

results = pd.DataFrame(gridsearch.cv_results_)
results['sem_test_score'] = results['std_test_score'] / 5**(1/2)
results = (
    results[[
        "param_kneighborsregressor__n_neighbors",
        "mean_test_score",
        "sem_test_score"
    ]]
    .rename(columns = {"param_kneighborsregressor__n_neighbors": "n_neighbors"})
)

results['mean_test_score'] = -results['mean_test_score']

gridsearch.best_params_

{'kneighborsregressor__n_neighbors': 21}

Good K

In [46]:
sessions_times['predicted'] = gridsearch.predict(
    sessions_times[['s_weekday_num', 's_hour']]
)

RMSPE = mean_squared_error(
    y_true = sessions_times['s_duration'],
    y_pred = sessions_times['predicted']
)**(1/2)
RMSPE

np.float64(53.82552063617244)

~53.8 minutes off

High variability because I'm looking at every weekday, lets try Saturday and Sunday only

In [49]:
np.random.seed(10)

mf_sessions = sessions_times[
    sessions_times['s_weekday_num'].isin([0, 1, 2, 3, 4])].dropna()  # sat = 5, sun = 6


X_mf = wkd_sessions[['s_weekday_num', 's_hour']]
y_mf = wkd_sessions['s_duration']

mf_X_train, mf_X_test, mf_y_train, mf_y_test = train_test_split(
    X_mf, y_mf, test_size = 0.3
)

wkd_preprocessor = make_column_transformer(
    (StandardScaler(), ['s_weekday_num', 's_hour']))
wkd_pipeline = make_pipeline(wkd_preprocessor, KNeighborsRegressor())

param_grid = {
    "kneighborsregressor__n_neighbors": range(1, 25, 2),
}
wkd_gridsearch = GridSearchCV(
    estimator = wkd_pipeline,
    param_grid = param_grid, 
    cv = 5, 
    scoring = 'neg_root_mean_squared_error',
    #n_jobs = -1
)

# fit gridsearch
wkd_gridsearch.fit(wkd_X_train, wkd_y_train)

wkd_results = pd.DataFrame(wkd_gridsearch.cv_results_)
wkd_results['sem_test_score'] = wkd_results['std_test_score'] / 5**(1/2)
wkd_results = (
    wkd_results[[
        "param_kneighborsregressor__n_neighbors",
        "mean_test_score",
        "sem_test_score"
    ]]
    .rename(columns = {"param_kneighborsregressor__n_neighbors": "n_neighbors"})
)

wkd_results['mean_test_score'] = -wkd_results['mean_test_score']

wkd_gridsearch.best_params_

{'kneighborsregressor__n_neighbors': 21}

In [48]:
wkd_sessions['predicted'] = wkd_gridsearch.predict(
    wkd_sessions[['s_weekday_num', 's_hour']]
)

RMSPE = mean_squared_error(
    y_true = wkd_sessions['s_duration'],
    y_pred = wkd_sessions['predicted']
)**(1/2)
RMSPE

np.float64(55.95489992996134)

~55.95 minutes off

Sessions on weekdays are much longer! So, I think this is much better.

Monday-Friday only

In [None]:
np.random.seed(10)

mf_sessions = sessions_times[
    sessions_times['s_weekday_num'].isin([0, 1, 2, 3, 4])].dropna()  # sat = 5, sun = 6


X_mf = wkd_sessions[['s_weekday_num', 's_hour']]
y_mf = wkd_sessions['s_duration']

mf_X_train, mf_X_test, mf_y_train, mf_y_test = train_test_split(
    X_mf, y_mf, test_size = 0.3
)

mf_preprocessor = make_column_transformer(
    (StandardScaler(), ['s_weekday_num', 's_hour']))
mf_pipeline = make_pipeline(mf_preprocessor, KNeighborsRegressor())

param_grid = {
    "kneighborsregressor__n_neighbors": range(1, 25, 2),
}
mf_gridsearch = GridSearchCV(
    estimator = mf_pipeline,
    param_grid = param_grid, 
    cv = 5, 
    scoring = 'neg_root_mean_squared_error',
    #n_jobs = -1
)

# fit gridsearch
mf_gridsearch.fit(mf_X_train, mf_y_train)

mf_results = pd.DataFrame(mf_gridsearch.cv_results_)
mf_results['sem_test_score'] = mf_results['std_test_score'] / 5**(1/2)
mf_results = (
    wkd_results[[
        "param_kneighborsregressor__n_neighbors",
        "mean_test_score",
        "sem_test_score"
    ]]
    .rename(columns = {"param_kneighborsregressor__n_neighbors": "n_neighbors"})
)

mf_results['mean_test_score'] = -mf_results['mean_test_score']

mf_gridsearch.best_params_