In [2]:
import pandas as pd
import numpy as np


# Load the raw tables; both paths are relative to the notebook's working directory.
players = pd.read_csv("players.csv")
sessions = pd.read_csv("sessions.csv")

# Timestamps are parsed with an explicit day/month/year format so that day and
# month can never be swapped by format inference.
sessions["start_time"] = pd.to_datetime(sessions["start_time"], format="%d/%m/%Y %H:%M")
sessions["end_time"] = pd.to_datetime(sessions["end_time"], format="%d/%m/%Y %H:%M")

# Session length in minutes. A missing start or end time propagates to NaN here.
sessions["duration_min"] = (sessions["end_time"] - sessions["start_time"]).dt.total_seconds() / 60

# Calendar features derived from the moment the session started.
sessions["login_hour"] = sessions["start_time"].dt.hour
sessions["day_of_week"] = sessions["start_time"].dt.day_name()

# Player-level attributes to attach to every session.
player_cols = ["hashedEmail", "experience", "age", "gender", "played_hours"]
players_small = players[player_cols]

# Left merge keeps every session even when the player is unknown.
# validate="many_to_one" makes pandas raise a MergeError if a hashedEmail occurs
# more than once in players, instead of silently duplicating session rows.
sessions = sessions.merge(
    players_small,
    on="hashedEmail",
    how="left",
    validate="many_to_one",
)

sessions[["start_time", "end_time", "duration_min", "login_hour", "day_of_week",
          "experience", "age", "gender", "played_hours"]].head()



FileNotFoundError: [Errno 2] No such file or directory: 'players.csv'

In [7]:
# Summary statistics (count, mean, spread, quartiles) of session length in minutes.
sessions.loc[:, "duration_min"].describe()


count    1533.000000
mean       50.858447
std        55.573572
min         3.000000
25%         9.000000
50%        30.000000
75%        73.000000
max       259.000000
Name: duration_min, dtype: float64

In [8]:
# Number of missing values in each column used by the models below.
sessions.loc[:, ["duration_min", "login_hour", "day_of_week"]].isnull().sum()



duration_min    2
login_hour      0
day_of_week     0
dtype: int64

In [9]:

# Longest session (in minutes) kept for analysis; anything above is dropped.
MAX_DURATION_MIN = 180

# Keep sessions with a known duration at or below the cap. NaN <= cap evaluates
# to False, so rows with a missing duration are removed as well; notna() states
# that intent explicitly. .copy() gives an independent frame so later column
# assignments cannot trigger SettingWithCopyWarning.
has_duration = sessions["duration_min"].notna()
within_cap = sessions["duration_min"] <= MAX_DURATION_MIN
sessions = sessions[has_duration & within_cap].copy()


In [10]:

# Predictors: login hour plus one-hot encoded weekday. drop_first=True removes
# the first weekday level so it acts as the baseline category.
X = pd.get_dummies(
    sessions[["login_hour", "day_of_week"]],
    columns=["day_of_week"],
    drop_first=True,
)

# Target: session length in minutes.
y = sessions["duration_min"]

X.head()


Unnamed: 0,login_hour,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,18,False,False,True,False,False,False
1,23,True,False,False,False,False,False
2,17,False,False,False,True,False,False
3,3,False,False,False,True,False,False
4,16,False,True,False,False,False,False


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sessions for evaluation; the fixed seed keeps the split
# reproducible between runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)

(X_train.shape, X_test.shape)


((1173, 7), (294, 7))

In [12]:
from sklearn.linear_model import LinearRegression


# Ordinary least-squares fit on the training split (fit() returns the estimator).
lin_reg = LinearRegression().fit(X_train, y_train)

print("Intercept:", lin_reg.intercept_)

# One row per predictor with its fitted coefficient.
coef_table = pd.DataFrame(
    {"feature": X_train.columns, "coefficient": lin_reg.coef_}
)
coef_table


Intercept: 50.80487719392369


Unnamed: 0,feature,coefficient
0,login_hour,-0.486191
1,day_of_week_Monday,-9.057287
2,day_of_week_Saturday,3.903272
3,day_of_week_Sunday,5.9561
4,day_of_week_Thursday,-8.853136
5,day_of_week_Tuesday,-4.899933
6,day_of_week_Wednesday,-8.121008


In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


# Score the linear model on the held-out sessions.
y_pred = lin_reg.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error, so it is in minutes too.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE : {mae:.2f} minutes")
print(f"RMSE: {rmse:.2f} minutes")
print(f"R^2 : {r2:.3f}")


MAE : 32.80 minutes
RMSE: 40.56 minutes
R^2 : 0.036


In [14]:
import altair as alt

# Pair each held-out session's true duration with the linear model's prediction.
results_df = pd.DataFrame({"actual": y_test, "predicted": y_pred})

# Scatter of actual vs predicted duration; semi-transparent points reveal overlap.
chart_pred = (
    alt.Chart(results_df)
    .properties(title="Actual vs Predicted Session Duration")
    .mark_circle(opacity=0.4)
    .encode(
        x=alt.X("actual:Q", title="Actual Session Duration (min)"),
        y=alt.Y("predicted:Q", title="Predicted Session Duration (min)"),
    )
)

chart_pred


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline


# Rebuild predictors and target for the KNN model: login hour plus one-hot
# weekday (first level dropped), predicting session length in minutes.
X = pd.get_dummies(
    sessions[["login_hour", "day_of_week"]],
    columns=["day_of_week"],
    drop_first=True,
)
y = sessions["duration_min"]

# Same 80/20 split and seed as used for the linear model.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)


In [16]:

# Standardise the features before KNN so that distances are not dominated by
# the column with the largest numeric range; 5 neighbours are averaged.
knn_model = make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=5))

knn_model.fit(X_train, y_train)


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the KNN pipeline on the held-out sessions.
y_pred_knn = knn_model.predict(X_test)

r2_knn = r2_score(y_test, y_pred_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
# RMSE is the square root of the mean squared error, so it is in minutes too.
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

print(f"KNN MAE  : {mae_knn:.2f} minutes")
print(f"KNN RMSE : {rmse_knn:.2f} minutes")
print(f"KNN R^2  : {r2_knn:.3f}")


KNN MAE  : 35.81 minutes
KNN RMSE : 46.04 minutes
KNN R^2  : -0.242


In [18]:
import altair as alt
# Pair each held-out session's true duration with the KNN prediction.
results_knn = pd.DataFrame({"actual": y_test, "predicted": y_pred_knn})

# Scatter of actual vs KNN-predicted duration; semi-transparent points reveal overlap.
chart_knn = (
    alt.Chart(results_knn)
    .properties(title="Actual vs Predicted Session Duration (KNN)")
    .mark_circle(opacity=0.4)
    .encode(
        x=alt.X("actual:Q", title="Actual Session Duration (min)"),
        y=alt.Y("predicted:Q", title="KNN Predicted Session Duration (min)"),
    )
)

chart_knn


In [19]:
import scipy.stats as stats


# Split durations by login time: before noon vs noon and later.
before_noon = sessions["login_hour"] < 12
noon_or_later = sessions["login_hour"] >= 12
morning = sessions.loc[before_noon, "duration_min"]
evening = sessions.loc[noon_or_later, "duration_min"]

# Welch's t-test (equal_var=False) does not assume the two groups share a variance.
t_stat, p_value = stats.ttest_ind(morning, evening, equal_var=False)

print("T-statistic:", t_stat)
print("p-value:", p_value)


T-statistic: 0.9990156543996858
p-value: 0.31796942588141497


In [20]:
import scipy.stats as stats


# Weekdays in calendar order; each one with data becomes one ANOVA group.
day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Group once instead of scanning the frame once per weekday.
duration_by_day = sessions.groupby("day_of_week")["duration_min"]

# Skip weekdays with no sessions: f_oneway returns NaN (with a warning) when
# any sample passed to it is empty, which would hide the result for the
# remaining days.
groups = [
    duration_by_day.get_group(day)
    for day in day_names
    if day in duration_by_day.groups
]

# One-way ANOVA: do mean session durations differ between weekdays?
f_stat, p_value_anova = stats.f_oneway(*groups)

print("ANOVA F-statistic:", f_stat)
print("ANOVA p-value:", p_value_anova)


ANOVA F-statistic: 6.431540146839105
ANOVA p-value: 1.0546864023236378e-06


In [21]:
import altair as alt

# Box plot of session length per weekday, with weekdays in calendar order
# rather than the default alphabetical order.
chart_anova = (
    alt.Chart(sessions)
    .properties(title="Session Duration by Day of Week")
    .mark_boxplot()
    .encode(
        x=alt.X(
            "day_of_week:N",
            title="Day of Week",
            sort=["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
        ),
        y=alt.Y("duration_min:Q", title="Session Duration (min)"),
    )
)

chart_anova
