Question 3: We would like to know something about our populations of users, in particular, we would like to have a good model of whether or not a player will continue contributing given past participation. 

In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate



alt.data_transformers.enable('vegafusion')

set_config(transform_output="pandas")

In [2]:
players_messy=pd.read_csv('players.csv')
sessions_messy=pd.read_csv('sessions.csv')

In [3]:
sessions_messy[['start_date','start_time']]=sessions_messy['start_time'].str.split(' ', expand=True)
sessions_messy[['end_date','end_time']]=sessions_messy['end_time'].str.split(' ', expand=True)

In [4]:
sessions=sessions_messy.drop(columns=['original_start_time','original_end_time', 'start_date', 'end_date'])
sessions

Unnamed: 0,hashedEmail,start_time,end_time
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,18:12,18:24
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,23:33,23:46
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,17:34,17:57
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,03:22,03:58
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,16:01,16:12
...,...,...,...
1530,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,23:01,23:07
1531,7a4686586d290c67179275c7c3dfb4ea02f4d317d9ee0e...,04:08,04:19
1532,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,15:36,15:57
1533,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,06:15,06:22


In [39]:
sessions['start_time']=pd.to_datetime(sessions['start_time'], format='%H:%M')
sessions['end_time']=pd.to_datetime(sessions['end_time'], format='%H:%M')

#created new columns for start and end time but the data type is numerical
sessions['start_time_numerical']=sessions['start_time'].dt.hour+sessions['start_time'].dt.minute/60
sessions['end_time_numerical']=sessions['end_time'].dt.hour+sessions['end_time'].dt.minute/60

#got rid of 1900-01-01 numbers before the time
sessions['start_time']=sessions['start_time'].dt.strftime('%H:%M')
sessions['end_time']=sessions['end_time'].dt.strftime('%H:%M')

#rounded to 3 decimals
sessions['start_time_numerical']=sessions['start_time_numerical'].round(3)
sessions['end_time_numerical']=sessions['end_time_numerical'].round(3)
sessions['session_length']=sessions['session_length'].round(3)

#made a new column for the elapsed session time in hours
sessions['session_length']=(sessions['end_time']-sessions['start_time']).dt.total_seconds()/3600


sessions

Unnamed: 0,hashedEmail,start_time,end_time,session_length,start_time_numerical,end_time_numerical
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,18:12,18:24,0.200,18.200,18.400
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,23:33,23:46,0.217,23.550,23.767
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,17:34,17:57,0.383,17.567,17.950
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,03:22,03:58,0.600,3.367,3.967
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,16:01,16:12,0.183,16.017,16.200
...,...,...,...,...,...,...
1530,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,23:01,23:07,0.100,23.017,23.117
1531,7a4686586d290c67179275c7c3dfb4ea02f4d317d9ee0e...,04:08,04:19,0.183,4.133,4.317
1532,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,15:36,15:57,0.350,15.600,15.950
1533,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,06:15,06:22,0.117,6.250,6.367


In [41]:
sessions_group=sessions.groupby('hashedEmail').mean(numeric_only=True).reset_index()
sessions_group.round(2)

Unnamed: 0,hashedEmail,session_length,start_time_numerical,end_time_numerical
0,0088b5e134c3f0498a18c7ea6b8d77b4b0ff1636fc9335...,-11.12,11.78,0.67
1,060aca80f8cfbf1c91553a72f4d5ec8034764b05ab59fe...,0.50,4.47,4.97
2,0ce7bfa910d47fc91f21a7b3acd8f33bde6db57912ce02...,0.18,21.02,21.20
3,0d4d71be33e2bc7266ee4983002bd930f69d304288a866...,0.54,10.57,11.11
4,0d70dd9cac34d646c810b1846fe6a85b9e288a76f5dcab...,0.58,20.07,20.65
...,...,...,...,...
120,fc0224c81384770e93ca717f32713960144bf0b52ff676...,0.27,23.43,23.70
121,fcab03c6d3079521e7f9665caed0f31fe3dae6b5ccb86e...,1.33,22.43,23.77
122,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,-0.05,10.88,10.83
123,fe218a05c6c3fc6326f4f151e8cb75a2a9fa29e22b110d...,0.15,15.55,15.70


In [42]:
players_messy

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [43]:
players=players_messy.drop(columns=['individualId','organizationName'])
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21
...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17


In [44]:
players_sessions=sessions.merge(players, on='hashedEmail')
players_sessions

Unnamed: 0,hashedEmail,start_time,end_time,session_length,start_time_numerical,end_time_numerical,experience,subscribe,played_hours,name,gender,age
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,18:12,18:24,0.200,18.200,18.400,Regular,True,223.1,Hiroshi,Male,17
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,23:33,23:46,0.217,23.550,23.767,Amateur,True,53.9,Alex,Male,17
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,17:34,17:57,0.383,17.567,17.950,Amateur,True,150.0,Delara,Female,16
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,03:22,03:58,0.600,3.367,3.967,Regular,True,223.1,Hiroshi,Male,17
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,16:01,16:12,0.183,16.017,16.200,Amateur,True,53.9,Alex,Male,17
...,...,...,...,...,...,...,...,...,...,...,...,...
1530,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,23:01,23:07,0.100,23.017,23.117,Amateur,True,53.9,Alex,Male,17
1531,7a4686586d290c67179275c7c3dfb4ea02f4d317d9ee0e...,04:08,04:19,0.183,4.133,4.317,Veteran,True,1.6,Lane,Female,23
1532,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,15:36,15:57,0.350,15.600,15.950,Amateur,True,56.1,Dana,Male,23
1533,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,06:15,06:22,0.117,6.250,6.367,Amateur,True,56.1,Dana,Male,23


In [22]:
new_column_order = ['name','start_time','end_time','session_length','mean_session','played_hours','subscribe','experience','gender','age',]


players_sessions= players_sessions[new_column_order]
players_sessions

Unnamed: 0,name,start_time,end_time,session_length,mean_session,played_hours,subscribe,experience,gender,age
0,Hiroshi,18:12,18:24,0.200,-0.592672,223.1,True,Regular,Male,17
1,Alex,23:33,23:46,0.217,-0.592672,53.9,True,Amateur,Male,17
2,Delara,17:34,17:57,0.383,-0.592672,150.0,True,Amateur,Female,16
3,Hiroshi,03:22,03:58,0.600,-0.592672,223.1,True,Regular,Male,17
4,Alex,16:01,16:12,0.183,-0.592672,53.9,True,Amateur,Male,17
...,...,...,...,...,...,...,...,...,...,...
1530,Alex,23:01,23:07,0.100,-0.592672,53.9,True,Amateur,Male,17
1531,Lane,04:08,04:19,0.183,-0.592672,1.6,True,Veteran,Female,23
1532,Dana,15:36,15:57,0.350,-0.592672,56.1,True,Amateur,Male,23
1533,Dana,06:15,06:22,0.117,-0.592672,56.1,True,Amateur,Male,23


In [45]:
ps=sessions_group.merge(players, on='hashedEmail')
ps

Unnamed: 0,hashedEmail,session_length,start_time_numerical,end_time_numerical,experience,subscribe,played_hours,name,gender,age
0,0088b5e134c3f0498a18c7ea6b8d77b4b0ff1636fc9335...,-11.116500,11.783500,0.667000,Regular,True,1.5,Isaac,Male,20
1,060aca80f8cfbf1c91553a72f4d5ec8034764b05ab59fe...,0.500000,4.467000,4.967000,Pro,False,0.4,Lyra,Male,21
2,0ce7bfa910d47fc91f21a7b3acd8f33bde6db57912ce02...,0.183000,21.017000,21.200000,Beginner,True,0.1,Osiris,Male,17
3,0d4d71be33e2bc7266ee4983002bd930f69d304288a866...,0.535923,10.574231,11.110308,Regular,True,5.6,Winslow,Male,17
4,0d70dd9cac34d646c810b1846fe6a85b9e288a76f5dcab...,0.583000,20.066500,20.650000,Pro,True,1.0,Knox,Male,17
...,...,...,...,...,...,...,...,...,...,...
120,fc0224c81384770e93ca717f32713960144bf0b52ff676...,0.267000,23.433000,23.700000,Amateur,True,0.2,Gemna,Male,27
121,fcab03c6d3079521e7f9665caed0f31fe3dae6b5ccb86e...,1.333000,22.433000,23.767000,Pro,True,1.2,Sakura,Male,17
122,fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33...,-0.051881,10.881135,10.829255,Amateur,True,56.1,Dana,Male,23
123,fe218a05c6c3fc6326f4f151e8cb75a2a9fa29e22b110d...,0.150000,15.550000,15.700000,Amateur,True,0.1,Fatima,Male,17


NameError: name 'index' is not defined