In [1]:
import pandas as pd
import altair as alt
import numpy as np
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Switch Altair to the VegaFusion data transformer, which simplifies working
# with large datasets in Altair charts.
alt.data_transformers.enable('vegafusion')

# Make scikit-learn transformers return pandas DataFrames instead of NumPy
# arrays.
set_config(transform_output="pandas")

In [6]:
# Load the raw player table. The path is relative, so "players.csv" must sit
# in the notebook's working directory.
players = pd.read_csv("players.csv")
# Display the raw frame as the cell output.
# NOTE(review): the table includes per-player names and hashed e-mails;
# consider whether those should appear in shared/committed output.
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [11]:
# Build the modelling table from the raw `players` frame.
#
# Rows kept: gender is not "Prefer not to say" AND played_hours is not 0.0.
# Columns kept: the three candidate predictors plus the response.
#
# Both row conditions go into a single boolean mask so the selection is one
# `.loc` call, and `.copy()` makes the result an independent DataFrame. Later
# cells assign new values into its columns; without the explicit copy pandas
# treats the frame as a possible view of `players` and may warn
# (SettingWithCopyWarning) on those assignments.
model_columns = ["experience", "gender", "age", "played_hours"]
keep_rows = (players["gender"] != "Prefer not to say") & (
    players["played_hours"] != 0.0
)

players_filtered = players.loc[keep_rows, model_columns].copy()
players_filtered

Unnamed: 0,experience,gender,age,played_hours
0,Pro,Male,9,30.3
1,Veteran,Male,17,3.8
3,Amateur,Female,21,0.7
4,Regular,Male,21,0.1
8,Amateur,Male,17,0.1
...,...,...,...,...
185,Regular,Male,18,0.1
186,Veteran,Female,44,0.1
192,Veteran,Male,22,0.3
194,Amateur,Male,17,2.3


In [12]:
# Integer code assigned to each gender label.
# NOTE(review): these codes are arbitrary labels, not a ranking. Anything that
# treats the column as quantitative will read them as ordered and evenly
# spaced - confirm that is intended.
gender_codes = {
    "Male": 0,
    "Female": 1,
    "Agender": 2,
    "Other": 3,
    "Non-binary": 4,
    "Two-Spirited": 5,
}

# Look every value up in the table and keep anything that is not a key as it
# is (the same fall-through `Series.replace` gives). Using `map` with an
# explicit lookup avoids depending on `replace` implicitly downcasting the
# object column to integers, which newer pandas versions deprecate. Because
# unmapped values pass through, re-running this cell leaves the
# already-encoded column unchanged.
players_filtered["gender"] = players_filtered["gender"].map(
    lambda label: gender_codes.get(label, label)
)
players_filtered

  players_filtered["gender"] = players_filtered["gender"].replace({


Unnamed: 0,experience,gender,age,played_hours
0,Pro,0,9,30.3
1,Veteran,0,17,3.8
3,Amateur,1,21,0.7
4,Regular,0,21,0.1
8,Amateur,0,17,0.1
...,...,...,...,...
185,Regular,0,18,0.1
186,Veteran,1,44,0.1
192,Veteran,0,22,0.3
194,Amateur,0,17,2.3


In [14]:
# Integer code assigned to each experience label.
# NOTE(review): the codes do not follow an obvious skill progression
# (e.g. "Pro" is 0 and "Beginner" is 3). Anything that treats the column as
# quantitative will read them as ordered and evenly spaced - confirm that is
# intended.
experience_codes = {
    "Pro": 0,
    "Amateur": 1,
    "Regular": 2,
    "Beginner": 3,
    "Veteran": 4,
}

# Look every value up in the table and keep anything that is not a key as it
# is (the same fall-through `Series.replace` gives). Using `map` with an
# explicit lookup avoids depending on `replace` implicitly downcasting the
# object column to integers, which newer pandas versions deprecate. Because
# unmapped values pass through, re-running this cell leaves the
# already-encoded column unchanged.
players_filtered["experience"] = players_filtered["experience"].map(
    lambda label: experience_codes.get(label, label)
)
players_filtered

Unnamed: 0,experience,gender,age,played_hours
0,0,0,9,30.3
1,4,0,17,3.8
3,1,1,21,0.7
4,2,0,21,0.1
8,1,0,17,0.1
...,...,...,...,...
185,2,0,18,0.1
186,4,1,44,0.1
192,4,0,22,0.3
194,1,0,17,2.3


In [15]:
# Variables shown along both axes of the scatter-plot matrix.
columns_to_plot = ("experience", "age", "gender", "played_hours")

# Template for a single 200x200 scatter panel. The x field is filled in from
# the grid's "row" variable and the y field from its "column" variable; both
# are encoded as quantitative.
pair_panel = (
    alt.Chart(players_filtered)
    .mark_point()
    .encode(
        alt.X(alt.repeat("row"), type="quantitative"),
        alt.Y(alt.repeat("column"), type="quantitative"),
    )
    .properties(width=200, height=200)
)

# Repeat the template over every (row, column) pairing of the variables.
players_pairplot = pair_panel.repeat(
    row=columns_to_plot,
    column=columns_to_plot,
)
players_pairplot

In [None]:
# Hold out 25% of the filtered players for testing. The fixed random_state
# makes the split reproducible between runs.
players_training, players_testing = train_test_split(
    players_filtered, test_size=0.25, random_state=2024
)

# Predictors and response are named once and reused for both splits so the
# training and testing sets cannot drift apart. These are the columns that
# actually exist in `players_filtered`; the response is `played_hours`.
predictor_columns = ["experience", "gender", "age"]
target_column = "played_hours"

X_train_play = players_training[predictor_columns]
y_train_play = players_training[target_column]

X_test_play = players_testing[predictor_columns]
y_test_play = players_testing[target_column]