In [1]:
import pandas as pd

In [2]:
import altair as alt
import numpy as np
np.random.seed(4)

In [3]:
players=pd.read_csv("players.csv")
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [4]:
dropped_players=players.drop(columns=["individualId","organizationName","hashedEmail"])
dropped_players

Unnamed: 0,experience,subscribe,played_hours,name,gender,age
0,Pro,True,30.3,Morgan,Male,9
1,Veteran,True,3.8,Christian,Male,17
2,Veteran,False,0.0,Blake,Male,17
3,Amateur,True,0.7,Flora,Female,21
4,Regular,True,0.1,Kylie,Male,21
...,...,...,...,...,...,...
191,Amateur,True,0.0,Bailey,Female,17
192,Veteran,False,0.3,Pascal,Male,22
193,Amateur,False,0.0,Dylan,Prefer not to say,17
194,Amateur,False,2.3,Harlow,Male,17


In [5]:
players_plot_bar1 = alt.Chart(dropped_players).mark_bar().encode(
    x = alt.X("experience").title("Experience of Players"),
    y = alt.Y("played_hours").title("Played Hours")
)
players_plot_bar1

In [6]:
players_plot_scatter = alt.Chart(dropped_players).mark_point().encode(
    x = alt.X("age").title("Age of Player"),
    y= alt.Y("played_hours").title("Played Hours"),
    color = alt.Color("experience").title("Experience").scale(scheme = "set1")
)
players_plot_scatter

In [7]:
players_plot_bar2 = alt.Chart(dropped_players).mark_bar().encode(
    x = alt.X("subscribe").title("Players Subcribed"),
    y = alt.Y("played_hours").title("Played Hours"),
    color = alt.Color("experience").title("Experience").scale(scheme = "set1")
)
players_plot_bar2

In [8]:
players_plot_bar3 = alt.Chart(dropped_players).mark_bar().encode(
    x = alt.X("experience").title("Experience of Players"),
    y = alt.Y("played_hours").title("Played Hours"),
    color = alt.Color("subscribe").title("Subscribed").scale(scheme = "set1")
)
players_plot_bar3

In [9]:
dropped_players["subscribe"].value_counts()

subscribe
True     144
False     52
Name: count, dtype: int64

In [10]:
# scaling so theres equal numbers of each option
not_subscribed_players = dropped_players[dropped_players["subscribe"] == False]
subscribed_players = dropped_players[dropped_players["subscribe"] == True]
not_subscribed_scaledup = not_subscribed_players.sample(
    n=subscribed_players.shape[0], replace=True
)
upsampled_players = pd.concat((not_subscribed_scaledup, subscribed_players))
upsampled_players["subscribe"].value_counts()


subscribe
False    144
True     144
Name: count, dtype: int64

In [11]:
import numpy as np
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split
set_config(transform_output="pandas")

In [12]:
#changing categories into numbers
#experience
upsampled_players["experience"] = upsampled_players["experience"].replace({
    "Beginner" : "1",
    "Amateur" : "2",
    "Regular" : "3",
    "Veteran" : "4",
    "Pro" : "5",
})

#gender
upsampled_players["gender"] = upsampled_players["gender"].replace({
    "Agender" : "1",
    "Female" : "2",
    "Male" : "3",
    "Two-Spirited" : "4",
    "Non-binary" : "5",
    "Other" : "6",
    "Prefer not to say" : "7",
})

upsampled_players

Unnamed: 0,experience,subscribe,played_hours,name,gender,age
184,5,False,1.7,Asher,3,17
29,4,False,0.1,Vivienne,3,18
7,2,False,0.0,Emerson,3,21
171,1,False,1.8,Amelia,3,32
108,4,False,0.0,Wren,3,20
...,...,...,...,...,...,...
187,2,True,0.0,Jasper,3,17
188,1,True,0.0,Lina,2,17
190,2,True,0.0,Rhys,3,20
191,2,True,0.0,Bailey,2,17


In [16]:
#train test split

players_train, players_test = train_test_split(
    upsampled_players, train_size=0.75, stratify=upsampled_players["subscribe"])

X_train = players_train[["age", "played_hours","experience","gender"]]
y_train = players_train["subscribe"]
X_test = players_test[["age", "played_hours","experience","gender"]]
y_test = players_test["subscribe"]
# scaling

players_preprocessor = make_column_transformer(
    (StandardScaler(), ["age", "played_hours","experience","gender"]),)

#finding best K
knn = KNeighborsClassifier()
players_tune_pipe = make_pipeline(players_preprocessor, knn)

parameter_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 40),
}
players_tune_grid = GridSearchCV(
    estimator=players_tune_pipe,
    param_grid=parameter_grid,
    cv=5
)

In [18]:
#fitting to data

players_tune_grid.fit(X_train, y_train)

players_grid = pd.DataFrame(players_tune_grid.cv_results_)
players_grid
#plotting the accuracy vs k
accuracy_vs_k = alt.Chart(players_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("Neighbors"),
    y=alt.Y("mean_test_score")
        .scale(zero=False)
        .title("Accuracy estimate")
)

accuracy_vs_k

In [20]:
#best k?
players_tune_grid.best_params_

{'kneighborsclassifier__n_neighbors': 1}