# Hikaru Investigation Simulation

## Loading Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error,median_absolute_error
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyRegressor
pd.options.mode.chained_assignment = None
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

In [None]:
df = pd.read_csv("Hikaru Big Simulation.csv")

## Extra Functions

In [None]:
def rmse(mse):
    return mse**0.5

In [None]:
def get_scores(target, predictions):
    mse = round(mean_squared_error(target, predictions),3)
    mean_ae = round(mean_absolute_error(target,predictions),3)
    rmse = round(mean_squared_error(target, predictions)**0.5,3)
    median_ae = round(median_absolute_error(target,predictions),3)
    r2 = r2_score(target,predictions)
    return [mse,rmse,mean_ae,median_ae,r2]

## Exploratory Data Analysis

### Numerical Statistics

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.describe()

### Graphical Analysis

In [None]:
sns.histplot(df,x='Number of 40 Win Streaks',y='Hikaru ELO',bins=10)
plt.savefig('Number of Streaks By ELO Difference')

In [None]:
df[['Hikaru ELO','Number of 40 Win Streaks']].plot(
        x='Hikaru ELO',
        y='Number of 40 Win Streaks',
        kind='scatter',
        title='Number of Streaks VS. Hikaru ELO')
plt.savefig('Number of Streaks VS Hikaru ELO')

In [None]:
df[['Anonymous GM ELO','Number of 40 Win Streaks']].plot(
        x='Anonymous GM ELO',
        y='Number of 40 Win Streaks',
        kind='scatter',
        title='Number of Streaks VS. GM ELO')
plt.savefig('Number of Streaks VS GM ELO')

In [None]:
df[['Hikaru Score','Number of 40 Win Streaks']].plot(
        x='Hikaru Score',
        y='Number of 40 Win Streaks',
        kind='scatter',
        title='Number of Streaks VS. Hikaru Score')
plt.savefig('Number of Streaks VS Hikaru Score')

In [None]:
df[['Anonymous GM Score','Number of 40 Win Streaks']].plot(
        x='Anonymous GM Score',
        y='Number of 40 Win Streaks',
        kind='scatter',
        xticks=[i for i in range(0,12000,1000)],
        title='Number of Streaks VS. GM Score')
plt.savefig('Number of Streaks VS GM ELO')

In [None]:
df.groupby('Hikaru ELO')[['Hikaru ELO','Hikaru Biggest Win Streak']].mean().plot(
        x='Hikaru ELO',
        y='Hikaru Biggest Win Streak',
        kind='scatter',
        title="Hikaru's Average Win Streak by ELO")
plt.savefig('Hikaru Average Win Streak by ELO')

In [None]:
df.groupby('Hikaru ELO')[['Hikaru ELO','Hikaru Biggest Win Streak']].median().plot(
        x='Hikaru ELO',
        y='Hikaru Biggest Win Streak',
        kind='scatter',
        title="Hikaru's Median Win Streak by ELO")
plt.savefig('Hikaru Median Win Streak by ELO')

In [None]:
df.groupby('Hikaru ELO')[['Hikaru ELO','Hikaru Biggest Win Streak']].min().plot(
        x='Hikaru ELO',
        y='Hikaru Biggest Win Streak',
        kind='scatter',
        title="Hikaru's Smallest Win Streak by ELO")
plt.savefig('Hikaru Smallest Win Streak by ELO')

In [None]:
df.groupby('Hikaru ELO')[['Hikaru ELO','Hikaru Biggest Win Streak']].max().plot(
        x='Hikaru ELO',
        y='Hikaru Biggest Win Streak',
        kind='scatter',
        title="Hikaru's Biggest Streak by ELO")
plt.savefig('Hikaru Biggest Streak by ELO')

In [None]:
df['ELO Difference'] = df['Hikaru ELO']-df['Anonymous GM ELO']

In [None]:
df.groupby('ELO Difference')[['ELO Difference','Number of 40 Win Streaks']].mean().plot(
        x='ELO Difference',
        y='Number of 40 Win Streaks',
        kind='scatter',
        xticks=[i for i in range(0,800,100)],
        title='Number of Streaks By ELO Difference')
plt.savefig('Number of Streaks By ELO Difference')

In [None]:
df.groupby('ELO Difference')[['ELO Difference','Hikaru Biggest Win Streak']].mean().plot(
        x='ELO Difference',
        y='Hikaru Biggest Win Streak',
        kind='scatter',
        xticks=[i for i in range(0,800,100)],
        title='Longest Win Streak By ELO Difference')
plt.savefig('Longest Win Streak By ELO Difference')

Based on the graphs provided, it is very evident that as Hikaru's ELO increased, the number of 40 game win streaks increases linearly. Also, as Hikaru's score increases, the expected number of 40 game win streaks increases exponentially. The larger the difference, the exponentially higher number of 40 game win streaks occurs and an exponential growth in the size of the largest winning streak.

## AI Predictions

### Splitting Data

In [None]:
scores = []
random_state = 42

In [None]:
features = df.drop('Number of 40 Win Streaks',axis=1)
target = df['Number of 40 Win Streaks']

In [None]:
features_train, features_test, target_train, target_test = train_test_split(features,target,test_size=0.15)

### Scaling Data

In [None]:
scaler = StandardScaler()

In [None]:
features_train_scaled = features_train.copy()
features_test_scaled = features_test.copy()

In [None]:
scaler.fit(features_train_scaled)

In [None]:
features_train_scaled = scaler.transform(features_train_scaled)
features_test_scaled = scaler.transform(features_test_scaled)

### Dummy

In [None]:
dr = DummyRegressor(strategy="mean")
dr.fit(features_train_scaled,target_train)

In [None]:
dr_predictions = dr.predict(features_test_scaled)

In [None]:
scores.append(get_scores(target_test,dr_predictions))

### Logistic

In [None]:
lr = LogisticRegression(random_state=random_state).fit(features_train_scaled,target_train)

In [None]:
lr_predictions = lr.predict(features_test_scaled)

In [None]:
scores.append(get_scores(target_test,lr_predictions))

### Random Forest

In [None]:
rfr_parameters = {'random_state':[random_state],
                  'warm_start':[True,False],
                  'n_estimators':[i for i in range(50,201,50)],
                  'max_depth':[None,4,6,8],
                  'max_features':[None],
                  'min_samples_leaf':[1,3,5],
                  'verbose':[0]}

In [None]:
rfr = GridSearchCV(RandomForestRegressor(),
                   rfr_parameters,cv=5,
                   verbose=10,
                   scoring=make_scorer(rmse)).fit(features_train_scaled,target_train)

In [None]:
rfr_predictions = rfr.best_estimator_.predict(features_test_scaled)

In [None]:
scores.append(get_scores(target_test,rfr_predictions))

### Cat Boost

In [None]:
cbr_parameters = {'verbose':[0],
                  'iterations':[i for i in range(100,1001,200)],
                  'early_stopping_rounds':[1,5,10],
                  'random_state':[random_state],
                  'learning_rate':[0.0005,0.001,0.01],
                  'eval_metric':['AUC']}

In [41]:
cbr = GridSearchCV(CatBoostRegressor(),
                   cbr_parameters,
                   scoring=make_scorer(rmse),
                   cv=5,
                   verbose=10).fit(features_train_scaled,target_train)

[CV 4/5; 45/45] END early_stopping_rounds=10, eval_metric=AUC, iterations=900, learning_rate=0.01, random_state=42, verbose=0;, score=nan total time=   7.3s
[CV 5/5; 45/45] START early_stopping_rounds=10, eval_metric=AUC, iterations=900, learning_rate=0.01, random_state=42, verbose=0
[CV 5/5; 45/45] END early_stopping_rounds=10, eval_metric=AUC, iterations=900, learning_rate=0.01, random_state=42, verbose=0;, score=nan total time=   7.0s


In [42]:
cbr_predictions = cbr.best_estimator_.predict(features_test_scaled)

In [43]:
scores.append(get_scores(target_test,cbr_predictions))

### Light Gradient Boost

In [46]:
lgbm_parameters = {'verbose':[0],
                  'n_estimators':[i for i in range(50,251,50)],
                  'num_leaves':[20,31,45],
                  'max_depth':[-1,2,4,6],
                  'random_state':[random_state],
                  'learning_rate':[0.0005,0.001,0.01,0.1]}

In [47]:
lgbm = GridSearchCV(LGBMRegressor(),
                    lgbm_parameters,
                    scoring=make_scorer(rmse),
                    cv=5,
                    verbose=10).fit(features_train_scaled,target_train)

[CV 3/5; 11/240] END learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=31, random_state=42, verbose=0;, score=nan total time=   0.3s
[CV 4/5; 11/240] START learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=31, random_state=42, verbose=0
[CV 4/5; 11/240] END learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=31, random_state=42, verbose=0;, score=nan total time=   0.4s
[CV 5/5; 11/240] START learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=31, random_state=42, verbose=0
[CV 5/5; 11/240] END learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=31, random_state=42, verbose=0;, score=nan total time=   0.3s
[CV 1/5; 12/240] START learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=45, random_state=42, verbose=0
[CV 1/5; 12/240] END learning_rate=0.0005, max_depth=-1, n_estimators=200, num_leaves=45, random_state=42, verbose=0;, score=nan total time=   0.4s
[CV 2/5; 12/240] START learning_rate=0.0005, max_de

In [48]:
lgbm_predictions = lgbm.best_estimator_.predict(features_test_scaled)

In [49]:
scores.append(get_scores(target_test,lgbm_predictions))

### Decision Tree

In [50]:
dtr_parameters = {'random_state':[random_state],
                  'criterion':['squared_error','absolute_error'],
                  'max_depth':[None,2,4,6],
                  'min_samples_leaf':[1,3,5],
                  'min_samples_split':[2,3,4,5]}

In [51]:
dtr = GridSearchCV(DecisionTreeRegressor(),
                    dtr_parameters,
                    scoring=make_scorer(rmse),
                    cv=5,
                    verbose=10).fit(features_train_scaled,target_train)

[CV 1/5; 78/96] END criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42;, score=nan total time= 1.2min
[CV 2/5; 78/96] START criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42
[CV 2/5; 78/96] END criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42;, score=nan total time= 1.2min
[CV 3/5; 78/96] START criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42
[CV 3/5; 78/96] END criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42;, score=nan total time= 1.2min
[CV 4/5; 78/96] START criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42
[CV 4/5; 78/96] END criterion=absolute_error, max_depth=4, min_samples_leaf=3, min_samples_split=3, random_state=42;, score=nan total time= 1.2min
[CV 5/5; 78/96] START criterion=absolute_error, max_depth=

In [None]:
dtr_predictions = dtr.best_estimator_.predict(features_test_scaled)

In [None]:
scores.append(get_scores(target_test,dtr_predictions))

## AI Score Analysis

In [None]:
pd.DataFrame(data=scores,columns=['Mean Squared Error','Reduced Mean Squared Error','Mean Absolute Error','Median Absolute Error','R2'],
             index=['Dummy','Logistic','Random Forest','Cat Boost','Light Gradient Boost','Decision Tree'])

In [None]:
ai_scores = pd.DataFrame(data=scores,
                         columns=['Mean Squared Error',
                                  'Reduced Mean Squared Error',
                                  'Mean Absolute Error',
                                  'Median Absolute Error','R2'],
                         index=['Dummy','Logistic','Random Forest',
                                'Cat Boost','Light Gradient Boost','Decision Tree'])

In [None]:
pd.DataFrame(data=  {
                    'Dummy':[1/(len(features.columns)-1)]*(len(features.columns)-1),
                    'Logistic':[1/(len(features.columns)-1)]*(len(features.columns)-1),
                    'Random Forest':rfr.best_estimator_.feature_importances_,
                    'Cat Boost':cbr.best_estimator_.get_feature_importance(),
                    'Light Gradient':lgbm.best_estimator_.feature_importance(),
                    'Decision Tree':dtr.best_estimator_.feature_importances_,
                    })