In [1]:
import pandas as pd

DATA = '/kaggle/input/fide-titled-chess-players/titled_players_otb.csv'
df = pd.read_csv(filepath_or_buffer=DATA)
df.head()

Unnamed: 0,fideid,name,country,sex,title,w_title,o_title,foa_title,standart_rating,rapid_rating,...,WIM,WIM_year,FM,FM_year,CM,CM_year,WGM,WGM_year,WH,WH_year
0,1701991,"Aaberg, Anton",SWE,M,IM,,,,2322,2331,...,False,0,False,0,False,0,False,0,False,0
1,1407589,"Aabling-Thomsen, Jakob",DEN,M,IM,,,,2327,0,...,False,0,False,2009,False,0,False,0,False,0
2,25678191,Aaditya Dhingra,IND,M,IM,,,,2421,2208,...,False,0,False,0,False,2020,False,0,False,0
3,25778293,Aadya Gupta,IND,F,WFM,WFM,,,1968,1748,...,False,0,False,0,False,0,False,0,False,0
4,25991426,Aadya Ranganath,IND,F,WCM,WCM,,,1900,1704,...,False,0,False,0,False,0,False,0,False,0


We expect our three ratings to be pretty highly correlated. Are they?

In [2]:
df[['standart_rating', 'rapid_rating', 'blitz_rating',]].corr()

Unnamed: 0,standart_rating,rapid_rating,blitz_rating
standart_rating,1.0,0.14853,0.154276
rapid_rating,0.14853,1.0,0.660861
blitz_rating,0.154276,0.660861,1.0


They are not, except for rapid/blitz. 

Let's look at the ratings distributions. We probably expect them to be exponential.

In [3]:
from plotly import express
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)
express.histogram(data_frame=df[df['standart_rating'] > 0], x='standart_rating', facet_col='sex').show(renderer='iframe_connected',)

In [4]:
express.histogram(data_frame=df[df['rapid_rating'] > 0], x='rapid_rating', facet_col='sex').show(renderer='iframe_connected',)

In [5]:
express.histogram(data_frame=df[df['blitz_rating'] > 0], x='blitz_rating', facet_col='sex').show(renderer='iframe_connected',)

They look really Gaussian. Or at least binomial. 

Let's take our three ratings and project them into two dimensions and make a scatter plot.

In [6]:
from sklearn.manifold import TSNE

RANDOM_STATE = 2025
reducer = TSNE(random_state=RANDOM_STATE)
plot_df = pd.DataFrame(columns=['x', 'y'], data=reducer.fit_transform(X=df[['standart_rating', 'rapid_rating', 'blitz_rating',]]))
plot_df[['name', 'sex', 'standart_rating', 'rapid_rating', 'blitz_rating', ]] = df[['name', 'sex', 'standart_rating', 'rapid_rating', 'blitz_rating', ]].copy()

In [7]:
express.scatter(data_frame=plot_df, x='x', y='y', color='standart_rating', hover_name='name').show(renderer='iframe_connected',)

What do we see? We see that we have some dirty data due to zero standard ratings, but generally higher-rated players cluster together.

In [8]:
express.scatter(data_frame=plot_df, x='x', y='y', color='rapid_rating', hover_name='name').show(renderer='iframe_connected',)

In [9]:
express.scatter(data_frame=plot_df, x='x', y='y', color='blitz_rating', hover_name='name').show(renderer='iframe_connected',)

In [10]:
express.scatter(data_frame=plot_df, x='x', y='y', color='sex', hover_name='name').show(renderer='iframe_connected',)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22557 entries, 0 to 22556
Data columns (total 31 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   fideid           22557 non-null  int64  
 1   name             22557 non-null  object 
 2   country          22557 non-null  object 
 3   sex              22557 non-null  object 
 4   title            22557 non-null  object 
 5   w_title          4668 non-null   object 
 6   o_title          1134 non-null   object 
 7   foa_title        6 non-null      object 
 8   standart_rating  22557 non-null  int64  
 9   rapid_rating     22557 non-null  int64  
 10  blitz_rating     22557 non-null  int64  
 11  birthday         22426 non-null  float64
 12  flag             12438 non-null  object 
 13  IM               22557 non-null  bool   
 14  IM_year          22557 non-null  int64  
 15  WFM              22557 non-null  bool   
 16  WFM_year         22557 non-null  int64  
 17  WCM         