## Resources

- Berserk documentation: https://berserk.readthedocs.io/en/master/index.html

- Lichess API documentation: https://lichess.org/api

- Lichess Database: https://database.lichess.org/

## Imports

In [2]:
import dotenv
import requests
import os
import berserk
import random
import pandas as pd
import numpy as np
from datetime import date
from tqdm import tqdm
import matplotlib.pyplot as plt

## Load API token and usernames

The current username list is retrieved from the game data found on database.lichess.org

In [3]:
# get token:
api_token = os.environ.get('LICHESS_API_TOKEN')

# import usernames as list:
with open("../data/usernames.txt", "r") as file:
    users = file.read().split(",")

## Get a sample of users, start a session

In [132]:
users_sample = random.sample(users, k=400)

# opening a session to access lichess data:
session = berserk.TokenSession(api_token)
client = berserk.Client(session)

## Request data from the API

In [134]:
# rating dataframes:
rapid_ratings = pd.DataFrame(columns=['username', "year", 	"month", 	"day", 	"rating"])
puzzle_ratings = pd.DataFrame(columns=['username', "year", 	"month", 	"day", 	"rating"])

# rapid games dataframe:
rapid_games = pd.DataFrame()

for user in tqdm(users_sample):
    try:
      user_history = client.users.get_rating_history(user)
      if len(user_history[2]["points"]) > 50 and len(user_history[2]["points"]) < 500: # only extract data if this user has more than 50 rated rapid games.

        # rating data:
        # rapid ratings
        user_rapid_ratings = pd.DataFrame(user_history[2]["points"])
        user_rapid_ratings.insert(0, "username", user)
        user_rapid_ratings['month'] += 1  # because months in lichess API start at 0 we have to increment by 1
        rapid_ratings = pd.concat([rapid_ratings, user_rapid_ratings])

        # puzzle ratings
        user_puzzle_ratings = pd.DataFrame(user_history[13]["points"])
        user_puzzle_ratings.insert(0, "username", user)
        user_puzzle_ratings['month'] += 1  # because months in lichess API start at 0 we have to increment by 1
        puzzle_ratings = pd.concat([puzzle_ratings, user_puzzle_ratings])

        # rapid games information:
        user_games = pd.DataFrame(list(client.games.export_by_player(user, evals=True, clocks=True, opening=True, perf_type="rapid")))
        user_games.insert(0, "username", user)
        rapid_games = pd.concat([rapid_games, user_games])

    except KeyError:
      pass
    except berserk.exceptions.ResponseError:
      pass

100%|██████████| 400/400 [3:06:04<00:00, 27.91s/it]   


## Formatting Data

### Rapid Ratings

In [135]:
rapid_ratings["date"] = pd.to_datetime(rapid_ratings[rapid_ratings.columns[1:4]])
rapid_ratings = rapid_ratings.drop(labels=["year","month","day"], axis=1)
rapid_ratings

Unnamed: 0,username,rating,date
0,volamcaothu,1221,2018-04-13
1,volamcaothu,1195,2018-04-18
2,volamcaothu,1124,2018-04-19
3,volamcaothu,1186,2018-07-10
4,volamcaothu,1149,2018-09-29
...,...,...,...
126,vodovorot1951,2100,2023-10-13
127,vodovorot1951,2089,2023-10-17
128,vodovorot1951,2097,2023-11-01
129,vodovorot1951,2103,2023-11-03


### Puzzle Ratings

In [136]:
puzzle_ratings["date"] = pd.to_datetime(puzzle_ratings[puzzle_ratings.columns[1:4]])
puzzle_ratings = puzzle_ratings.drop(labels=["year","month","day"], axis=1)
puzzle_ratings

Unnamed: 0,username,rating,date
0,volamcaothu,1384,2019-07-14
1,volamcaothu,1464,2019-08-31
2,volamcaothu,1414,2019-09-14
3,volamcaothu,1544,2019-09-16
4,volamcaothu,1638,2019-09-17
...,...,...,...
117,vodovorot1951,2335,2023-09-18
118,vodovorot1951,2389,2023-09-25
119,vodovorot1951,2444,2023-10-02
120,vodovorot1951,2337,2023-10-03


### Rapid Games

In [137]:
# drop games with nonstandard starting position:
rapid_games = rapid_games[rapid_games["variant"]=="standard"]

# drop games that were not started:
rapid_games = rapid_games[rapid_games.status!="noStart"]

# reset indices:
rapid_games = rapid_games.reset_index(drop=True)


# change id to link for the game:
rapid_games.loc[:,"id"] = "https://lichess.org/" + rapid_games.loc[:,"id"]


# extract white and black columns from players column:
players = pd.DataFrame(rapid_games["players"].to_list())
players = players.applymap(lambda x: x.get("user", {}).get("name"))
rapid_games = pd.concat([rapid_games, players], axis=1)


# extract opening names:
rapid_games["opening"] = rapid_games.apply(lambda x: x["opening"], axis=1).apply(lambda x: x.get("name") if not pd.isna(x) else x)


# extract evaluations:
def extract_eval(x):
    if isinstance(x, list):
        return [item['eval'] for item in x if isinstance(item, dict) and 'eval' in item]
    else:
        return []
rapid_games['analysis'] = rapid_games['analysis'].apply(extract_eval)


# add outcome of the game (win, draw, loss) from the perspective of the username:
conditions = [
    rapid_games["winner"].isna(),
    (rapid_games["winner"]=="white") & (rapid_games["white"]==rapid_games["username"]),
    (rapid_games["winner"]=="black") & (rapid_games["black"]==rapid_games["username"]),
]
choices = [
    "draw",
    "win",
    "win"
]
rapid_games["outcome"] = np.select(conditions, choices, default="loss")


# drop unnecessary columns:
rapid_games = rapid_games.drop(labels=["variant","perf","players","swiss","initialFen","tournament"], axis=1)

  players = players.applymap(lambda x: x.get("user", {}).get("name"))


## Save as .csv

In [140]:
puzzle_ratings.to_csv("../data/puzzle_ratings_Dec15.csv", index=False)
rapid_ratings.to_csv("../data/rapid_ratings_Dec15.csv", index=False)
rapid_games.to_csv("../data/rapid_games_Dec15.csv", index=False)