```
Parse IRT dataset from jsonlines, formatted in the following way:
* The dataset is in jsonlines format, each line representing the responses of a subject
* Each row looks like this:
{"subject_id": "<subject_id>", "responses": {"<item_id>": <response>}}
* Where <subject_id> is a string, <item_id> is a string, and <response> is a number (usually integer)
```  
- mengubah value table dengan perkiraan ini  
![](images/estimasi_level.png)
- filter data pemain dengan kondisi clear > 0 and minbp < notes * 0.2
- easy clear = jadikan player dengan clear >= 2 jadi 1 jika tidak 0

# Olah data
## Download table

In [None]:
import requests

sources = {
    "Satellite": "https://stellabms.xyz/sl/score.json",
    "Stella": "https://stellabms.xyz/st/score.json",
    "Insane1": "http://www.ribbit.xyz/bms/tables/insane_body.json",
    "Insane2": "https://rattoto10.github.io/second_table/insane_data.json",
}

output_paths = {
    "Satellite": "dataset/table_ori/satellite.json",
    "Stella": "dataset/table_ori/stella.json",
    "Insane1": "dataset/table_ori/insane1.json",
    "Insane2": "dataset/table_ori/insane2.json",
}

for source_name, source_url in sources.items():
    response = requests.get(source_url)

    if response.status_code == 200:
        response.encoding = 'utf-8'

        json_data = response.json()

        output_path = output_paths[source_name]
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(response.text)

        print(f"Downloaded and saved {source_name} JSON data to {output_path}")
    else:
        print(f"Failed to retrieve {source_name} JSON data. Status code: {response.status_code}")


## Ganti level table dan gabungkan

In [12]:
import json

dataset_paths = ["dataset/table_ori/insane1.json", "dataset/table_ori/insane2.json", "dataset/table_ori/satellite.json", "dataset/table_ori/stella.json"]
combined_data = {}

def transform_level(dataset_name, level):
    
    if level.isdigit(): 
        level = int(level)
    else:
        level = 0 
    
    if dataset_name == "insane1.json":
        return level + 11
    elif dataset_name == "insane2.json":
        if level == "0-":
            return 11.5
        elif level == "0":
            return 11.8
        else:
            return level + 11
    elif dataset_name == "satellite.json":
        return [0.5, 1.5, 3, 4.5, 6.5, 8.5, 10.5, 12, 13.5, 15.5, 16.5, 17.5, 19][level] + 11
    elif dataset_name == "stella.json":
        return [19.5, 21, 22, 22.5, 23.5, 24, 24.25, 24.5, 24.75, 25, 25.5, 26, 27, 27.5][level] + 11

for dataset_path in reversed(dataset_paths):  
    with open(dataset_path, "r", encoding="utf-8") as file:
        dataset = json.load(file)

    for item in dataset:
        md5 = item["md5"]
        level = item["level"]

        item["level"] = transform_level(dataset_path.split("/")[-1], level)

        if md5 in combined_data:
            combined_data[md5] = item
        else:
            combined_data[md5] = item 

combined_data_list = list(combined_data.values())

with open("dataset/combined_dataset.json", "w", encoding="utf-8") as outfile:
    json.dump(combined_data_list, outfile, ensure_ascii=False, indent=2)


## Dapatkan player score

In [None]:
import json
import os
import requests
import time
from xml.etree import ElementTree as ET

def fetch_scores_for_md5(md5, cache_dir):
    cache_file = os.path.join(cache_dir, f'{md5}.json')

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError(f'Invalid cache for {md5}')

        return data
    except (FileNotFoundError, ValueError):
        response = requests.get(f'http://dream-pro.info/~lavalse/LR2IR/2/getrankingxml.cgi?songmd5={md5}&id=1')
        response_text = response.text

        xml_start = response_text.find('<?xml')
        xml_end = response_text.rfind('</ranking>') + len('</ranking>')
        if xml_start != -1 and xml_end != -1:
            xml_text = response_text[xml_start:xml_end]
        else:
            raise ValueError(f'Invalid XML response for {md5}')

        xml_root = ET.fromstring(xml_text)
        data = []

        for score_elem in xml_root.findall('.//score'):
            score_data = {
                'name': score_elem.find('name').text,
                'id': int(score_elem.find('id').text),
                'clear': int(score_elem.find('clear').text),
                'notes': int(score_elem.find('notes').text),
                'combo': int(score_elem.find('combo').text),
                'pg': int(score_elem.find('pg').text),
                'gr': int(score_elem.find('gr').text),
                'minbp': int(score_elem.find('minbp').text),
            }
            data.append(score_data)

        for d in data:
            d['name'] = str(d['name'])

        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent='\t')

        return data

def get_scores_for_md5(md5, cache_dir):
    tries = 0

    while tries < 3:
        try:
            data = fetch_scores_for_md5(md5, cache_dir)
            return data
        except Exception as err:
            tries += 1
            sleep_time = (1000 * tries + (0.5 - 1) * 1000) * 2
            print(f'Got throttled ({md5}): Sleeping for {sleep_time:.0f}ms. {err}')
            time.sleep(sleep_time)

    raise Exception(f"Couldn't fetch data in 3 tries. Giving up and exiting.")

cache_directory = "dataset/lr2ir" 

md5_values = []

json_files = ["dataset/combined_dataset.json"]
for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for item in data:
            md5_values.append(item['md5'])

for md5 in md5_values:
    scores = get_scores_for_md5(md5, cache_directory)
    print(scores)


## Filter dataset dengan kondisi clear > 0 and minbp < notes * 0.2
0 = no play  
1 = failed  
2 = easy clear  
3 = normal clear  
4 = hard clear  
5 = full combo  

In [13]:
import json
import os

with open("dataset/combined_dataset.json", "r", encoding="utf-8") as combined_file:
    combined_data = json.load(combined_file)

def filter_and_save_dataset(md5):
    dataset_path = f"dataset/lr2ir/{md5}.json"
    if os.path.exists(dataset_path):
        with open(dataset_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        filtered_dataset = [item for item in dataset if item["clear"] > 0 and item["minbp"] < item["notes"] * 0.2]

        filtered_path = f"dataset/filtered_lr2ir/{md5}.json"
        with open(filtered_path, "w", encoding="utf-8") as outfile:
            json.dump(filtered_dataset, outfile, ensure_ascii=False, indent=2)
    else:
        print(f"Dataset not found for MD5: {md5}")

for item in combined_data:
    md5 = item["md5"]
    filter_and_save_dataset(md5)


## Akuisisi Data

In [15]:
import json
import os

with open("dataset/combined_dataset.json", "r", encoding="utf-8") as combined_file:
    combined_data = json.load(combined_file)

def process_subject_responses(md5):
    response_dict = {"subject_id": md5, "responses": {}}

    dataset_path = f"dataset/filtered_lr2ir/{md5}.json"
    if os.path.exists(dataset_path):
        with open(dataset_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        for item in dataset:
            item_id = str(item["id"])
            clear = item["clear"]
            response = 1 if clear >= 2 else 0
            response_dict["responses"][item_id] = response

        with open("dataset/easy_dataset.jsonlines", "a", encoding="utf-8") as outfile:
            json.dump(response_dict, outfile, ensure_ascii=False)
            outfile.write("\n")
    else:
        print(f"Filtered dataset not found for MD5: {md5}")

for item in combined_data:
    md5 = item["md5"]
    process_subject_responses(md5)


In [1]:
import json
import os

with open("dataset/combined_dataset.json", "r", encoding="utf-8") as combined_file:
    combined_data = json.load(combined_file)

def process_subject_responses(md5):
    response_dict = {"subject_id": md5, "responses": {}}

    dataset_path = f"dataset/filtered_lr2ir/{md5}.json"
    if os.path.exists(dataset_path):
        with open(dataset_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        for item in dataset:
            item_id = str(item["id"])
            clear = item["clear"]
            response = 1 if clear >= 4 else 0
            response_dict["responses"][item_id] = response

        with open("dataset/hard_dataset.jsonlines", "a", encoding="utf-8") as outfile:
            json.dump(response_dict, outfile, ensure_ascii=False)
            outfile.write("\n")
    else:
        print(f"Filtered dataset not found for MD5: {md5}")

for item in combined_data:
    md5 = item["md5"]
    process_subject_responses(md5)


# Run py-irt

## train dan evaluasi data
By default this will train a model with 90% of the provided data and evaluate with the remaining 10% 

In [2]:
!poetry run py-irt train-and-evaluate --help

                                                                               
 Usage: py-irt.cmd train-and-evaluate [OPTIONS] MODEL_TYPE DATA_PATH           
 OUTPUT_DIR                                                                    
                                                                               
┌─ Arguments ─────────────────────────────────────────────────────────────────┐
│ *    model_type      TEXT  [default: None] [required]                       │
│ *    data_path       TEXT  [default: None] [required]                       │
│ *    output_dir      TEXT  [default: None] [required]                       │
└─────────────────────────────────────────────────────────────────────────────┘
┌─ Options ───────────────────────────────────────────────────────────────────┐
│ --epochs              INTEGER  [default: 2000]                              │
│ --priors              TEXT     [default: None]                              │
│ --dims                INTEGER  [defaul

In [14]:
!poetry run py-irt evaluate --help

Usage: py-irt.cmd evaluate [OPTIONS] MODEL_TYPE PARAMETER_PATH TEST_PAIRS_PATH
                           OUTPUT_DIR

Arguments:
  MODEL_TYPE       [required]
  PARAMETER_PATH   [required]
  TEST_PAIRS_PATH  [required]
  OUTPUT_DIR       [required]

Options:
  --epochs INTEGER     [default: 2000]
  --device TEXT        [default: cpu]
  --initializers TEXT
  --evaluation TEXT    [default: heldout]
  --seed INTEGER       [default: 42]
  --train-size FLOAT   [default: 0.9]
  --help               Show this message and exit.


In [1]:
!poetry run py-irt train-and-evaluate 2pl dataset/easy_dataset.jsonlines 2pl/easyx/

[07:45:41] config: model_type='2pl' epochs=2000 priors=None          cli.py:176
           initializers=[] dims=None lr=0.1 lr_decay=0.9999                    
           dropout=0.5 hidden=100 vocab_size=None log_every=100                
           seed=None deterministic=False                                       
           data_path: dataset/easy_dataset.jsonlines                 cli.py:178
           output directory: 2pl/easyx/                              cli.py:179
[07:45:55] amortized: False                                      dataset.py:112
[07:46:35] Vocab size: None                                      training.py:90
[07:46:37] Training Model...                                         cli.py:209
[07:46:37] args: {'device': 'cpu', 'num_items': 52766,          training.py:134
           'num_subjects': 5230}                                               
           Parsed Model Args: {'device': 'cpu', 'num_items':    training.py:147
           52766, 'num_subjects': 5230, 

In [3]:
!poetry run py-irt train-and-evaluate 2pl dataset/hard_dataset.jsonlines 2pl/hardx/

[09:07:21] config: model_type='2pl' epochs=2000 priors=None          cli.py:176
           initializers=[] dims=None lr=0.1 lr_decay=0.9999                    
           dropout=0.5 hidden=100 vocab_size=None log_every=100                
           seed=None deterministic=False                                       
           data_path: dataset/hard_dataset.jsonlines                 cli.py:178
           output directory: 2pl/hardx/                              cli.py:179
[09:07:36] amortized: False                                      dataset.py:112
[09:08:15] Vocab size: None                                      training.py:90
[09:08:18] Training Model...                                         cli.py:209
[09:08:18] args: {'device': 'cpu', 'num_items': 52766,          training.py:134
           'num_subjects': 5230}                                               
           Parsed Model Args: {'device': 'cpu', 'num_items':    training.py:147
           52766, 'num_subjects': 5230, 

In [4]:
import json

with open('2pl/easyx/best_parameters.json', 'r') as file:
    data = json.load(file)

# Count the number of items in each dictionary
num_ability_items = len(data["ability"])
num_diff_items = len(data["diff"])
num_disc_items = len(data["disc"])
num_item_ids = len(data["item_ids"])
num_subject_ids = len(data["subject_ids"])

# Print the counts
print(f"Number of items in 'ability': {num_ability_items}")
print(f"Number of items in 'diff': {num_diff_items}")
print(f"Number of items in 'disc': {num_disc_items}")
print(f"Number of items in 'item_ids': {num_item_ids}")
print(f"Number of items in 'subject_ids': {num_subject_ids}")

Number of items in 'ability': 5230
Number of items in 'diff': 52766
Number of items in 'disc': 52766
Number of items in 'item_ids': 52766
Number of items in 'subject_ids': 5230


In [5]:
import json

with open('2pl/hardx/best_parameters.json', 'r') as file:
    data = json.load(file)

# Count the number of items in each dictionary
num_ability_items = len(data["ability"])
num_diff_items = len(data["diff"])
num_disc_items = len(data["disc"])
num_item_ids = len(data["item_ids"])
num_subject_ids = len(data["subject_ids"])

# Print the counts
print(f"Number of items in 'ability': {num_ability_items}")
print(f"Number of items in 'diff': {num_diff_items}")
print(f"Number of items in 'disc': {num_disc_items}")
print(f"Number of items in 'item_ids': {num_item_ids}")
print(f"Number of items in 'subject_ids': {num_subject_ids}")

Number of items in 'ability': 5230
Number of items in 'diff': 52766
Number of items in 'disc': 52766
Number of items in 'item_ids': 52766
Number of items in 'subject_ids': 5230


## Output hasil

### Easy

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

with open('dataset/combined_dataset.json', 'r', encoding="utf-8") as chart_file:
    combined_data = json.load(chart_file)

best_parameters = []
with open('2pl/easyx/best_parameters.json', 'r', encoding="utf-8") as parameters_file:
    for line in parameters_file:
        params = json.loads(line)
        abilities = params['ability']
        for subject_id, ability in zip(params['subject_ids'].values(), abilities):
            best_parameters.append({'subject_id': subject_id, 'ability': ability})

model_predictions = []
with open('2pl/easyx/model_predictions.jsonlines', 'r', encoding="utf-8") as predictions_file:
    for line in predictions_file:
        model_predictions.append(json.loads(line))

combined_df = pd.DataFrame(combined_data)
parameters_df = pd.DataFrame(best_parameters)
predictions_df = pd.DataFrame(model_predictions)

predictions_df.dropna(subset=['prediction'], inplace=True)

summary_df = predictions_df.groupby('subject_id').agg({'response': lambda x: (x == 1).sum(), 'prediction': ['mean', 'count']}).reset_index()
summary_df.columns = ['subject_id', 'clear', 'average_prediction', 'playercount']

summary_df['subject_id'] = summary_df['subject_id'].astype(str)

result_df = pd.DataFrame({
    'md5': combined_df['md5'],
    'title': combined_df['title'],
    'base_level': combined_df['level'],
    'prediction': summary_df['average_prediction'],
    'ability': parameters_df['ability'],
    'player_count': summary_df['playercount'],
    'clear_count': summary_df['clear']
})
result_df.dropna(inplace=True)

def calculate_est_level(row):
    base_level = row['base_level']
    prediction = row['prediction']
    
    mid_zone_start = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 33.33)
    mid_zone_end = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 66.67)

    if prediction <= mid_zone_start:
        return base_level + (mid_zone_start - prediction)
    elif mid_zone_start < prediction <= mid_zone_end:
        return base_level
    else:
        return base_level - (prediction - mid_zone_end)

result_df['est_level'] = result_df.apply(calculate_est_level, axis=1)
result_df.sort_values(by='est_level', ascending=True, inplace=True)

mae_est_level = mean_absolute_error(result_df['base_level'], result_df['est_level'])
print("MAE for est_level:", mae_est_level)

result_df.to_csv('output/easy_output_final.csv', index=False)

print(result_df)

MAE for est_level: 0.03363296658688851
                                   md5  \
4257  bed8c515b04d9fa79d19d51a57890a28   
4258  508330b4bc4513536aea7945d90909e2   
4256  975edffb868ef4d1cde4b845022ea316   
1475  b341e4cc6f0f6a59100910ed11a92f8d   
2965  03c906bf1229a701a0471165242e1233   
...                                ...   
1386  fed59d3b700499c9207c4995d160cc9a   
1381  8fdd4ffb45d79ef2e713a01abc9a08c3   
1383  e01344b7d64159e7aa28b29d32ea6351   
1382  bb843baa1332cdacd049f1e142c79045   
1384  f880de58c2dcb3ea4e3d9dc39b096080   

                                                  title  base_level  \
4257                  さいこ゛のたたかい /たひ゛のおわり [7Key Another]        11.0   
4258                                            マジカル縦連打        11.0   
4256                            Thrill Trigger EX2 YPER        11.0   
1475                               Lieselotte [ANOTHER]        11.0   
2965                                Pure Ruby [Another]        11.0   
...                           

In [None]:
import pandas as pd
from tabulate import tabulate

#result_df.to_csv('output/easy_output_final.csv', index=False)
df = pd.read_csv('output/easy_output_final.csv')

html_table = tabulate(df, headers='keys', tablefmt='html')

styled_html_table = f'''
<!DOCTYPE html>
<html>
<head>
    <title>Easy Clear Estimation</title>
    <style>
        table {{border-collapse: collapse; border: 2px solid black;}}
        th, td {{border: 1px solid black; padding: 10px;}}
        tr:nth-child(even) {{background-color: #ff9393;}}
    </style>
</head>
<body>
    <h1>Easy Clear Estimation</h1>
    {html_table}
</body>
</html>
'''

with open('output/easy_output_final.html', 'w', encoding='utf-8') as f:
    f.write(styled_html_table)

### Hard

In [5]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

with open('dataset/combined_dataset.json', 'r', encoding="utf-8") as chart_file:
    combined_data = json.load(chart_file)

best_parameters = []
with open('2pl/hardx/best_parameters.json', 'r', encoding="utf-8") as parameters_file:
    for line in parameters_file:
        params = json.loads(line)
        abilities = params['ability']
        for subject_id, ability in zip(params['subject_ids'].values(), abilities):
            best_parameters.append({'subject_id': subject_id, 'ability': ability})
            
model_predictions = []
with open('2pl/hardx/model_predictions.jsonlines', 'r', encoding="utf-8") as predictions_file:
    for line in predictions_file:
        model_predictions.append(json.loads(line))

combined_df = pd.DataFrame(combined_data)
parameters_df = pd.DataFrame(best_parameters)
predictions_df = pd.DataFrame(model_predictions)

predictions_df.dropna(subset=['prediction'], inplace=True)

summary_df = predictions_df.groupby('subject_id').agg({'response': lambda x: (x == 1).sum(), 'prediction': ['mean', 'count']}).reset_index()
summary_df.columns = ['subject_id', 'clear', 'average_prediction', 'playercount']

summary_df['subject_id'] = summary_df['subject_id'].astype(str)

result_df = pd.DataFrame({
    'md5': combined_df['md5'],
    'title': combined_df['title'],
    'base_level': combined_df['level'],
    'prediction': summary_df['average_prediction'],
    'ability': parameters_df['ability'],
    'player_count': summary_df['playercount'],
    'clear_count': summary_df['clear']
})
result_df.dropna(inplace=True)

def calculate_est_level(row):
    base_level = row['base_level']
    prediction = row['prediction']
    
    mid_zone_start = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 33.33)
    mid_zone_end = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 66.67)

    if prediction <= mid_zone_start:
        return base_level + (mid_zone_start - prediction)
    elif mid_zone_start < prediction <= mid_zone_end:
        return base_level
    else:
        return base_level - (prediction - mid_zone_end)

result_df['est_level'] = result_df.apply(calculate_est_level, axis=1)
result_df.sort_values(by='est_level', ascending=True, inplace=True)

mae_est_level = mean_absolute_error(result_df['base_level'], result_df['est_level'])
print("MAE for est_level:", mae_est_level)

result_df.to_csv('output/hard_output_final.csv', index=False)

print(result_df)

MAE for est_level: 0.039402934422971686
                                   md5  \
2965  03c906bf1229a701a0471165242e1233   
2961  a23ea067c01254f860a55171dbee4f89   
2971  8bdc1d1fe654d75136173211f124ad70   
4257  bed8c515b04d9fa79d19d51a57890a28   
4258  508330b4bc4513536aea7945d90909e2   
...                                ...   
1386  fed59d3b700499c9207c4995d160cc9a   
1383  e01344b7d64159e7aa28b29d32ea6351   
1381  8fdd4ffb45d79ef2e713a01abc9a08c3   
1382  bb843baa1332cdacd049f1e142c79045   
1384  f880de58c2dcb3ea4e3d9dc39b096080   

                                                  title  base_level  \
2965                                Pure Ruby [Another]        11.0   
2961                          Master of GENOCIDE [発狂入門]        11.0   
2971                                 コスモワンダラー [SAETHER]        11.0   
4257                  さいこ゛のたたかい /たひ゛のおわり [7Key Another]        11.0   
4258                                            マジカル縦連打        11.0   
...                          

In [6]:
import pandas as pd
from tabulate import tabulate

df = pd.read_csv('output/hard_output_final.csv')

html_table = tabulate(df, headers='keys', tablefmt='html')

styled_html_table = f'''
<!DOCTYPE html>
<html>
<head>
    <title>Hard Clear Estimation</title>
    <style>
        table {{border-collapse: collapse; border: 2px solid black;}}
        th, td {{border: 1px solid black; padding: 10px;}}
        tr:nth-child(even) {{background-color: #ff9393;}}
    </style>
</head>
<body>
    <h1>Hard Clear Estimation</h1>
    {html_table}
</body>
</html>
'''

with open('output/hard_output_final.html', 'w', encoding='utf-8') as f:
    f.write(styled_html_table)

---