```
Parse IRT dataset from jsonlines, formatted in the following way:
* The dataset is in jsonlines format, each line representing the responses of a subject
* Each row looks like this:
{"subject_id": "<subject_id>", "responses": {"<item_id>": <response>}}
* Where <subject_id> is a string, <item_id> is a string, and <response> is a number (usually integer)
```  
- mengubah value table dengan perkiraan ini  
![](images/estimasi_level.png)


# Olah data
## Download table

In [None]:
# Mengimpor modul requests untuk melakukan HTTP requests
import requests

# Daftar sumber data JSON dan URL-nya
sources = {
    "Satellite": "https://stellabms.xyz/sl/score.json",
    "Stella": "https://stellabms.xyz/st/score.json",
    "Insane1": "http://www.ribbit.xyz/bms/tables/insane_body.json",
    "Insane2": "https://rattoto10.github.io/second_table/insane_data.json",
}

# Lokasi penyimpanan file output untuk masing-masing sumber
output_paths = {
    "Satellite": "dataset/table_ori/satellite.json",
    "Stella": "dataset/table_ori/stella.json",
    "Insane1": "dataset/table_ori/insane1.json",
    "Insane2": "dataset/table_ori/insane2.json",
}

# Loop utama untuk mengunduh data dari semua sumber
for source_name, source_url in sources.items():
    # Mengirim GET request ke URL sumber
    response = requests.get(source_url)

    # Memeriksa status code response
    if response.status_code == 200:
        # Mengatur encoding response ke UTF-8 untuk handle karakter khusus
        response.encoding = 'utf-8'
        
        # Parse data JSON dari response (tidak digunakan tapi bisa untuk validasi)
        json_data = response.json()
        
        # Mengambil path output yang sesuai dari dictionary output_paths
        output_path = output_paths[source_name]
        
        # Menyimpan data ke file dengan encoding UTF-8
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        
        # Notifikasi sukses dengan nama sumber dan path output
        print(f"Downloaded and saved {source_name} JSON data to {output_path}")
    else:
        # Notifikasi gagal dengan informasi status code
        print(f"Failed to retrieve {source_name} JSON data. Status code: {response.status_code}")

## Ganti level table dan gabungkan

In [12]:
import json

# Daftar path file dataset yang akan diproses
dataset_paths = [
    "dataset/table_ori/insane1.json",
    "dataset/table_ori/insane2.json",
    "dataset/table_ori/satellite.json",
    "dataset/table_ori/stella.json"
]

# Dictionary untuk menyimpan data gabungan dengan MD5 sebagai key
combined_data = {}

def transform_level(dataset_name, level):
    ###### Fungsi untuk transformasi level sesuai kriteria khusus tiap dataset
    # Konversi level ke integer jika berupa digit
    if level.isdigit():
        level = int(level)
    else:
        level = 0  # Default untuk nilai non-numerik
    
    # Logika transformasi berdasarkan nama dataset
    if dataset_name == "insane1.json":
        return level + 11
    elif dataset_name == "insane2.json":
        # Handle kasus khusus level 0- dan 0
        if level == "0-":
            return 11.5
        elif level == "0":
            return 11.8
        else:
            return int(level) + 11
    elif dataset_name == "satellite.json":
        # Mapping level menggunakan array khusus
        return [0.5,1.5,3,4.5,6.5,8.5,10.5,12,13.5,15.5,16.5,17.5,19][level] + 11
    elif dataset_name == "stella.json":
        # Mapping level dengan array berbeda
        return [19.5,21,22,22.5,23.5,24,24.25,24.5,24.75,25,25.5,26,27,27.5][level] + 11

# Proses file dataset secara terbalik (prioritas dataset terakhir)
for dataset_path in reversed(dataset_paths):
    # Baca file JSON
    with open(dataset_path, "r", encoding="utf-8") as file:
        dataset = json.load(file)
    
    # Proses setiap item dalam dataset
    for item in dataset:
        md5 = item["md5"]
        level = item["level"]
        
        # Transformasi level dan update nilai
        item["level"] = transform_level(dataset_path.split("/")[-1], level)
        
        # Update data dengan MD5 sebagai key (data terakhir menimpa yang lama)
        combined_data[md5] = item

# Konversi dictionary ke list untuk output
combined_data_list = list(combined_data.values())

# Simpan data gabungan ke file JSON
with open("dataset/combined_dataset.json", "w", encoding="utf-8") as outfile:
    json.dump(combined_data_list, outfile, ensure_ascii=False, indent=2)

## Dapatkan player score

In [None]:
import json
import os
import requests
import time
from xml.etree import ElementTree as ET

def fetch_scores_for_md5(md5, cache_dir):
    ### Mengambil data skor dari cache atau API eksternal dengan mekanisme fallback
    # Membentuk path file cache berdasarkan MD5
    cache_file = os.path.join(cache_dir, f'{md5}.json')

    try:
        # Coba baca dari cache
        with open(cache_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Validasi format cache
        if not isinstance(data, list):
            raise ValueError(f'Invalid cache for {md5}')

        return data
    except (FileNotFoundError, ValueError):
        # Fallback ke API jika cache tidak ada/invalid
        response = requests.get(f'http://dream-pro.info/~lavalse/LR2IR/2/getrankingxml.cgi?songmd5={md5}&id=1')
        response_text = response.text

        # Ekstrak bagian XML dari response
        xml_start = response_text.find('<?xml')
        xml_end = response_text.rfind('</ranking>') + len('</ranking>')
        if xml_start != -1 and xml_end != -1:
            xml_text = response_text[xml_start:xml_end]
        else:
            raise ValueError(f'Invalid XML response for {md5}')

        # Parsing data XML
        xml_root = ET.fromstring(xml_text)
        data = []

        # Ekstrak data skor dari XML
        for score_elem in xml_root.findall('.//score'):
            score_data = {
                'name': score_elem.find('name').text,
                'id': int(score_elem.find('id').text),
                'clear': int(score_elem.find('clear').text),
                'notes': int(score_elem.find('notes').text),
                'combo': int(score_elem.find('combo').text),
                'pg': int(score_elem.find('pg').text),
                'gr': int(score_elem.find('gr').text),
                'minbp': int(score_elem.find('minbp').text),
            }
            data.append(score_data)

        # Konversi nama ke string
        for d in data:
            d['name'] = str(d['name'])

        # Simpan ke cache untuk penggunaan berikutnya
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent='\t')

        return data

def get_scores_for_md5(md5, cache_dir):
    ### Wrapper dengan mekanisme retry untuk handle throttling
    tries = 0

    # Maksimal 3 kali percobaan
    while tries < 3:
        try:
            data = fetch_scores_for_md5(md5, cache_dir)
            return data
        except Exception as err:
            tries += 1
            # Kalkulasi waktu tunggu eksponensial
            sleep_time = (1000 * tries + (0.5 - 1) * 1000) * 2
            print(f'Got throttled ({md5}): Sleeping for {sleep_time:.0f}ms. {err}')
            time.sleep(sleep_time)

    raise Exception(f"Couldn't fetch data in 3 tries. Giving up and exiting.")

# Konfigurasi direktori cache
cache_directory = "dataset/lr2ir" 

# Persiapan daftar MD5 dari dataset gabungan
md5_values = []

# Baca file dataset kombinasi
json_files = ["dataset/combined_dataset.json"]
for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Kumpulkan semua nilai MD5
        for item in data:
            md5_values.append(item['md5'])

# Proses setiap MD5 untuk mendapatkan skor
for md5 in md5_values:
    scores = get_scores_for_md5(md5, cache_directory)
    print(scores)


## Filter dataset dengan kondisi clear > 0 and minbp < notes * 0.2
0 = no play  
1 = failed  
2 = easy clear  
3 = normal clear  
4 = hard clear  
5 = full combo  

In [13]:
import json
import os

# Membaca dataset gabungan sebagai referensi
with open("dataset/combined_dataset.json", "r", encoding="utf-8") as combined_file:
    combined_data = json.load(combined_file)

def filter_and_save_dataset(md5):
    ### Memfilter dan menyimpan dataset berdasarkan kriteria performa bermain
    # Path file dataset asli dan hasil filter
    dataset_path = f"dataset/lr2ir/{md5}.json"
    
    # Cek keberadaan file dataset
    if os.path.exists(dataset_path):
        # Baca dataset skor mentah
        with open(dataset_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)
        
        # Filter data: clear > 0 dan minbp < 20% notes
        filtered_dataset = [item for item in dataset 
                           if item["clear"] > 0 
                           and item["minbp"] < item["notes"] * 0.2]
        
        # Simpan dataset terfilter
        filtered_path = f"dataset/filtered_lr2ir/{md5}.json"
        with open(filtered_path, "w", encoding="utf-8") as outfile:
            json.dump(filtered_dataset, outfile, ensure_ascii=False, indent=2)
    else:
        # Handle kasus file tidak ditemukan
        print(f"Dataset not found for MD5: {md5}")

# Proses filter untuk semua entry dalam combined_data
for item in combined_data:
    md5 = item["md5"]
    filter_and_save_dataset(md5)


## Akuisisi Data

In [15]:
import json
import os

# Membaca dataset gabungan sebagai referensi utama
with open("dataset/combined_dataset.json", "r", encoding="utf-8") as combined_file:
    combined_data = json.load(combined_file)

def process_subject_responses(md5):
    ### Membuat dataset mudah dengan kriteria clear >= 2 (easy clear)
    # Struktur dasar untuk menyimpan respons pemain
    response_dict = {"subject_id": md5, "responses": {}}
    
    # Path dataset yang sudah difilter
    dataset_path = f"dataset/filtered_lr2ir/{md5}.json"
    
    if os.path.exists(dataset_path):
        with open(dataset_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        # Pemrosesan setiap skor pemain
        for item in dataset:
            item_id = str(item["id"])
            clear = item["clear"]
            # Kriteria respon: 1 jika clear status >= 2
            response = 1 if clear >= 2 else 0
            response_dict["responses"][item_id] = response
            
        # Menyimpan ke format JSON Lines dengan append mode
        with open("dataset/easy_dataset.jsonlines", "a", encoding="utf-8") as outfile:
            json.dump(response_dict, outfile, ensure_ascii=False)
            outfile.write("\n")
    else:
        print(f"Filtered dataset not found for MD5: {md5}")

# Eksekusi untuk semua MD5 dalam dataset gabungan
for item in combined_data:
    md5 = item["md5"]
    process_subject_responses(md5)

In [1]:
import json
import os

# Membaca dataset gabungan sebagai referensi utama
with open("dataset/combined_dataset.json", "r", encoding="utf-8") as combined_file:
    combined_data = json.load(combined_file)

def process_subject_responses(md5):
    ### Membuat dataset mudah dengan kriteria clear >= 4 (hard clear)
    # Struktur dasar untuk menyimpan respons pemain
    response_dict = {"subject_id": md5, "responses": {}}

    dataset_path = f"dataset/filtered_lr2ir/{md5}.json"
    if os.path.exists(dataset_path):
        with open(dataset_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        for item in dataset:
            item_id = str(item["id"])
            clear = item["clear"]
            # Kriteria respon: 1 jika clear status >= 4
            response = 1 if clear >= 4 else 0
            response_dict["responses"][item_id] = response

        # Menyimpan ke format JSON Lines dengan append mode
        with open("dataset/hard_dataset.jsonlines", "a", encoding="utf-8") as outfile:
            json.dump(response_dict, outfile, ensure_ascii=False)
            outfile.write("\n")
    else:
        print(f"Filtered dataset not found for MD5: {md5}")

# Eksekusi untuk semua MD5 dalam dataset gabungan
for item in combined_data:
    md5 = item["md5"]
    process_subject_responses(md5)


# Run py-irt

## train dan evaluasi data
By default this will train a model with 90% of the provided data and evaluate with the remaining 10% 

In [2]:
!poetry run py-irt train-and-evaluate --help

                                                                               
 Usage: py-irt.cmd train-and-evaluate [OPTIONS] MODEL_TYPE DATA_PATH           
 OUTPUT_DIR                                                                    
                                                                               
┌─ Arguments ─────────────────────────────────────────────────────────────────┐
│ *    model_type      TEXT  [default: None] [required]                       │
│ *    data_path       TEXT  [default: None] [required]                       │
│ *    output_dir      TEXT  [default: None] [required]                       │
└─────────────────────────────────────────────────────────────────────────────┘
┌─ Options ───────────────────────────────────────────────────────────────────┐
│ --epochs              INTEGER  [default: 2000]                              │
│ --priors              TEXT     [default: None]                              │
│ --dims                INTEGER  [defaul

In [14]:
!poetry run py-irt evaluate --help

Usage: py-irt.cmd evaluate [OPTIONS] MODEL_TYPE PARAMETER_PATH TEST_PAIRS_PATH
                           OUTPUT_DIR

Arguments:
  MODEL_TYPE       [required]
  PARAMETER_PATH   [required]
  TEST_PAIRS_PATH  [required]
  OUTPUT_DIR       [required]

Options:
  --epochs INTEGER     [default: 2000]
  --device TEXT        [default: cpu]
  --initializers TEXT
  --evaluation TEXT    [default: heldout]
  --seed INTEGER       [default: 42]
  --train-size FLOAT   [default: 0.9]
  --help               Show this message and exit.


In [1]:
!poetry run py-irt train-and-evaluate 2pl dataset/easy_dataset.jsonlines 2pl/easyx/

[07:45:41] config: model_type='2pl' epochs=2000 priors=None          cli.py:176
           initializers=[] dims=None lr=0.1 lr_decay=0.9999                    
           dropout=0.5 hidden=100 vocab_size=None log_every=100                
           seed=None deterministic=False                                       
           data_path: dataset/easy_dataset.jsonlines                 cli.py:178
           output directory: 2pl/easyx/                              cli.py:179
[07:45:55] amortized: False                                      dataset.py:112
[07:46:35] Vocab size: None                                      training.py:90
[07:46:37] Training Model...                                         cli.py:209
[07:46:37] args: {'device': 'cpu', 'num_items': 52766,          training.py:134
           'num_subjects': 5230}                                               
           Parsed Model Args: {'device': 'cpu', 'num_items':    training.py:147
           52766, 'num_subjects': 5230, 

In [3]:
!poetry run py-irt train-and-evaluate 2pl dataset/hard_dataset.jsonlines 2pl/hardx/

[09:07:21] config: model_type='2pl' epochs=2000 priors=None          cli.py:176
           initializers=[] dims=None lr=0.1 lr_decay=0.9999                    
           dropout=0.5 hidden=100 vocab_size=None log_every=100                
           seed=None deterministic=False                                       
           data_path: dataset/hard_dataset.jsonlines                 cli.py:178
           output directory: 2pl/hardx/                              cli.py:179
[09:07:36] amortized: False                                      dataset.py:112
[09:08:15] Vocab size: None                                      training.py:90
[09:08:18] Training Model...                                         cli.py:209
[09:08:18] args: {'device': 'cpu', 'num_items': 52766,          training.py:134
           'num_subjects': 5230}                                               
           Parsed Model Args: {'device': 'cpu', 'num_items':    training.py:147
           52766, 'num_subjects': 5230, 

In [4]:
import json

with open('2pl/easyx/best_parameters.json', 'r') as file:
    data = json.load(file)

# Count the number of items in each dictionary
num_ability_items = len(data["ability"])
num_diff_items = len(data["diff"])
num_disc_items = len(data["disc"])
num_item_ids = len(data["item_ids"])
num_subject_ids = len(data["subject_ids"])

# Print the counts
print(f"Number of items in 'ability': {num_ability_items}")
print(f"Number of items in 'diff': {num_diff_items}")
print(f"Number of items in 'disc': {num_disc_items}")
print(f"Number of items in 'item_ids': {num_item_ids}")
print(f"Number of items in 'subject_ids': {num_subject_ids}")

Number of items in 'ability': 5230
Number of items in 'diff': 52766
Number of items in 'disc': 52766
Number of items in 'item_ids': 52766
Number of items in 'subject_ids': 5230


In [5]:
import json

with open('2pl/hardx/best_parameters.json', 'r') as file:
    data = json.load(file)

# Count the number of items in each dictionary
num_ability_items = len(data["ability"])
num_diff_items = len(data["diff"])
num_disc_items = len(data["disc"])
num_item_ids = len(data["item_ids"])
num_subject_ids = len(data["subject_ids"])

# Print the counts
print(f"Number of items in 'ability': {num_ability_items}")
print(f"Number of items in 'diff': {num_diff_items}")
print(f"Number of items in 'disc': {num_disc_items}")
print(f"Number of items in 'item_ids': {num_item_ids}")
print(f"Number of items in 'subject_ids': {num_subject_ids}")

Number of items in 'ability': 5230
Number of items in 'diff': 52766
Number of items in 'disc': 52766
Number of items in 'item_ids': 52766
Number of items in 'subject_ids': 5230


## Output hasil

### Easy

In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

# Membaca data chart asli sebagai referensi metadata
with open('dataset/combined_dataset.json', 'r', encoding="utf-8") as chart_file:
    combined_data = json.load(chart_file)

# Membaca parameter kemampuan pemain dari model 2PL
best_parameters = []
with open('2pl/easyx/best_parameters.json', 'r', encoding="utf-8") as parameters_file:
    for line in parameters_file:
        params = json.loads(line)
        abilities = params['ability']
        # Mapping subject_id dengan kemampuan
        for subject_id, ability in zip(params['subject_ids'].values(), abilities):
            best_parameters.append({'subject_id': subject_id, 'ability': ability})

# Membaca prediksi model dari file JSON Lines
model_predictions = []
with open('2pl/easyx/model_predictions.jsonlines', 'r', encoding="utf-8") as predictions_file:
    for line in predictions_file:
        model_predictions.append(json.loads(line))

# Konversi ke DataFrame pandas untuk pengolahan data
combined_df = pd.DataFrame(combined_data)
parameters_df = pd.DataFrame(best_parameters)
predictions_df = pd.DataFrame(model_predictions)

# Membersihkan data prediksi yang tidak valid
predictions_df.dropna(subset=['prediction'], inplace=True)

# Agregasi statistik performa per chart
summary_df = predictions_df.groupby('subject_id').agg({
    'response': lambda x: (x == 1).sum(),  # Hitung clear success
    'prediction': ['mean', 'count']  # Rata-rata prediksi dan jumlah player
}).reset_index()
# Flatten multi-index column
summary_df.columns = ['subject_id', 'clear', 'average_prediction', 'playercount']

# Konversi tipe data untuk konsistensi
summary_df['subject_id'] = summary_df['subject_id'].astype(str)

# Gabungkan semua data menjadi DataFrame akhir
result_df = pd.DataFrame({
    'md5': combined_df['md5'],  # Identifier unik chart
    'title': combined_df['title'],  # Judul chart
    'base_level': combined_df['level'],  # Kesulitan dasar
    'prediction': summary_df['average_prediction'],  # Probabilitas clear rata-rata
    'ability': parameters_df['ability'],  # Parameter kemampuan dari model IRT
    'player_count': summary_df['playercount'],  # Jumlah pemain yang mencoba
    'clear_count': summary_df['clear']  # Jumlah clear sukses
})
# Bersihkan baris dengan data tidak lengkap
result_df.dropna(inplace=True)

def calculate_est_level(row):
    ### Menghitung level estimasi berdasarkan distribusi prediksi per base level
    # Ekstrak nilai dasar dan prediksi dari row
    base_level = row['base_level']
    prediction = row['prediction']
    
    # Hitung batas zona mid (percentil 33.33 dan 66.67) untuk base level yang sama
    mid_zone_start = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 33.33)
    mid_zone_end = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 66.67)

    if prediction <= mid_zone_start:
        return base_level + (mid_zone_start - prediction)  # Jika prediksi di bawah batas zona bawah
    elif mid_zone_start < prediction <= mid_zone_end:
        return base_level  # Jika prediksi berada di zona tengah
    else:
        return base_level - (prediction - mid_zone_end)  # Jika prediksi di atas batas zona atas

# Terapkan fungsi ke setiap baris dan tambahkan kolom baru
result_df['est_level'] = result_df.apply(calculate_est_level, axis=1)
# Urutkan DataFrame berdasarkan level estimasi
result_df.sort_values(by='est_level', ascending=True, inplace=True)

# Menghitung Mean Absolute Error (MAE) antara level aktual dan prediksi
mae_est_level = mean_absolute_error(result_df['base_level'], result_df['est_level'])
# Menampilkan hasil perhitungan MAE
print("MAE for est_level:", mae_est_level)

# Output file ke csv
result_df.to_csv('output/easy_output_final.csv', index=False)

print(result_df)

MAE for est_level: 0.03363296658688851
                                   md5  \
4257  bed8c515b04d9fa79d19d51a57890a28   
4258  508330b4bc4513536aea7945d90909e2   
4256  975edffb868ef4d1cde4b845022ea316   
1475  b341e4cc6f0f6a59100910ed11a92f8d   
2965  03c906bf1229a701a0471165242e1233   
...                                ...   
1386  fed59d3b700499c9207c4995d160cc9a   
1381  8fdd4ffb45d79ef2e713a01abc9a08c3   
1383  e01344b7d64159e7aa28b29d32ea6351   
1382  bb843baa1332cdacd049f1e142c79045   
1384  f880de58c2dcb3ea4e3d9dc39b096080   

                                                  title  base_level  \
4257                  さいこ゛のたたかい /たひ゛のおわり [7Key Another]        11.0   
4258                                            マジカル縦連打        11.0   
4256                            Thrill Trigger EX2 YPER        11.0   
1475                               Lieselotte [ANOTHER]        11.0   
2965                                Pure Ruby [Another]        11.0   
...                           

In [None]:
import pandas as pd
from tabulate import tabulate

# Membaca file CSV hasil pengolahan data final
df = pd.read_csv('output/easy_output_final.csv')  # Memuat data hasil perhitungan ke DataFrame

# Mengkonversi DataFrame ke format HTML menggunakan tabulate
html_table = tabulate(df, headers='keys', tablefmt='html')  # Generate tabel HTML dasar

# Membuat template HTML dengan styling CSS
styled_html_table = f'''
<!DOCTYPE html>
<html>
<head>
    <title>Easy Clear Estimation</title>
    <style>
        table {{border-collapse: collapse; border: 2px solid black;}}  /* Garis tepi tebal untuk tabel */
        th, td {{border: 1px solid black; padding: 10px;}}  /* Padding dan border untuk sel */
        tr:nth-child(even) {{background-color: #ff9393;}}  /* Warna latar bergantian untuk baris */
    </style>
</head>
<body>
    <h1>Easy Clear Estimation</h1>
    {html_table}  <!-- Menyisipkan tabel yang telah digenerate -->
</body>
</html>
'''

# Menyimpan hasil akhir ke file HTML
with open('output/easy_output_final.html', 'w', encoding='utf-8') as f:
    f.write(styled_html_table)  # Tulis konten HTML ke file dengan encoding UTF-8

### Hard

In [5]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

# Membaca data chart asli sebagai referensi metadata
with open('dataset/combined_dataset.json', 'r', encoding="utf-8") as chart_file:
    combined_data = json.load(chart_file)
    
# Membaca parameter kemampuan pemain dari model 2PL
best_parameters = []
with open('2pl/hardx/best_parameters.json', 'r', encoding="utf-8") as parameters_file:
    for line in parameters_file:
        params = json.loads(line)
        abilities = params['ability']
        # Mapping subject_id dengan kemampuan
        for subject_id, ability in zip(params['subject_ids'].values(), abilities):
            best_parameters.append({'subject_id': subject_id, 'ability': ability})

# Membaca prediksi model dari file JSON Lines            
model_predictions = []
with open('2pl/hardx/model_predictions.jsonlines', 'r', encoding="utf-8") as predictions_file:
    for line in predictions_file:
        model_predictions.append(json.loads(line))

# Konversi ke DataFrame pandas untuk pengolahan data
combined_df = pd.DataFrame(combined_data)
parameters_df = pd.DataFrame(best_parameters)
predictions_df = pd.DataFrame(model_predictions)

# Membersihkan data prediksi yang tidak valid
predictions_df.dropna(subset=['prediction'], inplace=True)

# Agregasi statistik performa per chart
summary_df = predictions_df.groupby('subject_id').agg({
    'response': lambda x: (x == 1).sum(),  # Hitung clear success
    'prediction': ['mean', 'count']  # Rata-rata prediksi dan jumlah player
}).reset_index()
# Flatten multi-index column
summary_df.columns = ['subject_id', 'clear', 'average_prediction', 'playercount']

# Konversi tipe data untuk konsistensi
summary_df['subject_id'] = summary_df['subject_id'].astype(str)

# Gabungkan semua data menjadi DataFrame akhir
result_df = pd.DataFrame({
    'md5': combined_df['md5'],  # Identifier unik chart
    'title': combined_df['title'],  # Judul chart
    'base_level': combined_df['level'],  # Kesulitan dasar
    'prediction': summary_df['average_prediction'],  # Probabilitas clear rata-rata
    'ability': parameters_df['ability'],  # Parameter kemampuan dari model IRT
    'player_count': summary_df['playercount'],  # Jumlah pemain yang mencoba
    'clear_count': summary_df['clear']  # Jumlah clear sukses
})
# Bersihkan baris dengan data tidak lengkap
result_df.dropna(inplace=True)

def calculate_est_level(row):
    ### Menghitung level estimasi berdasarkan distribusi prediksi per base level
    # Ekstrak nilai dasar dan prediksi dari row
    base_level = row['base_level']
    prediction = row['prediction']
    
    # Hitung batas zona mid (percentil 33.33 dan 66.67) untuk base level yang sama
    mid_zone_start = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 33.33)
    mid_zone_end = np.percentile(result_df[result_df['base_level'] == base_level]['prediction'], 66.67)

    if prediction <= mid_zone_start:
        return base_level + (mid_zone_start - prediction)  # Jika prediksi di bawah batas zona bawah
    elif mid_zone_start < prediction <= mid_zone_end:
        return base_level  # Jika prediksi berada di zona tengah
    else:
        return base_level - (prediction - mid_zone_end)  # Jika prediksi di atas batas zona atas

# Terapkan fungsi ke setiap baris dan tambahkan kolom baru
result_df['est_level'] = result_df.apply(calculate_est_level, axis=1)
# Urutkan DataFrame berdasarkan level estimasi
result_df.sort_values(by='est_level', ascending=True, inplace=True)

# Menghitung Mean Absolute Error (MAE) antara level aktual dan prediksi
mae_est_level = mean_absolute_error(result_df['base_level'], result_df['est_level'])
# Menampilkan hasil perhitungan MAE
print("MAE for est_level:", mae_est_level)

# Output file ke csv
result_df.to_csv('output/hard_output_final.csv', index=False)

print(result_df)

MAE for est_level: 0.039402934422971686
                                   md5  \
2965  03c906bf1229a701a0471165242e1233   
2961  a23ea067c01254f860a55171dbee4f89   
2971  8bdc1d1fe654d75136173211f124ad70   
4257  bed8c515b04d9fa79d19d51a57890a28   
4258  508330b4bc4513536aea7945d90909e2   
...                                ...   
1386  fed59d3b700499c9207c4995d160cc9a   
1383  e01344b7d64159e7aa28b29d32ea6351   
1381  8fdd4ffb45d79ef2e713a01abc9a08c3   
1382  bb843baa1332cdacd049f1e142c79045   
1384  f880de58c2dcb3ea4e3d9dc39b096080   

                                                  title  base_level  \
2965                                Pure Ruby [Another]        11.0   
2961                          Master of GENOCIDE [発狂入門]        11.0   
2971                                 コスモワンダラー [SAETHER]        11.0   
4257                  さいこ゛のたたかい /たひ゛のおわり [7Key Another]        11.0   
4258                                            マジカル縦連打        11.0   
...                          

In [6]:
import pandas as pd
from tabulate import tabulate

# Membaca file CSV hasil pengolahan data final
df = pd.read_csv('output/hard_output_final.csv')  # Memuat data hasil perhitungan ke DataFrame

# Mengkonversi DataFrame ke format HTML menggunakan tabulate
html_table = tabulate(df, headers='keys', tablefmt='html')  # Generate tabel HTML dasar

# Membuat template HTML dengan styling CSS
styled_html_table = f'''
<!DOCTYPE html>
<html>
<head>
    <title>Hard Clear Estimation</title>
    <style>
        table {{border-collapse: collapse; border: 2px solid black;}}  /* Garis tepi tebal untuk tabel */
        th, td {{border: 1px solid black; padding: 10px;}}  /* Padding dan border untuk sel */
        tr:nth-child(even) {{background-color: #ff9393;}}  /* Warna latar bergantian untuk baris */
    </style>
</head>
<body>
    <h1>Hard Clear Estimation</h1>
    {html_table}  <!-- Menyisipkan tabel yang telah digenerate -->
</body>
</html>
'''

# Menyimpan hasil akhir ke file HTML
with open('output/hard_output_final.html', 'w', encoding='utf-8') as f:
    f.write(styled_html_table)  # Tulis konten HTML ke file dengan encoding UTF-8

---