In [1]:
import json
import pickle
import pandas as pd
import numpy as np
import random

In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [3]:
import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers.keras_callbacks import KerasMetricCallback
import evaluate
from datasets import load_dataset, load_metric, list_metrics
from transformers import create_optimizer
from transformers import create_optimizer, TFAutoModelForSequenceClassification, DistilBertTokenizer
from transformers import DataCollatorWithPadding, TFDistilBertForSequenceClassification
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
def open_pickle(pickle_path):
    # Open the pickle file
    with open(pickle_path, 'rb') as f:
        pickle_dict = pickle.load(f)

    return pickle_dict

In [5]:
def create_model(num_classes, emb_table_size):
    # Load finetuned language model
    model_name = "bert-base-multilingual-cased"
    task = "openalex-topic-classification-title-abstract"
    language_model_name = f"OpenAlex/{model_name}-finetuned-{task}"
    language_model = TFAutoModelForSequenceClassification.from_pretrained(language_model_name, 
                                                                          output_hidden_states=True)
    language_model.trainable = False


    # Inputs
    ids = tf.keras.layers.Input((512,), dtype=tf.int64, name='ids')
    mask = tf.keras.layers.Input((512,), dtype=tf.int64, name='mask')
    citation_0 = tf.keras.layers.Input((16,), dtype=tf.int64, name='citation_0')
    citation_1 = tf.keras.layers.Input((128,), dtype=tf.int64, name='citation_1')
    journal = tf.keras.layers.Input((384,), dtype=tf.float32, name='journal_emb')
    
    language_model_output = language_model(input_ids=ids, attention_mask=mask).hidden_states[-1]
    pooled_language_model_output = tf.keras.layers.GlobalAveragePooling1D()(language_model_output)
    
    citation_emb_layer = tf.keras.layers.Embedding(input_dim=emb_table_size, output_dim=256, mask_zero=True, 
                                                   trainable=True, name='citation_emb_layer')

    citation_0_emb = citation_emb_layer(citation_0)
    citation_1_emb = citation_emb_layer(citation_1)

    pooled_citation_0 = tf.keras.layers.GlobalAveragePooling1D()(citation_0_emb)
    pooled_citation_1 = tf.keras.layers.GlobalAveragePooling1D()(citation_1_emb)

    concat_data = tf.keras.layers.Concatenate(name='concat_data', axis=-1)([pooled_language_model_output, 
                                                                            pooled_citation_0, 
                                                                            pooled_citation_1, journal])

    # Dense layer 1
    dense_output = tf.keras.layers.Dense(2048, activation='relu', 
                                         kernel_regularizer='L2', name="dense_1")(concat_data)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_1")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_1")(dense_output)
    
    # Dense layer 2
    dense_output = tf.keras.layers.Dense(1024, activation='relu', 
                                         kernel_regularizer='L2', name="dense_2")(dense_output)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_2")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_2")(dense_output)

    # Dense layer 3
    dense_output = tf.keras.layers.Dense(512, activation='relu', 
                                         kernel_regularizer='L2', name="dense_3")(dense_output)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_3")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_3")(dense_output)

    class_prior = 1/len(target_vocab)
    last_layer_weight_init = tf.keras.initializers.Constant(class_prior)
    last_layer_bias_init = tf.keras.initializers.Constant(-np.log((1-class_prior)/class_prior))
    
    output_layer = tf.keras.layers.Dense(num_classes, kernel_initializer=last_layer_weight_init,
                                         bias_initializer=last_layer_bias_init,
                                         activation='sigmoid', name='output_layer')(dense_output)
    model = tf.keras.Model(inputs=[ids, mask, citation_0, citation_1, journal], outputs=output_layer)

    loss_fn = tf.keras.losses.CategoricalFocalCrossentropy()

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.AdamW(), 
                  loss=loss_fn,
                  metrics=[tf.keras.metrics.CategoricalAccuracy(), 
                           tf.keras.metrics.TopKCategoricalAccuracy(k=2, name='top_2_categorical_accuracy'),
                           tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top_5_categorical_accuracy'),
                           tf.keras.metrics.TopKCategoricalAccuracy(k=10, name='top_10_categorical_accuracy')])

    return model

#### Loading model artifacts and test dataset

In [6]:
target_vocab = open_pickle('./model_artifacts/target_vocab.pkl')
inv_target_vocab = open_pickle('./model_artifacts/inv_target_vocab.pkl')
citation_feature_vocab = open_pickle('./model_artifacts/citation_feature_vocab.pkl')
inv_citation_feature_vocab = open_pickle('./model_artifacts/inv_citation_feature_vocab.pkl')
gold_to_id_mapping_dict = open_pickle('./model_artifacts/gold_to_id_mapping_dict.pkl')

In [7]:
# Using the HuggingFace library to load the dataset
all_dataset = load_dataset("parquet", data_files={'test': ["./model_artifacts/test.parquet"]}) \
    .rename_column("level_0_citation", "citation_0") \
    .rename_column("level_1_citation", "citation_1") \
    .rename_column("input_ids", "ids") \
    .rename_column("attention_mask", "mask")

#### Creating model and loading last checkpoint from training

In [None]:
pred_model = create_model(len(target_vocab), len(citation_feature_vocab)+2)

pred_model.load_weights('./model_checkpoints/08-0.909-0.5003.keras')

pred_model.trainable = False

tf_test_dataset = all_dataset['test'].to_tf_dataset(
    columns=['paper_id','ids','mask','citation_0','citation_1','journal_emb'],
    label_cols=["label"],
    batch_size=batch_size,
    shuffle=False
)

#### Getting model predictions and saving file

In [None]:
preds = pred_model.predict(tf_test_dataset)

In [24]:
scores = np.sort(preds, -1)[:, -10:]
preds = np.argsort(preds, -1)[:, -10:]

In [25]:
test = pd.read_parquet("./model_artifacts/test.parquet")

In [26]:
test['preds'] = preds.tolist()
test['scores'] = scores.tolist()

test['preds'] = test['preds'].apply(lambda x: x[::-1])
test['scores'] = test['scores'].apply(lambda x: x[::-1])

In [28]:
test.to_parquet("./model_artifacts/test_set_preds.parquet")

#### Generating random samples of the output to explore

In [None]:
import random

In [29]:
test_set_preds = pd.read_parquet("./model_artifacts/test_set_preds.parquet")

In [32]:
rows_to_test = random.sample(list(range(test_set_preds.shape[0])), 30)

for row_to_test in rows_to_test:

    paper_id = int(test_set_preds.iloc[row_to_test, :]['paper_id'])
    print(f"OpenAlex Work ID: {paper_id}")
    row_data = test_set_preds[test_set_preds['paper_id']==paper_id].head(1)
    print()
    print(f"{row_data['title_abstract'].tolist()[0]}")
    # print("")
    # print(f"ABSTRACT: {row_data['abstract_processed'].tolist()[0]}")
    
    
    print("")
    
    label = row_data['full_label'].tolist()[0]
    
    print(f"LABEL: {label}")
    print("")
    
    print("MODEL PREDS:")
    
    top_10 = row_data['preds'].tolist()[0]
    top_10_scores = row_data['scores'].tolist()[0]
    
    for i, (pred, score)  in enumerate(zip(top_10, top_10_scores)):
        print(f"___________ {i+1}: {inv_target_vocab[pred]} ({round(score, 3)})")
    print("")
    print("------------------------------------------------------------------")
    print("")

OpenAlex Work ID: 1590985406

<TITLE> Bioinformatics basics: applications in biological science and medicine
<ABSTRACT> Contents BIOLOGY AND INFORMATION Bioinformatics-A Rapidly Maturing Science Computers in Biology and Medicine The Virtual Doctor Biological Macromolecules as Information Carriers Proteins: From Sequence to Structure to Function DNA and RNA Structure DNA Cloning and Sequencing Genes, Taxonomy, and Evolution BIOLOGICAL DATABASES Biological Database Organization Public Databases Database Mining Tools GENOME ANALYSIS The Genomic Organization of Genes Comparative Genomics Functional Genomics Microarray and Bioarray Technology Genomes as Gene Networks PROTEOME ANALYSIS Proteomics Hydrodynamic Methods Predictive Biology Systems Biology THE BIOINFORMATICS REVOLUTION IN MEDICINE Genes and Diseases Appendix A Glossary of Biological Terms Appendix B Bioinformatics Web Sites Index Includes contributions from: Borries Demeler, PhD, The University of Texas Health Science Center, San

### Testing only the language model (HuggingFace pipeline)

In [2]:
def merge_title_and_abstract(title, abstract):
    if isinstance(title, str):
        if isinstance(abstract, str):
            return f"<TITLE> {title}\n<ABSTRACT> {abstract}"
        else:
            return f"<TITLE> {title}"
    else:
        if isinstance(abstract, str):
            return f"<TITLE> NONE\n<ABSTRACT> {abstract}"
        else:
            return ""

In [1]:
data_to_score = {'work_id': 3106188259, 
 'original_title': "From Louvain to Leiden: guaranteeing well-connected communities",
 'abstract': "Community detection is often used to understand the structure of large and complex networks. One of the most popular algorithms for uncovering community structure is the so-called Louvain algorithm. We show that this algorithm has a major defect that largely went unnoticed until now: the Louvain algorithm may yield arbitrarily badly connected communities. In the worst case, communities may even be disconnected, especially when running the algorithm iteratively. In our experimental analysis, we observe that up to 25% of the communities are badly connected and up to 16% are disconnected. To address this problem, we introduce the Leiden algorithm. We prove that the Leiden algorithm yields communities that are guaranteed to be connected. In addition, we prove that, when the Leiden algorithm is applied iteratively, it converges to a partition in which all subsets of all communities are locally optimally assigned. Furthermore, by relying on a fast local move approach, the Leiden algorithm runs faster than the Louvain algorithm. We demonstrate the performance of the Leiden algorithm for several benchmark and real-world networks. We find that the Leiden algorithm is faster than the Louvain algorithm and uncovers better partitions, in addition to providing explicit guarantees."}

In [2]:
model_name = "bert-base-multilingual-cased"
task = "openalex-topic-classification-title-abstract"
language_model_name = f"OpenAlex/{model_name}-finetuned-{task}"

In [None]:
from transformers import pipeline

classifier = pipeline(model=language_model_name, top_k=10)

In [7]:
classifier(merge_title_and_abstract(data_to_score['original_title'], data_to_score['abstract']))

[[{'label': '64: Statistical Mechanics of Complex Networks',
   'score': 0.9007338285446167},
  {'label': '1704: Crowdsourcing for Research and Data Collection',
   'score': 0.03273259475827217},
  {'label': '1896: Delay-Tolerant Networking in Mobile Ad Hoc Networks',
   'score': 0.006536854896694422},
  {'label': '637: Advancements in Data Clustering Techniques and Algorithms',
   'score': 0.00574135035276413},
  {'label': '2292: Graph Matching and Analysis Techniques',
   'score': 0.0042234864085912704},
  {'label': '799: Information Visualization and Visual Data Mining',
   'score': 0.0034829594660550356},
  {'label': '1273: Graph Neural Network Models and Applications',
   'score': 0.003296172246336937},
  {'label': '1106: Trajectory Data Mining and Analysis',
   'score': 0.002930482616648078},
  {'label': '1980: Understanding Human Mobility Patterns',
   'score': 0.002405719831585884},
  {'label': '1502: Humanitarian Logistics and Disaster Relief Operations Management',
   'score'

### General Stats on Results

In [10]:
within_clust_per_df = pd.read_parquet("{gold_citations_file_location}")

In [12]:
test_set_preds = pd.read_parquet("./model_artifacts/test_set_preds.parquet")

In [13]:
test_set_preds['micro_cluster_id'] = test_set_preds['full_label'].apply(lambda x: int(x.split(':')[0]))

In [14]:
all_test_data = test_set_preds.drop_duplicates(subset=['paper_id']) \
    .merge(within_clust_per_df.groupby('micro_cluster_id')['within_clust_per'].mean().reset_index(), how='left', on='micro_cluster_id')

In [15]:
all_test_data.shape

(113025, 15)

In [16]:
all_test_data.sample(20)

Unnamed: 0,paper_id,full_label,title_abstract,level_0_citation,level_1_citation,label,input_ids,attention_mask,journal_id,journal_emb,row_num,preds,scores,micro_cluster_id,within_clust_per
67996,4288987729,1642: Standards and Guidelines for Genetic Var...,<TITLE> Genomic testing in premature ovarian i...,"[4557, 4557, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3984, 2040, 3678, 4557, 4557, 663, 2419, 2223...",4481,"[101, 133, 157, 37611, 51036, 135, 88114, 1113...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",130598054,"[-0.09029929339885712, 0.03826557844877243, -0...",995,"[3948, 2983, 664, 3134, 3880, 1728, 1369, 1988...","[0.9963721036911011, 0.9963556528091431, 0.987...",1642,0.236087
12833,1972128290,4219: Local Economic Development in South Africa,<TITLE> Corporate social responsibility Delive...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1159,"[101, 133, 157, 37611, 51036, 135, 70168, 1214...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",151142007,"[-0.028800103813409805, -0.04602973908185959, ...",976,"[4142, 261, 3422, 3119, 1569, 4344, 2040, 337,...","[0.5786996483802795, 0.5608984231948853, 0.448...",4219,0.429516
61418,4237104227,413: Mantle Dynamics and Earth's Structure,,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3764,"[101, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4210227220,"[-0.0781172588467598, -0.006848946679383516, -...",989,"[3658, 744, 4173, 2808, 3208, 2833, 2279, 1212...","[0.0008757359464652836, 0.0006789403269067407,...",413,0.540998
43417,126287089,668: Advancements in Gynecologic Oncology Rese...,<TITLE> Cancers de l’endomètre de stades avanc...,"[3660, 3785, 3660, 3785, 3660, 3660, 0, 0, 0, ...","[3660, 3660, 3660, 3553, 3785, 3660, 3762, 378...",3215,"[101, 133, 157, 37611, 51036, 135, 37379, 1010...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",22998730,"[-0.014570106752216816, 0.09389408677816391, -...",993,"[3558, 759, 3215, 124, 3389, 241, 123, 3536, 6...","[0.9999426603317261, 0.9997239708900452, 0.999...",668,0.806504
52357,2048631488,1033: Skin Sensitization and Contact Allergy R...,<TITLE> Differentiation of skin sensitizers fr...,"[1349, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1349, 3435, 936, 1349, 3435, 3435, 936, 1349,...",3768,"[101, 133, 157, 37611, 51036, 135, 69627, 3276...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",205878144,"[-0.01863487996160984, -0.047593217343091965, ...",986,"[3768, 872, 1246, 1203, 4330, 3439, 1601, 3197...","[0.9997194409370422, 0.9953385591506958, 0.990...",1033,0.833724
96373,3127856920,3544: Prevention and Treatment of Cervical Cancer,<TITLE> A aplicabilidade da humanização no ate...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3228,"[101, 133, 157, 37611, 51036, 135, 138, 81295,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4210239051,"[-0.04972222074866295, 0.02842138335108757, -0...",999,"[3613, 2996, 2742, 216, 4260, 101, 3281, 3228,...","[0.9804410934448242, 0.9770548343658447, 0.958...",3544,0.56523
80164,4383267974,1544: Gender Inequality and Labor Force Dynamics,<TITLE> ÖLÜM YARDIMLARININ KADIN İSTİHDAMINA E...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[2415, 2844, 2415, 2844, 13, 13, 1985, 2598, 0...",2032,"[101, 133, 157, 37611, 51036, 135, 246, 11369,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2736667502,"[-0.05797955021262169, 0.08789290487766266, -0...",993,"[3525, 2032, 1662, 3597, 478, 3322, 1191, 4477...","[0.9834539890289307, 0.9615026712417603, 0.932...",1544,0.484799
23744,2088301318,3998: Hubris Syndrome in Leadership and Politics,<TITLE> Community governance and peacebuilding...,"[277, 277, 1849, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[2212, 1253, 1742, 2550, 2550, 2732, 3994, 133...",4244,"[101, 133, 157, 37611, 51036, 135, 18553, 1078...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2764896299,"[0.008132725954055786, 0.014643145725131035, -...",982,"[1054, 4399, 2878, 1824, 3961, 1351, 3537, 234...","[0.9943970441818237, 0.9500281810760498, 0.940...",3998,0.649334
30565,2047714800,3852: Structural Developments in Diagrid Tall ...,<TITLE> An analytical model for high-rise wall...,"[1474, 1474, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1474, 1474, 1474, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3851,"[101, 133, 157, 37611, 51036, 135, 10313, 1066...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",207543021,"[0.004953976720571518, 0.13800837099552155, 0....",987,"[3851, 4017, 3667, 3979, 2239, 2172, 2084, 186...","[0.999897837638855, 0.9885165691375732, 0.8485...",3852,0.779975
42449,2222935693,"3974: Health, Education, and Social Sciences R...",<TITLE> OS SENTIDOS DA OBSERVAÇÃO ASTRONÔMICA:...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4516,"[101, 133, 157, 37611, 51036, 135, 18398, 2305...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4210195863,"[0.07386615872383118, 0.035323530435562134, -0...",982,"[3251, 4407, 4516, 1996, 132, 1121, 4263, 2415...","[0.9738020300865173, 0.9665312767028809, 0.960...",3974,0.814913


#### Creating bins/binary variables to see how performance changes across different groups of data

In [17]:
all_test_data['pred_correct'] = all_test_data.apply(lambda x: 1 if x.full_label == inv_target_vocab[x.preds[0]] else 0, axis=1)

In [18]:
all_test_data['cluster_bin'] = pd.cut(all_test_data['micro_cluster_id'], bins=10, labels=False)

In [19]:
all_test_data['within_clust_per_bin'] = pd.cut(all_test_data['within_clust_per'].values, bins=10)

In [20]:
all_test_data['title_bool'] = all_test_data['title_abstract'].apply(lambda x: 1 if 'TITLE' in x else 0)
all_test_data['abstract_bool'] = all_test_data['title_abstract'].apply(lambda x: 1 if 'ABSTRACT' in x  else 0)
all_test_data['level_0_bool'] = all_test_data['level_0_citation'].apply(lambda x: 1 if x[0] != 1 else 0)
all_test_data['level_1_bool'] = all_test_data['level_1_citation'].apply(lambda x: 1 if x[0] != 1 else 0)

#### Performance grouped by bins of micro-cluster-id

In [21]:
all_test_data.groupby('cluster_bin').agg({'micro_cluster_id': ['min', 'max'], 
                                         'pred_correct': ['mean', 'count']})

Unnamed: 0_level_0,micro_cluster_id,micro_cluster_id,pred_correct,pred_correct
Unnamed: 0_level_1,min,max,mean,count
cluster_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1,453,0.534128,11325
1,454,905,0.570088,11300
2,906,1357,0.586283,11300
3,1358,1809,0.59115,11300
4,1810,2261,0.584867,11300
5,2262,2713,0.557876,11300
6,2714,3165,0.479204,11300
7,3166,3617,0.418496,11300
8,3618,4069,0.364336,11300
9,4070,4521,0.325752,11300


#### Performance grouped by amount of data available (title only vs title/abstract/citations available, etc.)

In [22]:
all_test_data.groupby(['title_bool','abstract_bool','level_0_bool','level_1_bool'])\
    .agg({'pred_correct': ['mean', 'count']})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pred_correct,pred_correct
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,mean,count
title_bool,abstract_bool,level_0_bool,level_1_bool,Unnamed: 4_level_2,Unnamed: 5_level_2
0,0,0,0,0.006645,301
0,0,0,1,0.5625,32
0,0,1,1,0.65625,64
1,0,0,0,0.204973,15524
1,0,0,1,0.469417,3466
1,0,1,1,0.683609,5253
1,1,0,0,0.295602,29560
1,1,0,1,0.551604,17857
1,1,1,1,0.72254,40968


#### Performance grouped by percentage of references within cluster

In [23]:
all_test_data.groupby('within_clust_per_bin').agg({'micro_cluster_id': ['min', 'max','mean'], 
                                         'pred_correct': ['mean', 'count']})

  all_test_data.groupby('within_clust_per_bin').agg({'micro_cluster_id': ['min', 'max','mean'],


Unnamed: 0_level_0,micro_cluster_id,micro_cluster_id,micro_cluster_id,pred_correct,pred_correct
Unnamed: 0_level_1,min,max,mean,mean,count
within_clust_per_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
"(0.142, 0.229]",1162,3910,2975.571429,0.28,525
"(0.229, 0.314]",44,4393,2772.302083,0.322917,2400
"(0.314, 0.4]",2,4521,2795.636364,0.32689,5225
"(0.4, 0.486]",3,4513,2507.391213,0.413808,11950
"(0.486, 0.571]",6,4520,2318.298544,0.455437,20600
"(0.571, 0.657]",4,4519,2156.422131,0.509795,24400
"(0.657, 0.743]",13,4518,2011.550574,0.562336,23975
"(0.743, 0.829]",25,4485,2095.782743,0.582743,16225
"(0.829, 0.914]",33,4495,2508.692884,0.588764,6675
"(0.914, 1.0]",1,4492,3250.595238,0.369524,1050
