In [1]:
!pip install datasets
!pip install kneed
!pip install sklearn
from google.colab import drive
drive.mount('/content/drive')
from datasets import load_dataset, load_from_disk
import os
import pickle
from tqdm import tqdm
import pandas as pd
# Data processing
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets# Dimensionality reduction
from sklearn.decomposition import PCA# Modeling
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as hier
from sklearn.mixture import GaussianMixture# Number of clusters
from sklearn.metrics import silhouette_score

def get_user_input():
    per_var = input("Enter the percentage of variance to retain: ")
    n_clusters = input("Enter the number of clusters to retain:  ")
    time = input("Enter the number of timestamps to keep in the ground truth: ")
    k = input("Enter the value of k, the number of concepts to predict:")

    return int(per_var), int(n_clusters), int(time), int(k)

per_var, n_clusters, time, k = get_user_input()
type_to_t = {
    'disorder': 'T-11',
    'procedure': 'T-39',
    'finding': 'T-18',
    'substance': 'T-55'
}

with open('drive/MyDrive/Tesi_LossAcc/id2tkn.pickle', 'rb') as handle:
    id2tkn = pickle.load(handle)

data_path = 'drive/MyDrive/Tesi_LossAcc/annotations_stream_phase2_v1_1d_256_ALL_TYPES_prepared_split'
cui_types = 'disorder,procedure,finding,substance'
#cui_types = 'disorder'
FOLDER = "drive/MyDrive/Tesi_LossAcc/[foresight_small]-[types={}]".format(cui_types)
subset = 8000

cui_types = cui_types.split(',')
def map_to_df(data_path, mode, cui_types, id2tkn):
    dataset = load_from_disk(os.path.join(data_path, mode))
    cui_types = [type_to_t[t] for t in cui_types] + ['sep']

    df = []
    for sample in tqdm(dataset):
        patient_id = sample['patient_id']
        ts = 0
        tokens = []
        for token_type, token in zip(sample['token_type'], sample['input_ids']):
            if token_type not in cui_types: continue

            if token_type == 'sep':
                ts += 1
            else:
                if token not in tokens:
                    df.append([patient_id, token_type, id2tkn[token], ts])
                    tokens.append(token)
    df = pd.DataFrame(df)
    return df

df_train = map_to_df(data_path, 'train', cui_types, id2tkn)
df_test = map_to_df(data_path, 'test', cui_types, id2tkn)

cols = df_train.columns
df_train.rename (columns = {cols[0]:'patient_id',cols[1]:'concept_types',cols[2]:'concept',cols[3]:'timestamp'}, inplace=True)
cols_2 = df_test.columns
df_test.rename (columns = {cols_2[0]:'patient_id',cols_2[1]:'concept_types',cols_2[2]:'concept',cols_2[3]:'timestamp'}, inplace=True)
duplicate = df_train[df_train.duplicated()]
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()
df_train = df_train.drop(df_train.columns[[3]], axis=1)
conteggio_occorrenze = df_train.groupby(df_train.columns.tolist(),as_index=False).size()
cols_prova = conteggio_occorrenze.columns
conteggio_occorrenze.rename(columns = {cols_prova[3]:'occurrence'}, inplace=True)

def filter_concept_types(df_train, df_test):
    concept_type_mapping = {'T-11': 'disorder', 'T-39': 'procedure', 'T-18': 'finding', 'T-55': 'substance'}
    excluded_concept_types = input("Which concept types do you want to EXCLUDE? (separate by comma)").split(',')
    excluded_concept_types = [concept_type_mapping.get(x.strip(), x.strip()) for x in excluded_concept_types]
    excluded_concept_types = set(excluded_concept_types)

    for df in [df_train, df_test]:
        df.drop(df[df['concept_types'].apply(lambda x: concept_type_mapping.get(x, x)) \
                    .isin(excluded_concept_types)].index, inplace=True)

    print("The {} concepts have been removed from the dataframes.".format(', '.join(excluded_concept_types)))

def filter_concept_types2(ground_truth):
    concept_type_mapping = {'T-11': 'disorder', 'T-39': 'procedure', 'T-18': 'finding', 'T-55': 'substance'}
    excluded_concept_types = set()

    user_response = input("Do you want to predict only concepts of type 'disorder'? (y or n): ")

    if user_response.lower() == 'y':
        excluded_concept_types.add('T-39')
        excluded_concept_types.add('procedure')
        excluded_concept_types.add('T-18')
        excluded_concept_types.add('finding')
        excluded_concept_types.add('T-55')
        excluded_concept_types.add('substance')

        ground_truth.drop(ground_truth[ground_truth['concept_types'].apply(lambda x: concept_type_mapping.get(x, x)) \
                                      .isin(excluded_concept_types)].index, inplace=True)


        print("The concepts have been removed.")
    elif user_response.lower() == 'n':
        print("The concepts have not been removed.")
    else:
        print("Invalid response. No changes made.")

    return user_response, ground_truth

filter_concept_types(df_train, df_test)

df_train = conteggio_occorrenze

df_train_R = df_train.groupby(['patient_id', 'concept', 'concept_types'])['occurrence'].sum().reset_index()
df_train_nR = pd.crosstab(index=df_train['patient_id'], columns=df_train['concept'], values=df_train['occurrence'], aggfunc='sum', dropna=True)
df_train_nR.fillna(0, inplace=True)
df_addestramento = df_train_nR

#truncated SVD
import numpy as np
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

X = df_addestramento.values
tsvd = TruncatedSVD(n_components=X.shape[1])
X_tsvd = tsvd.fit_transform(X)

n_components=X.shape[1]

variance_cumulative = np.cumsum(tsvd.explained_variance_ratio_)
transformed_data_reduced = X_tsvd[:,:n_components]

df_pcas = {}
n_components_dict = {}
variance_percentages = [50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]

for p in variance_percentages:
    n_components = np.argmax(variance_cumulative >= p/100) + 1
    n_components_dict[p] = n_components
    transformed_data_reduced = X_tsvd[:,:n_components]
    df_pca = pd.DataFrame(transformed_data_reduced, columns=[f'PC{i+1}' for i in range(n_components)])
    df_pca.index = df_addestramento.index
    df_pcas[p] = df_pca

df_pca = df_pcas[per_var]
print("The number of components for the selected variance is: ",n_components_dict[per_var])

#KMEANS
kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
df_pca_copy = df_pca.copy()
df_pca_copy['cluster'] = kmeans.fit_predict(df_pca_copy)
df_copy = df_addestramento.copy()
df_copy.reset_index(inplace=True)
df_pca_copy.reset_index(inplace=True)
df_pca_copy["patient_id"] = df_pca_copy["patient_id"].astype(int)
df_copy["patient_id"] = df_copy["patient_id"].astype(int)
df_copy['cluster'] = df_copy['patient_id'].map(dict(zip(df_pca_copy['patient_id'],df_pca_copy['cluster'])))
grouped = df_copy.groupby("cluster")

"""# per ogni cluster, stampare i 10 concetti con occorrenza più elevata ed il relativo numero di occorrenze
for name, group in grouped:
    print("Cluster:", name)
    count = group.drop(columns=['cluster','patient_id']).reset_index(drop=True).sum().sort_values(ascending=False)
    top_10 = count.head(10)
    #print(top_10)
    #print()"""
df_trained = df_pca_copy
patient_count_t = df_test.groupby("patient_id").agg({"timestamp": "nunique"})
patient_count_t = patient_count_t[(patient_count_t["timestamp"] == 1)] #& (patient_count_t["concept"] == 1)]

df_test = df_test[~df_test['patient_id'].isin(patient_count_t.index)]
df_timestamp_unic = df_test.groupby('patient_id')['timestamp'].nunique().reset_index()
df_timestamp_unic.columns = ['patient_id', 'timestamp_unic']

#GROUND TRUTH
ground_truth = pd.DataFrame() # creazione del DataFrame vuoto
for patient_id in df_timestamp_unic['patient_id'].unique():
    timestamp_unic = df_timestamp_unic.loc[df_timestamp_unic['patient_id'] == patient_id, 'timestamp_unic'].iloc[0]
    if timestamp_unic > time:
        group = df_test.loc[df_test['patient_id'] == patient_id] # selezione del gruppo di righe per il paziente corrente
        unique_timestamps = group['timestamp'].unique()
        max_timestamps = pd.Series(unique_timestamps).nlargest(time)
        ground_truth = pd.concat([ground_truth, group.loc[group['timestamp'].isin(max_timestamps)]])
    elif timestamp_unic <= time:
        group = df_test.loc[(df_test['patient_id'] == patient_id) & (df_test['timestamp'] != 0)] # selezione del gruppo di righe per il paziente corrente, escludendo i timestamp uguali a zero
        ground_truth = pd.concat([ground_truth, group])
ground_truth.reset_index(drop=True, inplace=True) # reset dell'indice del DataFrame

user_response, ground_trurh = filter_concept_types2(ground_truth)
merged_test_time = pd.merge(df_test, ground_truth, on=["patient_id", "concept", "concept_types", "timestamp"], how="outer", indicator=True)
filtered_test_time = merged_test_time[merged_test_time["_merge"] == "left_only"]
df_test1 = filtered_test_time.drop("_merge", axis=1)
df_test_concept = (df_test1.groupby('patient_id')['concept'].value_counts().unstack().fillna(0))
df_result = pd.DataFrame(0, index=df_test_concept.index, columns=df_addestramento.columns)
df_result.update(df_test_concept)

#PCA on test data
X_test = df_result.values
pca_test = PCA(n_components=n_components_dict[per_var])
transformed_data_reduced_test = pca_test.fit_transform(X_test)
df_pca_test = pd.DataFrame(transformed_data_reduced_test, columns=[f'PC{i+1}' for i in range(pca_test.n_components_)])
df_pca_test.index = df_result.index

# KMEANS on test data
# PREDICTION
df_pca_test['cluster'] = kmeans.predict(df_pca_test)
df_results_copy = df_result.copy()
df_results_copy['cluster'] = df_pca_test['cluster']
df_GTT1 = ground_truth.groupby('patient_id')['concept'].apply(list).reset_index(name='concepts')
df_GTT1 = df_GTT1.explode('concepts')
df_GTT1 = pd.concat([df_GTT1.drop(columns='concepts'), df_GTT1['concepts'].apply(pd.Series)], axis=1)
df_addestrato = df_copy
df_grouped = df_addestrato.groupby('cluster').sum()
columns = df_grouped.columns
columns = [col for col in columns if col not in ['patient_id', 'cluster']]
for i in range(0, n_clusters):
    df_cluster = df_grouped.loc[i, columns]
    concepts = df_cluster[df_cluster > 0].index.tolist()
dfs = []

for cluster, group in grouped:

    for i, row in group.iterrows():
        patient_id = row['patient_id']
        concepts = row.iloc[1:-1]
        concepts = concepts[concepts != 0]
        df_concepts = pd.DataFrame({'cluster': [cluster] * len(concepts),
                                    'patient_id': [patient_id] * len(concepts),
                                    'concepts': concepts.index,
                                    'occurence': concepts.values})
        dfs.append(df_concepts)
df_concepts = pd.concat(dfs, ignore_index=True)
#Creating dataframes to calculate metrics
df_concepts = df_concepts.sort_values(by=['cluster', 'patient_id'])
df_concepts_grouped = df_concepts.groupby(['cluster', 'concepts'])['occurence'].sum().reset_index()
df_concepts_grouped = df_concepts_grouped[['cluster', 'concepts', 'occurence']]
df_concepts_grouped = df_concepts_grouped.sort_values(by=['cluster', 'occurence'], ascending=[True, False])
df_concepts_ranked = df_concepts_grouped.groupby('cluster').apply(lambda x: x.sort_values('occurence', ascending=False))
df_concepts_ranked = df_concepts_ranked.reset_index(drop=True)
df_concepts_ranked['rank'] = df_concepts_ranked.groupby('cluster')['occurence'].rank(method='dense', ascending=False)
df_concepts_ranked = df_concepts_ranked.rename(columns={'concepts': 'concept'})
df_prediction = df_results_copy.reset_index().loc[:, ["patient_id", "cluster"]]
df_prediction = df_prediction.reset_index(drop=True)
df_merged = pd.merge(df_prediction, df_GTT1, on='patient_id')
df_merged = df_merged.rename(columns={0: 'concept'})
df_merged['cluster'] = df_merged['cluster'].astype(np.int64)
df_prova1 = pd.merge(df_merged, df_concepts_ranked, on=['concept', 'cluster'], how='left')
df_prova1.loc[df_prova1['rank'].isna(), 'rank'] = np.nan
df_prova1.fillna(0, inplace=True)


df_types = df_train.drop(columns = ['patient_id','occurrence'])

if user_response.lower() == 'y':
  batch_size = 10000
  num_batches = len(df_concepts_ranked) // batch_size + 1
  df_merge_types = pd.DataFrame()
  for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_concepts_ranked = df_concepts_ranked[start_idx:end_idx]
    batch_merge_types = batch_concepts_ranked.merge(df_types, on="concept")
    batch_merge_types = batch_merge_types.loc[batch_merge_types["concept_types"] == "T-11"]
    batch_merge_types = batch_merge_types.drop_duplicates()
    df_merge_types = pd.concat([df_merge_types, batch_merge_types])
  df_merge_types.reset_index(drop=True, inplace=True)
  df_merge_types.drop_duplicates()
  df_topk = df_merge_types
else:
  df_topk = df_concepts_ranked.copy()
def update_rank2(df_topk, k):
    updated_rows = []

    for cluster in df_topk['cluster'].unique():
        cluster_data = df_topk[df_topk['cluster'] == cluster].sort_values(by='rank')
        selected_rows = cluster_data.head(k)
        updated_rows.append(selected_rows)

    df_topk_updated = pd.concat(updated_rows)

    return df_topk_updated
df_MRR = df_topk
df_topk_updated = update_rank2(df_topk, k)

unique_clusters = df_topk_updated['cluster'].unique()
unique_concepts = df_topk_updated['concept'].unique()
df_topk_concepts = pd.DataFrame(0, index=unique_clusters, columns=unique_concepts)
for _, row in df_topk_updated.iterrows():
    cluster = row['cluster']
    concept = row['concept']
    df_topk_concepts.loc[cluster, concept] = 1

df_topk_concepts = df_topk_concepts.reset_index().rename(columns={'index': 'cluster'})

df_original_concept = df_addestramento
order_concept = pd.DataFrame(columns=df_original_concept.columns[:2637])

order_concept['cluster'] = df_topk_concepts['cluster']
common_columns = set(order_concept.columns) & set(df_topk_concepts.columns)

for column in common_columns:
    order_concept[column] = df_topk_concepts[column]

order_concept = order_concept.fillna(0)
ground_truth2 = ground_truth.drop(ground_truth.columns[[3]], axis=1)
ground_truth_conteggio = ground_truth2.groupby(ground_truth2.columns.tolist(),as_index=False).size()
ground_truth_cols = ground_truth_conteggio.columns
ground_truth_conteggio.rename(columns = {ground_truth_cols[3]:'occurrence'}, inplace=True)
gt2 = pd.crosstab(index=ground_truth_conteggio['patient_id'], columns=ground_truth_conteggio['concept'], values=ground_truth_conteggio['occurrence'], aggfunc='sum', dropna=True)
gt2.fillna(0, inplace=True)
gt2 = gt2.reset_index().rename(columns={'index': 'patient_id'})
order_concept2 = pd.DataFrame(columns=df_original_concept.columns[:2637])
order_concept2['patient_id'] = gt2['patient_id']
common_columns = set(order_concept2.columns) & set(gt2.columns)

for column in common_columns:
    order_concept2[column] = gt2[column]

order_concept2 = order_concept2.fillna(0)
ranking_df = pd.crosstab(index=df_topk_updated['cluster'], columns=df_topk_updated['concept'], values=df_topk_updated['rank'], aggfunc='sum', dropna=True)
ranking_df_MRR = pd.crosstab(index=df_MRR['cluster'], columns=df_MRR['concept'], values=df_MRR['rank'], aggfunc='sum', dropna=True)
ranking_df.fillna(0, inplace=True)
ranking_df_MRR.fillna(0, inplace=True)
ranking_df = ranking_df.reset_index().rename(columns={'index': 'cluster'})
ranking_df_MRR = ranking_df_MRR.reset_index().rename(columns={'index': 'cluster'})
order_concept_MRR = pd.DataFrame(columns=df_original_concept.columns[:2637])
order_concept_MRR['cluster'] = ranking_df_MRR['cluster']
common_columns = set(order_concept_MRR.columns) & set(ranking_df_MRR.columns)

for column in common_columns:
    order_concept_MRR[column] = ranking_df_MRR[column]

order_concept_MRR = order_concept_MRR.fillna(0)
order_concept3 = pd.DataFrame(columns=df_original_concept.columns[:2637])
order_concept3['cluster'] = ranking_df['cluster']
common_columns = set(order_concept3.columns) & set(ranking_df.columns)

for column in common_columns:
    order_concept3[column] = ranking_df[column]

order_concept3 = order_concept3.fillna(0)
score_df = pd.crosstab(index=df_topk_updated['cluster'], columns=df_topk_updated['concept'], values=df_topk_updated['occurence'], aggfunc='sum', dropna=True)
score_df.fillna(0, inplace=True)
score_df = score_df.reset_index().rename(columns={'index': 'cluster'})
order_concept4 = pd.DataFrame(columns=df_original_concept.columns[:2637])
order_concept4['cluster'] = score_df['cluster']
common_columns = set(order_concept4.columns) & set(score_df.columns)

for column in common_columns:
    order_concept4[column] = score_df[column]

order_concept4 = order_concept4.fillna(0)
prev_vect = pd.DataFrame()
prev_vect['cluster'] = order_concept['cluster']
prev_values = order_concept.iloc[:, :2637].values.tolist()
prev_vect['prev'] = [np.array(row) for row in prev_values]
#LABEL
label_vect = pd.DataFrame()
label_vect['patient_id'] = order_concept2['patient_id']
label_values = order_concept2.iloc[:, :2637].values.tolist()
label_vect['label'] = [np.array(row) for row in label_values]
#SCORE
score_vect = pd.DataFrame()
score_vect['cluster'] = order_concept4['cluster']
score_values = order_concept4.iloc[:, :2637].values.tolist()
score_vect['score'] = [np.array(row) for row in score_values]
#RANK
rank_vect = pd.DataFrame()
rank_vect['cluster'] = order_concept3['cluster']
rank_values = order_concept3.iloc[:, :2637].values.tolist()
rank_vect['rank'] = [np.array(row) for row in rank_values]
#MRR RANK
rank_vect_MRR = pd.DataFrame()
rank_vect_MRR['cluster'] = order_concept_MRR['cluster']
rank_values_MRR = order_concept_MRR.iloc[:, :2637].values.tolist()
rank_vect_MRR['rank_MRR'] = [np.array(row) for row in rank_values_MRR]

metriche_df = df_prova1[['patient_id', 'cluster']].drop_duplicates()
metriche_df["patient_id"] = metriche_df["patient_id"].astype('int64')
label_vect["patient_id"] = label_vect["patient_id"].astype('int64')
df_final = pd.merge(metriche_df, label_vect, on='patient_id', how='left')
df_final = pd.merge(df_final, rank_vect, on='cluster', how='left')
df_final = pd.merge(df_final, rank_vect_MRR, on='cluster', how='left')
df_final = pd.merge(df_final, prev_vect, on='cluster', how='left')
df_final = pd.merge(df_final, score_vect, on='cluster', how='left')
df_final['label'] = df_final['label'].apply(lambda x: x.astype(float))
df_final['prev'] = df_final['prev'].apply(lambda x: x.astype(float))
merged_df = df_final.copy()
merged_df['predicted_vector_ones'] = merged_df['prev'].apply(lambda vec: sum(x == 1 for x in vec))
merged_df['label_ones'] = merged_df['label'].apply(lambda vec: sum(x == 1 for x in vec))
merged_df['common_ones'] = merged_df.apply(lambda row: sum(x == 1 for x, y in zip(row['prev'], row['label']) if y == 1), axis=1)

#MRR
def calculate_reciprocal_rank(node_id):
  label_vector = merged_df.loc[merged_df['patient_id'] == node_id, 'label'].values[0]
  ranked_vector = merged_df.loc[merged_df['patient_id'] == node_id, 'rank_MRR'].values[0]

  positions = [i for i, val in enumerate(label_vector) if val == 1]
  values = [ranked_vector[i] for i in positions]

  reciprocal_values = [1/val for val in values if val != 0]
  reciprocal_rank = sum(reciprocal_values)

  return reciprocal_rank
merged_df['reciprocal_rank'] = merged_df['patient_id'].apply(calculate_reciprocal_rank)
num_camp = merged_df['label_ones'].sum()
print("Correct samples:", num_camp)
reciprocalN_True = 1/num_camp
reciprocalRank = merged_df['reciprocal_rank'].sum()
MRR = reciprocalN_True*reciprocalRank
mrr_metric = MRR
print("Mean Reciprocal Rank: ", MRR)

#TPR
num_camp = merged_df['patient_id'].nunique()
print("Samples:", num_camp)
reciprocalN = 1/num_camp
merged_df["True_positive"] = merged_df["common_ones"].apply(lambda x: 1 if x > 0 else 0)
positive_sum = merged_df['True_positive'].sum()
print("Positive samples:", positive_sum)
True_PRate = reciprocalN*positive_sum
TP_Metr = True_PRate
print("True Positive Rate: ", True_PRate)

#Hits
num_veri_positivi = merged_df['common_ones'].sum()
print("Correct Predictions:", num_veri_positivi)
hits2 = num_veri_positivi/k
#print("Rapporto tra positivi e k:", hits2)
Hits = reciprocalN * hits2
hits = Hits
print("Hits: ", Hits)

#Mean Recall
merged_df["False_Negative"] = merged_df['label_ones'] - merged_df["common_ones"]
merged_df["Recall"] = merged_df['common_ones']/(merged_df['common_ones']+merged_df['False_Negative'])
Sum_Recall = merged_df['Recall'].sum()
MR = reciprocalN*Sum_Recall
mean_rec = MR
print("Mean Recall: ", MR)


def calculate_AP_at_k(row, k):
    temp_df = pd.DataFrame({
        'label': row['label'],
        'score': row['score']
    })
    temp_df = temp_df.sort_values(by='score', ascending=False)
    top_k = temp_df.head(k)
    hits = 0
    sum_precs = 0
    for i, (_, row) in enumerate(top_k.iterrows(), start=1):
        if row['label'] == 1:
            hits += 1
            sum_precs += hits / i

    # AP@k
    if hits > 0:
        AP_at_k = sum_precs / hits
    else:
        AP_at_k = 0

    return AP_at_k
merged_df['AP'] = merged_df.apply(calculate_AP_at_k, k=k, axis=1)

# MAP
N = len(merged_df)
MAP = merged_df['AP'].sum() / N
print('Mean Average Precision: ', MAP)

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.2-py3-none-a

100%|██████████| 48784/48784 [00:38<00:00, 1259.43it/s]
100%|██████████| 2539/2539 [00:01<00:00, 1822.75it/s]


Which concept types do you want to EXCLUDE? (separate by comma)
The  concepts have been removed from the dataframes.
The number of components for the selected variance is:  417




Do you want to predict only concepts of type 'disorder'? (y or n): y
The concepts have been removed.
Correct samples: 4249
Mean Reciprocal Rank:  0.029836579534881755
Samples: 1490
Positive samples: 245
True Positive Rate:  0.16442953020134227
Correct Predictions: 292
Hits:  0.01959731543624161
Mean Recall:  0.06340314902888926
Mean Average Precision:  0.06203306434430595
