In [None]:
import pandas as pd

file_path = './intersection_nice_morning.txt'

total_corpus = 100
true_relevant = 10 

def load_data(file_path):
    """Load the CSV file into a pandas DataFrame."""
    return pd.read_csv(file_path)


def calculate_metrics(file, threshold=1):
    """
    Calculate precision, recall, and F1-score for relevance evaluation.
    
    Args:
        intersection_file (str): Path to the intersection file.
        phrase_file (str): Path to the phrase file.
        threshold (int): Minimum relevance score to consider as relevant.
    
    Returns:
        dict: A dictionary containing precision, recall, and F1-score.
    """
    # Load the data
    df = load_data(file)
    
    # Flatten relevance scores
    relevance = df.iloc[:, 4:].values.flatten()
    
    # Apply threshold to determine predicted relevance
    predicted_relevant = (relevance >= threshold).astype(int)
    
    # Assume that all items in intersection are "true positives" by definition
    # Thus we calculate precision and recall assuming ground truth is all-positives in intersection
    true_positives = predicted_relevant.sum()
    total_predictions = predicted_relevant.size
    
    # Precision and recall equal if only the intersection is used
    precision = recall = true_positives / total_predictions if total_predictions > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "df": df
    }

In [15]:
threshold = 1
intersection_file = './intersection_nice_morning.txt'
phrase_file = './phrase_nice_morning.txt'

metrics = calculate_metrics(intersection_file, threshold)

print("Relevance Evaluation Metrics:")
print(f"Precision: {metrics['precision']:.2f}")
print(f"Recall: {metrics['recall']:.2f}")
print(f"F1-Score: {metrics['f1_score']:.2f}")
metrics['df']

Relevance Evaluation Metrics:
Precision: 0.82
Recall: 0.82
F1-Score: 0.82


Unnamed: 0,query_id,episode_name,show_name,episode_id,30s,1m,2m,3m,5m
0,0,'Showering in the morning or before bed' - Epi...,The Weirdest Talk Show,41jhdrTXp5G8roRKU82FqZ,1,1,1,1,1
1,0,TCRNo7 Day 06,Pilgrim Radio,4q7cSYdBYq7yONUgiEVA0E,1,2,3,3,3
2,0,The final days of TCR No7,Pilgrim Radio,0HdhtdR3Uhe6CMhPGRRJnr,3,3,3,3,3
3,0,THE NICEST THING I'VE DONE THIS YEAR,The Morning Rush,4nMVfR0gkasUo3V73aBJKK,1,0,0,0,0
4,0,042 - Sleep Waking Up Early and The Mamba Ment...,The Weekly Warrior Podcast,32MiS3uiNaXmFfnzNnlzgy,2,2,3,3,3
5,0,Our Top 5 Non Negotiable's For Our Ideal Morni...,Mind Movement Matter,32mwzVgQsicgjqPnDP5Gh9,1,1,1,2,2
6,0,Bring Your Parent To School Day!,Enlightening English Language Learners,2Fs76Rsko5D1RNCiN1yE0A,2,2,2,2,2
7,0,I DON'T LIKE YOUR CHRISTMAS GIFT,The Morning Rush,1Ifz0KuyUNrslTP5nuNvLU,0,0,0,0,0
8,0,TCRNo7 Day 9,Pilgrim Radio,2a2p0VoEz0l2KRQDFs2hQL,2,3,3,3,3
9,0,#08 Morning Routines,The Alpha Babes Podcast,1dEWYnZAMzWDyEtVTYHbvp,3,3,3,3,3


In [18]:
import pandas as pd

file_path = './intersection_nice_morning.txt'

def load_data(file_path):
    """Load the CSV file into a pandas DataFrame."""
    return pd.read_csv(file_path)


df = load_data(file_path)

print(df.columns)

# Set total corpus size
total_corpus = 100
true_relevant = 10  # True relevant items known

# Function to compute precision, recall, f1 for each time column
def compute_metrics(df, column, threshold=1):
    retrieved = (df[column] >= threshold).sum()
    true_positive = retrieved  # assuming all rows in df are truly relevant
    precision = true_positive / retrieved if retrieved > 0 else 0
    recall = true_positive / true_relevant
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1

# Apply function to each column of interest
results = {}
for time_col in ["30s", "1m", "2m", "3m", "5m"]:
    precision, recall, f1 = compute_metrics(df, time_col)
    results[time_col] = {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

results_df = pd.DataFrame(results).T
import ace_tools as tools; tools.display_dataframe_to_user(name="Relevance Evaluation Metrics", dataframe=results_df)

Index(['query_id', ' episode_name', ' show_name', ' episode_id', ' 30s', ' 1m',
       ' 2m', ' 3m', ' 5m'],
      dtype='object')


KeyError: '30s'

In [31]:
import pandas as pd

file_path = 'intersection_nice_morning.txt'  # your .txt file

# Load data
df = pd.read_csv(file_path, delimiter=',')
df.columns = df.columns.str.strip()

# Assumed corpus size (you don't know true relevant count)
corpus_size = 100

def compute_metrics(df, column, threshold=1):
    retrieved = len(df)  # Your system retrieved these 10 rows
    relevant_in_retrieved = (df[column] > threshold).sum()  # Ground-truth relevance

    precision = relevant_in_retrieved / retrieved if retrieved > 0 else 0
    
    # Optional: approximate "recall-like" stat as relevant docs per corpus size
    rel_density = relevant_in_retrieved / corpus_size
    
    # F1 here is semi-informative since recall is approximate
    f1 = 2 * precision * rel_density / (precision + rel_density) if (precision + rel_density) > 0 else 0

    return precision, rel_density, f1, retrieved, relevant_in_retrieved

# Time windows to evaluate
time_columns = ['30s', "1m", "2m", "3m", "5m"]

# Calculate metrics
results = {}
for col in time_columns:
    precision, rel_density, f1, retrieved, tp = compute_metrics(df, col)
    results[col] = {
        "Retrieved": retrieved,
        "TP (score > 1)": tp,
        "Precision": round(precision, 2),
        "RelDocs/Corpus (approx recall)": round(rel_density, 2),
        "F1 (approx)": round(f1, 2)
    }

# Display
metrics_df = pd.DataFrame(results).T
metrics_df



Unnamed: 0,Retrieved,TP (score > 1),Precision,RelDocs/Corpus (approx recall),F1 (approx)
30s,10.0,5.0,0.5,0.05,0.09
1m,10.0,6.0,0.6,0.06,0.11
2m,10.0,6.0,0.6,0.06,0.11
3m,10.0,7.0,0.7,0.07,0.13
5m,10.0,7.0,0.7,0.07,0.13


In [30]:
df

Unnamed: 0,query_id,episode_name,show_name,episode_id,30s,1m,2m,3m,5m
0,0,'Showering in the morning or before bed' - Epi...,The Weirdest Talk Show,41jhdrTXp5G8roRKU82FqZ,1,1,1,1,1
1,0,TCRNo7 Day 06,Pilgrim Radio,4q7cSYdBYq7yONUgiEVA0E,1,2,3,3,3
2,0,The final days of TCR No7,Pilgrim Radio,0HdhtdR3Uhe6CMhPGRRJnr,3,3,3,3,3
3,0,THE NICEST THING I'VE DONE THIS YEAR,The Morning Rush,4nMVfR0gkasUo3V73aBJKK,1,0,0,0,0
4,0,042 - Sleep Waking Up Early and The Mamba Ment...,The Weekly Warrior Podcast,32MiS3uiNaXmFfnzNnlzgy,2,2,3,3,3
5,0,Our Top 5 Non Negotiable's For Our Ideal Morni...,Mind Movement Matter,32mwzVgQsicgjqPnDP5Gh9,1,1,1,2,2
6,0,Bring Your Parent To School Day!,Enlightening English Language Learners,2Fs76Rsko5D1RNCiN1yE0A,2,2,2,2,2
7,0,I DON'T LIKE YOUR CHRISTMAS GIFT,The Morning Rush,1Ifz0KuyUNrslTP5nuNvLU,0,0,0,0,0
8,0,TCRNo7 Day 9,Pilgrim Radio,2a2p0VoEz0l2KRQDFs2hQL,2,3,3,3,3
9,0,#08 Morning Routines,The Alpha Babes Podcast,1dEWYnZAMzWDyEtVTYHbvp,3,3,3,3,3
