# Enhanced Shared-Private BERT with ConceptNet for Humor Detection

In [1]:
import pandas as pd
import os

# Define the dataset path
DATASET_PATH = './data/shared_private_dataset.csv'
DATASET_PATH = './concept_dataset.csv'


# Step 1: Load the dataset
if os.path.exists(DATASET_PATH):
    print("[INFO] Loading dataset...")
    dataset = pd.read_csv(DATASET_PATH)
    print(f"[INFO] Dataset loaded successfully. Total samples: {len(dataset)}")
else:
    raise FileNotFoundError(f"[ERROR] Dataset file not found at {DATASET_PATH}")

# Step 2: Display basic information about the dataset
print("\n[INFO] Dataset Info:")
print(dataset.info())

# Step 3: Display sample rows
print("\n[INFO] Sample Rows:")
print(dataset.head())

# Step 4: Check for missing values
print("\n[INFO] Checking for missing values:")
missing_values = dataset.isnull().sum()
print(missing_values)

# Step 5: Check the distribution of humor types
if 'type' in dataset.columns:
    print("\n[INFO] Distribution of humor types:")
    humor_type_distribution = dataset['type'].value_counts()
    print(humor_type_distribution)
else:
    print("[WARNING] Column 'type' not found in the dataset.")

# Step 6: Check label distribution
if 'label' in dataset.columns:
    print("\n[INFO] Distribution of labels (humorous vs. non-humorous):")
    label_distribution = dataset['label'].value_counts()
    print(label_distribution)
else:
    print("[WARNING] Column 'label' not found in the dataset.")


[INFO] Loading dataset...
[INFO] Dataset loaded successfully. Total samples: 100000

[INFO] Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text       100000 non-null  object
 1   label      100000 non-null  int64 
 2   type       100000 non-null  object
 3   concepts   100000 non-null  object
 4   relations  100000 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None

[INFO] Sample Rows:
                                                text  label             type  \
0              Good he answer ok allow trial worker.      0  body punchlines   
1  I met the world's tallest and the world's heav...      1  body punchlines   
2  I was at a barber shop in Bangkok and I asked ...      1  body punchlines   
3                        Its build card time factor.      0  body punchlines   
4  A cashier asks a French gu

In [2]:
# import spacy

# # Load spaCy model for concept extraction
# print("[INFO] Loading spaCy model...")
# nlp = spacy.load("en_core_web_sm")
# print("[INFO] spaCy model loaded successfully.")

# # Function to extract concepts from text
# def extract_concepts(text, index=None, total=None, print_every=10000):
#     """
#     Extracts key concepts from the input text using NER and noun phrase extraction.

#     Parameters:
#         text (str): The input text.
#         index (int, optional): Current index of the text in the dataset.
#         total (int, optional): Total number of entries in the dataset.
#         print_every (int): Print status every `print_every` rows.

#     Returns:
#         List[str]: A list of unique concepts extracted from the text.
#     """
#     # Print progress updates every `print_every` rows
#     if index is not None and total is not None and index % print_every == 0:
#         print(f"[INFO] Processing row {index}/{total}...")

#     doc = nlp(text)
#     concepts = set()

#     # Extract named entities
#     for ent in doc.ents:
#         concepts.add(ent.text.lower())

#     # Extract noun phrases
#     for chunk in doc.noun_chunks:
#         concepts.add(chunk.text.lower())

#     return list(concepts)

# # Apply the function with explicit indexing for progress updates
# print("[INFO] Extracting concepts for the entire dataset...")

# concepts = []  # Initialize a list to store concepts for each row
# for idx, row in dataset.iterrows():
#     # Extract concepts and append them to the list
#     concepts.append(extract_concepts(row['text'], index=idx, total=len(dataset), print_every=10000))

# # Add the extracted concepts as a new column in the dataset
# dataset['concepts'] = concepts

# print("[INFO] Concepts extraction completed.")

dataset.head()


Unnamed: 0,text,label,type,concepts,relations
0,Good he answer ok allow trial worker.,0,body punchlines,"['trial worker', 'he']",[]
1,I met the world's tallest and the world's heav...,1,body punchlines,"['i', 'the other day']",[]
2,I was at a barber shop in Bangkok and I asked ...,1,body punchlines,"['rainbow swirls', 'me,""well', 'bangkok', 'the...",[]
3,Its build card time factor.,0,body punchlines,['its build card time factor'],[]
4,A cashier asks a French guy would you like a b...,1,body punchlines,"['french', 'the french guy', 'this year', 'a c...",[]


In [3]:
# # Save the updated dataset to a CSV file in the root folder
# output_filename = "concept_dataset.csv"
# dataset.to_csv(output_filename, index=False, encoding='utf-8')
# print(f"[INFO] Dataset saved successfully as '{output_filename}' in the root folder.")

# # Display the first few rows of the dataset
# print("[INFO] Preview of the dataset:")
# print(dataset.head())

# # Condense the dataset to around 100,000 rows with equal distribution across the 'type' column
# desired_rows_per_type = 25000  # Approximate number of rows per type
# grouped = dataset.groupby('type')  # Group the dataset by the 'type' column

# # Sample rows equally from each type
# condensed_dataset = grouped.apply(
#     lambda x: x.sample(n=min(len(x), desired_rows_per_type), random_state=42)
# ).reset_index(drop=True)

# print("[INFO] Condensed dataset created with equal distribution across types.")

# # Save the condensed dataset to a CSV file
# condensed_output_filename = "concept_dataset_sampled.csv"
# condensed_dataset.to_csv(condensed_output_filename, index=False, encoding='utf-8')
# print(f"[INFO] Condensed dataset saved successfully as '{condensed_output_filename}'.")

# # Update the dataset variable to the condensed dataset
# dataset = condensed_dataset

# # Print the shape of the condensed dataset
# print(f"[INFO] Condensed dataset shape: {dataset.shape}")

# # Display the first few rows of the condensed dataset
# print("[INFO] Preview of the condensed dataset:")
# print(dataset.head())


[INFO] Dataset saved successfully as 'concept_dataset.csv' in the root folder.
[INFO] Preview of the dataset:
                                                text  label             type  \
0              Good he answer ok allow trial worker.      0  body punchlines   
1  I met the world's tallest and the world's heav...      1  body punchlines   
2  I was at a barber shop in Bangkok and I asked ...      1  body punchlines   
3                        Its build card time factor.      0  body punchlines   
4  A cashier asks a French guy would you like a b...      1  body punchlines   

                                            concepts relations  
0                             ['trial worker', 'he']        []  
1                             ['i', 'the other day']        []  
2  ['rainbow swirls', 'me,"well', 'bangkok', 'the...        []  
3                     ['its build card time factor']        []  
4  ['french', 'the french guy', 'this year', 'a c...        []  
[INFO] Condensed da

In [None]:
import aiohttp
import asyncio
import json
from collections import defaultdict
from itertools import islice
import pandas as pd
import nest_asyncio

# Enable nested event loops for Jupyter compatibility
nest_asyncio.apply()

# Asynchronous function to query ConceptNet API for a given concept
async def get_conceptnet_relations(session, concept, limit=3):
    """
    Queries ConceptNet API to retrieve relationships for a given concept and filters fields.

    Parameters:
        session (aiohttp.ClientSession): The aiohttp session for making requests.
        concept (str): The concept to query (e.g., "money").
        limit (int): Number of top relations to retrieve.

    Returns:
        List[dict]: A list of filtered dictionaries representing the relationships.
    """
    concept_query = f"/c/en/{concept.replace(' ', '_')}"  # Format concept for ConceptNet query
    url = f"https://api.conceptnet.io/query?node={concept_query}&other=/c/en&limit={limit}"

    try:
        async with session.get(url) as response:
            response.raise_for_status()  # Raise an error for HTTP errors
            data = await response.json()
            edges = data.get("edges", [])
            
            # Filter and simplify edges
            simplified_edges = [
                {
                    "start": edge.get("start", {}).get("label", "unknown"),
                    "end": edge.get("end", {}).get("label", "unknown"),
                    "relation": edge.get("@id", "").split("/")[-2],  # Extract relation type
                    "weight": edge.get("weight", 0),
                }
                for edge in edges
            ]
            return simplified_edges
    except Exception as e:
        print(f"[ERROR] Failed to fetch data for concept '{concept}': {e}")
        return []

# Function to fetch relationships for all unique concepts asynchronously
async def fetch_relations_async(concepts, cache, limit=3):
    """
    Fetch relationships for all unique concepts asynchronously using aiohttp.

    Parameters:
        concepts (List[str]): List of unique concepts to query.
        cache (dict): Cache to store and reuse previously fetched results.
        limit (int): Number of relationships to fetch per concept.

    Returns:
        dict: Updated cache with fetched and simplified relationships.
    """
    async with aiohttp.ClientSession() as session:
        tasks = []
        for concept in concepts:
            if concept not in cache:  # Only fetch if not in cache
                tasks.append(get_conceptnet_relations(session, concept, limit))
        results = await asyncio.gather(*tasks)

        for concept, relations in zip(concepts, results):
            cache[concept] = relations

    return cache

# Batch processing utility
def batch(iterable, n=1):
    """
    Splits an iterable into batches of size n.

    Parameters:
        iterable (iterable): The iterable to split.
        n (int): The batch size.

    Returns:
        generator: Generator yielding batches of size n.
    """
    iterable = iter(iterable)
    while True:
        batch_items = list(islice(iterable, n))
        if not batch_items:
            break
        yield batch_items

# Load cache from file if it exists
try:
    with open('concept_cache.json', 'r') as cache_file:
        concept_cache = json.load(cache_file)
    print("[INFO] Loaded existing cache from 'concept_cache.json'.")
except FileNotFoundError:
    concept_cache = defaultdict(list)
    print("[INFO] No existing cache found. Starting fresh.")

# Step 1: Deduplicate all concepts in the dataset
print("[INFO] Extracting unique concepts from the dataset...")
unique_concepts = set([concept for row in dataset['concepts'] for concept in row])
print(f"[INFO] Extracted {len(unique_concepts)} unique concepts.")

# Step 2: Fetch relationships using asynchronous batching
print("[INFO] Fetching ConceptNet relationships for unique concepts...")
batch_size = 250  # Number of concepts to process in a batch
for batch_idx, concept_batch in enumerate(batch(unique_concepts, batch_size), start=1):
    print(f"[INFO] Processing batch {batch_idx} with {len(concept_batch)} concepts...")
    concept_cache = asyncio.run(fetch_relations_async(concept_batch, cache=concept_cache, limit=3))

# Save the updated cache to a file
with open('concept_cache.json', 'w') as cache_file:
    json.dump(concept_cache, cache_file)
print("[INFO] Cache updated and saved to 'concept_cache.json'.")

# Step 3: Map cached relationships back to rows in the dataset
print("[INFO] Mapping relationships back to dataset rows...")
dataset['relations'] = dataset['concepts'].apply(
    lambda concepts: [relation for concept in concepts for relation in concept_cache.get(concept, [])]
)
print("[INFO] Relationship mapping completed.")

# Step 4: Save the updated dataset
output_filename = "relation_dataset.csv"
dataset.to_csv(output_filename, index=False, encoding='utf-8')
print(f"[INFO] Dataset saved successfully as '{output_filename}'.")

# Display first few rows of the dataset
print("[INFO] Displaying the first few rows of the updated dataset:")
print(dataset.head())


[INFO] Loaded existing cache from 'concept_cache.json'.
[INFO] Extracting unique concepts from the dataset...
[INFO] Extracted 46 unique concepts.
[INFO] Fetching ConceptNet relationships for unique concepts...
[INFO] Processing batch 1 with 46 concepts...
[INFO] Cache updated and saved to 'concept_cache.json'.
[INFO] Mapping relationships back to dataset rows...
[INFO] Relationship mapping completed.
[INFO] Dataset saved successfully as 'relation_dataset.csv'.
[INFO] Displaying the first few rows of the updated dataset:
                                                text  label             type  \
0              Good he answer ok allow trial worker.      0  body punchlines   
1  I met the world's tallest and the world's heav...      1  body punchlines   
2  I was at a barber shop in Bangkok and I asked ...      1  body punchlines   
3                        Its build card time factor.      0  body punchlines   
4  A cashier asks a French guy would you like a b...      1  body punchli

In [5]:
import pandas as pd
import json
from joblib import Parallel, delayed

# Paths
input_filename = "relation_dataset.csv"
output_filename = "updated_relation_dataset.csv"

# Chunk size for processing in batches
chunk_size = 100  # Number of rows per chunk

# Function to filter and simplify the relations
def simplify_relations(relations_json):
    """
    Simplifies the relations by keeping only relevant fields and removing duplicates.

    Parameters:
        relations_json (str): JSON string of relations.

    Returns:
        List[dict]: Simplified list of relations with only 'start', 'end', and 'weight'.
    """
    try:
        relations = json.loads(relations_json)  # Parse the JSON string
        unique_relations = {}
        for rel in relations:
            key = (rel.get("start", {}).get("label", "unknown"),
                   rel.get("end", {}).get("label", "unknown"),
                   rel.get("weight", 0))
            unique_relations[key] = {"start": key[0], "end": key[1], "weight": key[2]}
        return json.dumps(list(unique_relations.values()))
    except (json.JSONDecodeError, TypeError):
        return "[]"  # Return empty list if parsing fails

# Process dataset in chunks with joblib
def process_in_parallel_joblib(input_file, output_file, chunk_size):
    """
    Processes the input dataset in parallel using joblib and writes the updated dataset incrementally.

    Parameters:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.
        chunk_size (int): Number of rows per chunk.
    """
    print(f"[INFO] Processing dataset in chunks of {chunk_size} rows...")

    # Remove existing output file if exists
    if os.path.exists(output_file):
        os.remove(output_file)

    is_header_written = False  # Flag to track header writing

    with pd.read_csv(input_file, chunksize=chunk_size) as reader:
        for chunk_idx, chunk in enumerate(reader, start=1):
            print(f"[INFO] Processing chunk {chunk_idx}...")

            if "relations" not in chunk.columns:
                raise ValueError("[ERROR] 'relations' column not found in the dataset.")

            # Parallel processing with joblib
            chunk["relations"] = Parallel(n_jobs=-1)(
                delayed(simplify_relations)(relations) for relations in chunk["relations"]
            )

            # Write the processed chunk incrementally
            chunk.to_csv(output_file, index=False, mode="a", header=not is_header_written)
            is_header_written = True  # Ensure headers are written only once

    print(f"[INFO] Updated dataset saved successfully as '{output_file}'.")

# Run the function in Jupyter
process_in_parallel_joblib(input_filename, output_filename, chunk_size)


[INFO] Processing dataset in chunks of 100 rows...
[INFO] Processing chunk 1...
[INFO] Processing chunk 2...
[INFO] Processing chunk 3...
[INFO] Processing chunk 4...
[INFO] Processing chunk 5...
[INFO] Processing chunk 6...
[INFO] Processing chunk 7...
[INFO] Processing chunk 8...
[INFO] Processing chunk 9...
[INFO] Processing chunk 10...
[INFO] Processing chunk 11...
[INFO] Processing chunk 12...
[INFO] Processing chunk 13...
[INFO] Processing chunk 14...
[INFO] Processing chunk 15...
[INFO] Processing chunk 16...
[INFO] Processing chunk 17...
[INFO] Processing chunk 18...
[INFO] Processing chunk 19...
[INFO] Processing chunk 20...
[INFO] Processing chunk 21...
[INFO] Processing chunk 22...
[INFO] Processing chunk 23...
[INFO] Processing chunk 24...
[INFO] Processing chunk 25...
[INFO] Processing chunk 26...
[INFO] Processing chunk 27...
[INFO] Processing chunk 28...
[INFO] Processing chunk 29...
[INFO] Processing chunk 30...
[INFO] Processing chunk 31...
[INFO] Processing chunk 32..

In [None]:
# # Condense the dataset to around 100,000 rows with equal distribution across the 'type' column
# desired_rows_per_type = 25000  # Approximate number of rows per type
# grouped = dataset.groupby('type')  # Group the dataset by the 'type' column

# # Sample rows equally from each type
# condensed_dataset = grouped.apply(
#     lambda x: x.sample(n=min(len(x), desired_rows_per_type), random_state=42)
# ).reset_index(drop=True)

# print("[INFO] Condensed dataset created with equal distribution across types.")

# # Save the condensed dataset to a CSV file
# output_filename = "relation_dataset.csv"
# condensed_dataset.to_csv(output_filename, index=False, encoding='utf-8')
# print(f"[INFO] Condensed dataset saved successfully as '{output_filename}'.")

# # Update the dataset variable to the condensed dataset
# dataset = condensed_dataset

# # Print the first few rows of the condensed dataset
# print(dataset.head())


In [None]:
print(dataset.head())

In [10]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import json
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from collections import defaultdict

# Ensure that tqdm works well in Jupyter
tqdm.pandas()

# Define file paths
UPDATED_RELATION_DATASET_PATH = "./data/updated_relation_dataset.csv"
NUMBERBATCH_TXT_PATH = "./numberbatch-en-19.08.txt"  # Path to your specific Numberbatch file
NUMBERBATCH_NPY_PATH = "./numberbatch-en-19.08.npy"  # Path to save the converted .npy file
CLEANED_CONCEPT_CACHE_PATH = "./cleaned_concept_cache.json"
FINAL_PARQUET_PATH = "./final_dataset_with_embeddings.parquet"

# Define embedding dimension
EMBED_DIM = 300

# Step 1: Convert Numberbatch Embeddings to NumPy Format
def convert_numberbatch_to_npy(txt_file_path, output_npy_path, embed_dim=300):
    """
    Converts ConceptNet Numberbatch embeddings from text format to NumPy format.

    Parameters:
        txt_file_path (str): Path to the `numberbatch-en-19.08.txt` file.
        output_npy_path (str): Path to save the `numberbatch-en-19.08.npy` file.
        embed_dim (int): Dimension of the embeddings.
    """
    print("[INFO] Converting Numberbatch embeddings from text to NumPy format...")
    embeddings_dict = {}
    with open(txt_file_path, "r", encoding="utf-8") as f:
        for idx, line in enumerate(tqdm(f, desc="Converting embeddings")):
            if idx == 0:  # Skip the first line (header)
                continue
            values = line.strip().split()
            if len(values) != embed_dim + 1:
                continue  # Skip lines that don't have the correct number of dimensions
            word = values[0].lower()
            try:
                vector = np.array(values[1:], dtype=np.float32)
                embeddings_dict[word] = vector
            except ValueError:
                # Skip lines with invalid float values
                continue
            if idx % 100000 == 0 and idx != 0:
                print(f"[INFO] Processed {idx} embeddings...")
    np.save(output_npy_path, embeddings_dict)
    print(f"[INFO] Embeddings saved to {output_npy_path}.")

# Check if .npy file exists; if not, convert the text file
if not os.path.exists(NUMBERBATCH_NPY_PATH):
    if os.path.exists(NUMBERBATCH_TXT_PATH):
        convert_numberbatch_to_npy(NUMBERBATCH_TXT_PATH, NUMBERBATCH_NPY_PATH, embed_dim=EMBED_DIM)
    else:
        raise FileNotFoundError(f"[ERROR] Numberbatch text file not found at {NUMBERBATCH_TXT_PATH}")
else:
    print(f"[INFO] Pre-converted Numberbatch embeddings found at {NUMBERBATCH_NPY_PATH}.")

# Step 2: Load the Updated Relation Dataset
print("\n[INFO] Loading the updated relation dataset...")
if os.path.exists(UPDATED_RELATION_DATASET_PATH):
    dataset = pd.read_csv(UPDATED_RELATION_DATASET_PATH)
    print(f"[INFO] Dataset loaded successfully. Total rows: {len(dataset)}")
else:
    raise FileNotFoundError(f"[ERROR] Dataset file not found at {UPDATED_RELATION_DATASET_PATH}")

# Display basic information about the dataset
print("\n[INFO] Dataset Info:")
print(dataset.info())

print("\n[INFO] Sample Rows:")
display(dataset.head())

# Step 3: Load the Cleaned Concept Cache
print("\n[INFO] Loading the cleaned concept cache...")
if os.path.exists(CLEANED_CONCEPT_CACHE_PATH):
    with open(CLEANED_CONCEPT_CACHE_PATH, 'r', encoding='utf-8') as f:
        concept_cache = json.load(f)
    print(f"[INFO] Concept cache loaded successfully. Total concepts cached: {len(concept_cache)}")
    
    # Verify the structure of the concept_cache
    print("\n[INFO] Sample Concept Cache Entries:")
    sample_concepts = list(concept_cache.keys())[:2]
    for concept in sample_concepts:
        print(f"Concept: {concept}")
        print(f"Relations: {concept_cache[concept]}\n")
else:
    raise FileNotFoundError(f"[ERROR] Concept cache file not found at {CLEANED_CONCEPT_CACHE_PATH}")

# Step 4: Load the Pre-converted Numberbatch Embeddings
print("\n[INFO] Loading pre-converted Numberbatch embeddings...")
if os.path.exists(NUMBERBATCH_NPY_PATH):
    embeddings_dict = np.load(NUMBERBATCH_NPY_PATH, allow_pickle=True).item()
    print(f"[INFO] Embeddings loaded successfully. Total embeddings: {len(embeddings_dict)}")
else:
    raise FileNotFoundError(f"[ERROR] Embeddings file not found at {NUMBERBATCH_NPY_PATH}")

# Step 5: Define Functions for Aggregation

def parse_concepts(concepts_str):
    """
    Parses the 'concepts' column from string to list.

    Parameters:
        concepts_str (str): String representation of concepts.

    Returns:
        List[str]: List of concepts.
    """
    if isinstance(concepts_str, str):
        try:
            # Replace single quotes with double quotes for valid JSON
            concepts = json.loads(concepts_str.replace("'", '"'))
            return concepts
        except json.JSONDecodeError:
            # Fallback: split by comma
            return [item.strip() for item in concepts_str.split(',')]
    elif isinstance(concepts_str, list):
        return concepts_str
    else:
        return []

def aggregate_embeddings(concepts, embeddings_dict, embed_dim=300):
    """
    Aggregates embeddings for a list of concepts by computing their mean.

    Parameters:
        concepts (List[str]): List of concepts.
        embeddings_dict (dict): Preloaded embeddings dictionary.
        embed_dim (int): Dimension of the embeddings.

    Returns:
        np.ndarray: Aggregated embedding vector.
    """
    vectors = []
    for concept in concepts:
        # Normalize the concept key
        key = concept.lower().replace(" ", "_")
        vector = embeddings_dict.get(key)
        if vector is not None:
            vectors.append(vector)
    if vectors:
        aggregated = np.mean(vectors, axis=0)
    else:
        aggregated = np.zeros(embed_dim, dtype=np.float32)
    return aggregated

def get_all_relation_types(relations_dict):
    """
    Extracts all unique relation types from the relations dictionary.

    Parameters:
        relations_dict (dict): Dictionary mapping concepts to their relations.

    Returns:
        List[str]: List of unique relation types.
    """
    relation_types = set()
    for relations in relations_dict.values():
        for relation in relations:
            if isinstance(relation, dict):
                rel_type = relation.get('relation')
            elif isinstance(relation, (list, tuple)) and len(relation) > 0:
                rel_type = relation[0]  # Assuming first element is the type
            else:
                rel_type = None
            if rel_type:
                relation_types.add(rel_type)
    return sorted(list(relation_types))

def aggregate_relations(relations_json, relation_types):
    """
    Aggregates relation counts for a list of relations.

    Parameters:
        relations_json (str): JSON string of relations.
        relation_types (List[str]): List of all possible relation types.

    Returns:
        np.ndarray: Array of counts corresponding to each relation type.
    """
    relation_counts = defaultdict(int)
    try:
        relations = json.loads(relations_json)
        for rel in relations:
            if isinstance(rel, dict):
                relation_type = rel.get('relation')
            elif isinstance(rel, (list, tuple)) and len(rel) > 0:
                relation_type = rel[0]  # Assuming first element is the type
            else:
                relation_type = None
            if relation_type in relation_types:
                relation_counts[relation_type] += 1
    except json.JSONDecodeError:
        pass  # Return counts as zero if JSON is invalid
    # Create a count vector
    counts = [relation_counts[rel] for rel in relation_types]
    return np.array(counts, dtype=np.float32)

# Step 6: Extract All Unique Relation Types
print("\n[INFO] Extracting all unique relation types...")
unique_relation_types = get_all_relation_types(concept_cache)
print(f"[INFO] Total unique relation types: {len(unique_relation_types)}")
print(f"[INFO] Sample relation types: {unique_relation_types[:10]}")

# If no relation types found, inspect the concept_cache
if len(unique_relation_types) == 0:
    print("\n[WARNING] No relation types found. Inspecting the structure of concept_cache.json...")
    for concept, relations in list(concept_cache.items())[:2]:
        print(f"Concept: {concept}")
        print(f"Relations: {relations}\n")
    print("[INFO] Proceeding without aggregating relation features.")
    # Proceed without relation features

# Step 7: Sample 10,000 Rows per 'type'
print("\n[INFO] Sampling 10,000 rows per 'type' to create a balanced subset...")
desired_rows_per_type = 10000

# Get unique types
unique_types = dataset['type'].unique()
print(f"[INFO] Unique types found: {unique_types}")

# Initialize an empty DataFrame for the sampled data
sampled_dataset = pd.DataFrame()

# Iterate over each type and sample rows
for t in unique_types:
    type_subset = dataset[dataset['type'] == t]
    n_samples = min(desired_rows_per_type, len(type_subset))
    sampled_subset = type_subset.sample(n=n_samples, random_state=42)
    sampled_dataset = pd.concat([sampled_dataset, sampled_subset], ignore_index=True)
    print(f"[INFO] Sampled {n_samples} rows for type '{t}'.")

print(f"[INFO] Total sampled rows: {len(sampled_dataset)}")

# Replace the original dataset with the sampled dataset for further processing
dataset = sampled_dataset

# Display information about the sampled dataset
print("\n[INFO] Sampled Dataset Info:")
print(dataset.info())

print("\n[INFO] Sampled Dataset Sample Rows:")
display(dataset.head())

# Step 8: Parse the 'concepts' Column
print("\n[INFO] Parsing 'concepts' column from string to list...")
dataset['concepts'] = dataset['concepts'].apply(parse_concepts)
print("[INFO] Parsing completed.")

# Step 9: Aggregate Embeddings for Each Row
print("\n[INFO] Aggregating ConceptNet embeddings for each row...")

# Define a function to apply embeddings aggregation
def apply_aggregate_embeddings(concepts):
    return aggregate_embeddings(concepts, embeddings_dict, embed_dim=EMBED_DIM)

# Use joblib's Parallel for faster processing
conceptnet_embeddings = Parallel(n_jobs=-1)(
    delayed(apply_aggregate_embeddings)(concepts) for concepts in tqdm(dataset['concepts'], desc="Aggregating Embeddings")
)

# Convert list to NumPy array
conceptnet_embeddings = np.vstack(conceptnet_embeddings)
print(f"[INFO] Aggregated embeddings shape: {conceptnet_embeddings.shape}")

# Add embeddings to the dataset
print("[INFO] Adding aggregated embeddings to the dataset...")
for i in range(EMBED_DIM):
    dataset[f'embedding_{i}'] = conceptnet_embeddings[:, i]
print("[INFO] Embeddings added successfully.")

# Step 10: Aggregate Relation Features for Each Row (If Available)
if unique_relation_types:
    print("\n[INFO] Aggregating relation features for each row...")
    
    # Define a function to apply relations aggregation
    def apply_aggregate_relations(relations_json):
        return aggregate_relations(relations_json, unique_relation_types)
    
    # Use joblib's Parallel for faster processing
    relation_features = Parallel(n_jobs=-1)(
        delayed(apply_aggregate_relations)(relations_json) for relations_json in tqdm(dataset['relations'], desc="Aggregating Relations")
    )
    
    # Convert list to NumPy array
    relation_features = np.vstack(relation_features)
    print(f"[INFO] Aggregated relations shape: {relation_features.shape}")
    
    # Add relation features to the dataset
    print("[INFO] Adding relation features to the dataset...")
    for idx, rel_type in enumerate(unique_relation_types):
        # Sanitize column names
        rel_sanitized = rel_type.replace(" ", "_").replace("/", "_").replace("-", "_")
        dataset[f'relation_{rel_sanitized}'] = relation_features[:, idx]
    print("[INFO] Relation features added successfully.")
else:
    print("\n[INFO] Skipping relation features aggregation as no relation types were found.")

# Step 11: (Optional) Drop 'concepts' and 'relations' Columns to Optimize Storage
print("\n[INFO] Dropping 'concepts' and 'relations' columns to optimize storage...")
columns_to_drop = ['concepts', 'relations']
existing_columns_to_drop = [col for col in columns_to_drop if col in dataset.columns]
dataset = dataset.drop(columns=existing_columns_to_drop)
print("[INFO] Columns dropped successfully.")

# Step 12: Save the Final DataFrame in Parquet Format
print(f"\n[INFO] Saving the final dataset to '{FINAL_PARQUET_PATH}' in Parquet format...")
dataset.to_parquet(FINAL_PARQUET_PATH, compression='snappy')
print("[INFO] Final dataset saved successfully.")

# Step 13: Verify the Saved Parquet File
print(f"\n[INFO] Loading the saved Parquet file '{FINAL_PARQUET_PATH}' to verify...")
loaded_dataset = pd.read_parquet(FINAL_PARQUET_PATH)
print(f"[INFO] Loaded dataset shape: {loaded_dataset.shape}")

print("\n[INFO] Displaying the first few rows of the loaded dataset:")
display(loaded_dataset.head())

# Step 14: Access Embedding and Relation Features
# Example: Accessing the first 5 embedding columns and relation columns
embedding_columns = [f'embedding_{i}' for i in range(EMBED_DIM)]
relation_columns = [col for col in loaded_dataset.columns if col.startswith('relation_')]

print(f"\n[INFO] Sample Embedding Columns: {embedding_columns[:5]}")
print(f"[INFO] Sample Relation Columns: {relation_columns[:5]}")

# Accessing embeddings and relations for the first row
if embedding_columns:
    first_row_embedding = loaded_dataset.loc[0, embedding_columns].values
    print("\n[INFO] First Row Embedding Vector:")
    print(first_row_embedding)

if relation_columns:
    first_row_relations = loaded_dataset.loc[0, relation_columns].values
    print("\n[INFO] First Row Relation Counts:")
    print(first_row_relations)


[INFO] Pre-converted Numberbatch embeddings found at ./numberbatch-en-19.08.npy.

[INFO] Loading the updated relation dataset...
[INFO] Dataset loaded successfully. Total rows: 100000

[INFO] Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text       100000 non-null  object
 1   label      100000 non-null  int64 
 2   type       100000 non-null  object
 3   concepts   100000 non-null  object
 4   relations  100000 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None

[INFO] Sample Rows:


Unnamed: 0,text,label,type,concepts,relations
0,Good he answer ok allow trial worker.,0,body punchlines,"['trial worker', 'he']",[]
1,I met the world's tallest and the world's heav...,1,body punchlines,"['i', 'the other day']",[]
2,I was at a barber shop in Bangkok and I asked ...,1,body punchlines,"['rainbow swirls', 'me,""well', 'bangkok', 'the...",[]
3,Its build card time factor.,0,body punchlines,['its build card time factor'],[]
4,A cashier asks a French guy would you like a b...,1,body punchlines,"['french', 'the french guy', 'this year', 'a c...",[]



[INFO] Loading the cleaned concept cache...
[INFO] Concept cache loaded successfully. Total concepts cached: 46

[INFO] Sample Concept Cache Entries:
Concept: 9
Relations: [{'start': 'cardinal', 'end': '9', 'relation': 'wn', 'surfaceText': '[[cardinal]] is similar to [[9]]', 'weight': 2.0}, {'start': '9', 'end': 'a number', 'relation': 'number', 'surfaceText': '[[9]] is [[a number]]', 'weight': 2.0}, {'start': 'nine', 'end': '9', 'relation': 'wn', 'surfaceText': '[[nine]] is a synonym of [[9]]', 'weight': 2.0}, {'start': '9', 'end': 'nine', 'relation': 'quantity', 'surfaceText': '[[9]] is a synonym of [[nine]]', 'weight': 2.0}, {'start': 'ix', 'end': '9', 'relation': 'wn', 'surfaceText': '[[ix]] is a synonym of [[9]]', 'weight': 2.0}]

Concept: 1
Relations: [{'start': '1', 'end': 'a number', 'relation': 'number', 'surfaceText': '[[1]] is [[a number]]', 'weight': 4.47213595499958}, {'start': '1s', 'end': '1', 'relation': '1', 'surfaceText': None, 'weight': 2.0}, {'start': '1', 'end': '

Unnamed: 0,text,label,type,concepts,relations
0,Whats red and white and sits in a tree? A sani...,1,body punchlines,"['what', 'a sanitary owl', 'a tree']",[]
1,Discover idea traditional like another reach t...,0,body punchlines,"['another', 'discover idea', 'task treatment']",[]
2,Seat center spend place.,0,body punchlines,"['seat center', 'place']",[]
3,Place rather about garden reality local can in...,0,body punchlines,['garden reality'],[]
4,I ve been cleaning my shoes with Vodka removed,1,body punchlines,"['i', 'vodka', 'my shoes']",[]



[INFO] Parsing 'concepts' column from string to list...
[INFO] Parsing completed.

[INFO] Aggregating ConceptNet embeddings for each row...


Aggregating Embeddings:   0%|          | 0/40000 [00:00<?, ?it/s]


KeyboardInterrupt



In [11]:
# Cell 3: Define Parameters
MAX_LENGTH = 64
BATCH_SIZE = 32  # Adjusted for memory considerations
EPOCHS_SHARED = 3  # Number of epochs for shared BERT fine-tuning
EPOCHS_PRIVATE = 3  # Number of epochs for private BERT fine-tuning
LEARNING_RATE = 0.00001
NUM_LABELS = 2  # Humorous or not
print("[INFO] Training parameters defined.")


[INFO] Training parameters defined.


In [14]:
# Cell 4: Paths to Models and Data

# Paths to models
SHARED_MODEL_PATH = 'models/shared_private_model.pt'    # Using pretrained BERT as shared model
PRIVATE_MODEL_PATH = 'models/bert_mlm'   # Using same pretrained BERT for all private layers
TOKENIZER_NAME = 'bert-base-uncased'       # Name of your tokenizer

# Path to dataset Parquet file
PARQUET_FILE_PATH = './data/final_dataset_with_embeddings.parquet'

print("[INFO] Paths to models and data defined.")


[INFO] Paths to models and data defined.


In [None]:
# Cell 5: Load and Split Data for only two datasets
train_df, val_df, test_df, humor_type_to_idx = sharedprivate_load_and_split_data(PARQUET_FILE_PATH)

# Original list of humor types
humor_types = list(humor_type_to_idx.keys())

# Since we're only training on 'humicroedit' and 'shortjokes', filter the datasets
desired_humor_types = ['humicroedit', 'shortjokes']

# Filter the DataFrames
train_df = train_df[train_df['type'].isin(desired_humor_types)].reset_index(drop=True)
val_df = val_df[val_df['type'].isin(desired_humor_types)].reset_index(drop=True)
test_df = test_df[test_df['type'].isin(desired_humor_types)].reset_index(drop=True)

# Update humor_type_to_idx to include only desired types
humor_type_to_idx = {ht: idx for idx, ht in enumerate(sorted(set(desired_humor_types)))}

In [16]:
# Cell 6: Create Datasets and Dataloaders
print("Creating datasets and dataloaders...")

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)

# Create datasets
train_dataset = HumorDataset(train_df, TOKENIZER_NAME, MAX_LENGTH)
val_dataset = HumorDataset(val_df, TOKENIZER_NAME, MAX_LENGTH)
test_dataset = HumorDataset(test_df, TOKENIZER_NAME, MAX_LENGTH)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

# Print dataset sizes
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")



Creating datasets and dataloaders...
Number of training samples: 28000
Number of validation samples: 6000
Number of test samples: 6000



In [23]:
# Cell 8: Train Shared-Private Models for Each Humor Dataset
print("\n--- Training Shared-Private Models for Each Humor Dataset ---")

# Define the directory to save shared-private models
shared_private_models_dir = './models/updated_shared_private'
if not os.path.exists(shared_private_models_dir):
    os.makedirs(shared_private_models_dir)
    print(f"[INFO] Created directory '{shared_private_models_dir}' for saving shared-private models.")
else:
    print(f"[INFO] Directory '{shared_private_models_dir}' already exists.")

for dataset_name in humor_types:
    print(f"\n--- Training on '{dataset_name}' dataset ---")
    
    # Filter data for the current humor type
    train_subset = train_df[train_df['type'] == dataset_name]
    val_subset = val_df[val_df['type'] == dataset_name]
    test_subset = test_df[test_df['type'] == dataset_name]

    # Create datasets and dataloaders
    train_dataset_subset = HumorDataset(train_subset, TOKENIZER_NAME, MAX_LENGTH)
    val_dataset_subset = HumorDataset(val_subset, TOKENIZER_NAME, MAX_LENGTH)
    test_dataset_subset = HumorDataset(test_subset, TOKENIZER_NAME, MAX_LENGTH)

    train_loader_subset = DataLoader(train_dataset_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
    val_loader_subset = DataLoader(val_dataset_subset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
    test_loader_subset = DataLoader(test_dataset_subset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    # Initialize a new SharedPrivateModel for the current dataset
    model = SharedPrivateModel(
        shared_model_path=SHARED_MODEL_PATH,  # Path to the shared BERT fine-tuned model
        private_model_paths=private_model_paths,   # Same private BERT paths
        num_labels=NUM_LABELS
    )
    model = model.to(device)
    print("[INFO] SharedPrivateModel initialized for current dataset and moved to device.")

    # Load the best shared-private model
    print(f"[INFO] Loading shared BERT fine-tuned model from '{best_shared_model_path}'.")
    model = load_model_weights(model, best_shared_model_path, device)

    # Define save path for the current model
    sanitized_dataset_name = dataset_name.replace(" ", "_")
    model_save_path = os.path.join(shared_private_models_dir, f'shared_private_model_{sanitized_dataset_name}.pt')

    # Train the model on the specific humor dataset
    print(f"Starting training for '{dataset_name}'...")
    model = sharedprivate_train_private_model(
        model=model,
        train_loader=train_loader_subset,
        val_loader=val_loader_subset,
        test_loader=test_loader_subset,
        epochs=EPOCHS_PRIVATE,
        learning_rate=LEARNING_RATE,
        device=device,
        save_dir=shared_private_models_dir,
        save_interval=1  # Save model every epoch
    )

    # Save the trained model
    final_model_save_path = os.path.join(shared_private_models_dir, f'shared_private_model_{sanitized_dataset_name}.pt')
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE).state_dict(),
        # Include other items if necessary
    }, final_model_save_path)
    print(f"Model for '{dataset_name}' training complete and saved to {final_model_save_path}")



--- Training Shared-Private Models for Each Humor Dataset ---

[INFO] Directory './models/updated_shared_private' already exists.

--- Training on 'humicroedit' dataset ---

[INFO] SharedPrivateModel initialized for current dataset and moved to device.
[INFO] Loading shared BERT fine-tuned model from './models/shared_bert_finetuned/shared_private_best_model_epoch_3.pt'.
[INFO] Loaded 'model_state_dict' from checkpoint.
Starting training for 'humicroedit'...

--- Epoch 1/3 ---
Training batches: 100%|███████████████████████| 875/875 [57:06<00:00,  2.57s/it]
Epoch 1 - Loss: 0.0045, Acc: 0.9350
Validation Metrics - Accuracy: 0.9380, Precision: 0.9365, Recall: 0.9380, F1-Score: 0.9368
[INFO] Saving checkpoint to: ./models/updated_shared_private/shared_private_private_epoch_1.pt
[INFO] New best model found. Saving to: ./models/shared_private_models/shared_private_private_best_model_epoch_1.pt

--- Epoch 2/3 ---
Training batches: 100%|███████████████████████| 875/875 [56:50<00:00,  2.54s/it]

In [20]:

criterion = nn.CrossEntropyLoss()

# Evaluate all trained models
for dataset_name in humor_types:
    print(f"\n--- Evaluating model for '{dataset_name}' dataset ---")
    model_save_path = os.path.join(shared_private_models_dir, f'shared_private_model_{dataset_name.replace(" ", "_")}.pt')
    model = SharedPrivateModel(
        shared_model_path=SHARED_MODEL_PATH,
        private_model_paths=private_model_paths,
        num_labels=NUM_LABELS
    )
    model = load_model_weights(model, model_save_path, device)
    model = model.to(device)
    
    # Define test loader for the current dataset
    test_subset = test_df[test_df['type'] == dataset_name]
    test_dataset_subset = HumorDataset(test_subset, TOKENIZER_NAME, MAX_LENGTH)
    test_loader_subset = DataLoader(test_dataset_subset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
    
    # Evaluate the model
    test_metrics = evaluate_model(model, test_loader_subset, criterion, device)



--- Evaluating model for 'humicroedit' dataset ---
[INFO] Loaded 'model_state_dict' from checkpoint.
[INFO] Loaded 'model_state_dict' from checkpoint.

[INFO] Evaluating the model on the test set...
Evaluating batches: 100%|█████████████████████| 188/188 [01:00<00:00,  1.00s/it]
Test Metrics - Loss: 0.8136, Accuracy: 0.8136, Precision: 0.8136, Recall: 0.8136, F1-Score: 0.8136

--- Evaluating model for 'shortjokes' dataset ---
[INFO] Loaded 'model_state_dict' from checkpoint.
[INFO] Loaded 'model_state_dict' from checkpoint.

[INFO] Evaluating the model on the test set...
Evaluating batches: 100%|█████████████████████| 188/188 [00:58<00:00,  0.98s/it]
Test Metrics - Loss: 0.9877, Accuracy: 0.9877, Precision: 0.9877, Recall: 0.9877, F1-Score: 0.9877



UsageError: Cell magic `%%` not found.
