In [None]:
import ast
import json
import re
from collections import Counter
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from category import categories, multi_word_keywords



def clean_lyrics(lyrics, phrases=None):
    """Normalise raw lyrics into a lowercase, punctuation-free string.

    Steps, in order: lowercase the text, drop possessive ``'s``, drop
    apostrophes that sit inside a word (``don't`` -> ``dont``), join known
    multi-word keywords with underscores so they survive whitespace
    tokenisation, and finally strip every character that is neither a word
    character nor whitespace.

    Args:
        lyrics: Raw lyrics. Any value is accepted because it is passed through
            ``str()`` first (``None`` becomes ``"none"``).
        phrases: Optional iterable of multi-word keywords to protect. When
            omitted, the module-level ``multi_word_keywords`` is used.

    Returns:
        The cleaned lyrics string.
    """
    if phrases is None:
        phrases = multi_word_keywords

    text = str(lyrics).lower()  # str() guards against non-string cells (NaN, numbers)
    text = re.sub(r'\b(\w+)\'s\b', r'\1', text)  # possessives: cat's -> cat
    text = re.sub(r'\b(\w+)\'\b', r'\1', text)  # apostrophe inside a word: don't -> dont

    # Longest phrases first, so "ice cream cone" is not pre-empted by "ice cream".
    for phrase in sorted(set(phrases), key=lambda p: (-len(p), p)):
        # Cheap substring test skips the regex for phrases that cannot match.
        if ' ' not in phrase or phrase not in text:
            continue
        joined = phrase.replace(' ', '_')
        # Lookarounds require whole-word matches; a plain str.replace would
        # also rewrite "nice creamy" into "nice_creamy".
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
        text = re.sub(pattern, lambda _match, joined=joined: joined, text)

    # Underscores count as word characters, so joined phrases are preserved here.
    return re.sub(r'[^\w\s]', '', text)


def assign_categories_and_words(lyrics, category_keywords=None, min_matches=1):
    """Label cleaned lyrics with every category whose keywords appear in them.

    Lyrics are split on whitespace and de-duplicated; underscores in a token
    are turned back into spaces so that underscore-joined phrases can be
    compared with multi-word keywords.

    Args:
        lyrics: Cleaned lyrics string (whitespace-separated tokens).
        category_keywords: Optional mapping of category name to an iterable of
            keywords. When omitted, the module-level ``categories`` is used.
        min_matches: Minimum number of distinct matching keywords required to
            assign a category. Values below 1 are treated as 1.

    Returns:
        A ``(assigned_categories, category_words)`` tuple of parallel lists:
        the category names (in mapping order) and, for each, the alphabetically
        sorted keywords that matched. When nothing matches, both elements are
        ``["None"]``.
    """
    if category_keywords is None:
        category_keywords = categories
    threshold = max(1, min_matches)

    # Sorted so the reported words are reproducible between runs; iterating a
    # set of strings directly gives an order that varies with hash seeding.
    tokens = sorted({token.replace('_', ' ') for token in lyrics.split()})

    assigned_categories = []
    category_words = []
    for category, keywords in category_keywords.items():
        keyword_set = set(keywords)  # O(1) membership instead of scanning a list per token
        matched_words = [token for token in tokens if token in keyword_set]
        if len(matched_words) >= threshold:
            assigned_categories.append(category)
            category_words.append(matched_words)

    if not assigned_categories:
        return ["None"], ["None"]
    return assigned_categories, category_words

def preprocess_dataset(input_file, output_file, lyrics_column='Lyrics'):
    """Clean a lyrics CSV, label each song with categories, and save the result.

    Args:
        input_file: Path of the CSV to read. It must contain ``lyrics_column``
            plus ``Song``, ``Artist`` and ``Genre`` columns.
        output_file: Path the processed CSV is written to.
        lyrics_column: Name of the column holding the raw lyrics.

    Returns:
        The processed DataFrame with columns ``Song``, ``Artist``, ``Genre``,
        ``categories`` (a list of category names per row), ``category_words``
        (a JSON string per row) and ``cleaned_lyrics``; or ``None`` when the
        input is missing, lacks required columns, or any other error occurs
        (a message is printed in each case).
    """
    try:
        df = pd.read_csv(input_file)
        print(f"Total songs in dataset: {input_file}: {len(df)}")

        required_columns = [lyrics_column, 'Song', 'Artist', 'Genre']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Error: Missing columns {missing_columns} in {input_file}")
            print("Available columns:", df.columns.tolist())
            return None

        df['cleaned_lyrics'] = df[lyrics_column].apply(clean_lyrics)

        # Label every song once, then split the (categories, words) pairs into
        # two columns. Explicit object Series keep each cell a Python list and
        # also work for an empty frame.
        assignments = [assign_categories_and_words(text) for text in df['cleaned_lyrics']]
        df['categories'] = pd.Series(
            [labels for labels, _ in assignments], index=df.index, dtype=object
        )
        # Matched words are stored as JSON text so they survive the CSV round trip.
        df['category_words'] = pd.Series(
            [json.dumps(words) if words != ["None"] else "[]" for _, words in assignments],
            index=df.index,
            dtype=object,
        )

        # Lists are unhashable, so value_counts() must run on their string form.
        label_keys = df['categories'].map(str)
        has_no_category = label_keys == str(["None"])
        print("\nLabel distribution in processed dataset:")
        print(label_keys.value_counts())
        print(f"Number of songs with categories (non-['None']): {int((~has_no_category).sum())}")
        print(f"Number of songs with no categories (['None']): {int(has_no_category.sum())}")

        # Keep only the requested columns; copy() lets callers add or replace
        # columns without pandas chained-assignment warnings.
        output_columns = ['Song', 'Artist', 'Genre', 'categories', 'category_words', 'cleaned_lyrics']
        df = df[output_columns].copy()

        df.to_csv(output_file, index=False)
        print(f"Processed dataset saved to {output_file}")
        print(f"Total songs in processed dataset: {len(df)}")
        return df

    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
        return None
    except Exception as e:
        # Callers rely on a None return rather than an exception, so every
        # other failure is reported here and converted to None.
        print(f"An error occurred: {str(e)}")
        return None
    
def print_most_wrongly_predicted_labels(test_df, y_test, y_pred, mlb, top_n=3, examples_per_category=5):
    """Report which categories are mispredicted most often, with example songs.

    A category counts as mispredicted for a song when it is predicted but not
    true (false positive) or true but not predicted (false negative).

    Args:
        test_df: Test DataFrame; the ``Song``, ``Artist``, ``Genre`` and
            ``category_words`` columns are read by row position.
        y_test: True label indicator matrix, row-aligned with ``test_df``.
        y_pred: Predicted label indicator matrix, row-aligned with ``test_df``.
        mlb: Fitted binarizer whose ``inverse_transform`` maps indicator rows
            back to label names.
        top_n: Number of most-mispredicted categories to report.
        examples_per_category: Maximum example songs printed per category.

    Returns:
        None. All results are printed.
    """
    print("\nAnalyzing most wrongly predicted labels...")
    error_tally = Counter()
    error_records = []

    for row in range(len(test_df)):
        actual = set(mlb.inverse_transform(y_test[row:row+1])[0])
        guessed = set(mlb.inverse_transform(y_pred[row:row+1])[0])

        spurious = guessed - actual  # predicted, but not a true label
        missed = actual - guessed  # true label, but not predicted
        wrong = spurious.union(missed)
        if not wrong:
            continue

        error_tally.update(wrong)
        error_records.append({
            'song': test_df['Song'].iloc[row],
            'artist': test_df['Artist'].iloc[row],
            'genre': test_df['Genre'].iloc[row],
            'true_labels': actual,
            'pred_labels': guessed,
            'category_words': test_df['category_words'].iloc[row],
            'incorrect_labels': wrong
        })

    if not error_tally:
        print("No mispredictions found in the test set.")
        return

    worst = error_tally.most_common(top_n)

    print(f"\nTop {top_n} most frequently mispredicted categories:")
    for label, count in worst:
        print(f"Category: {label}, Mispredicted {count} times")
        # Collect every song that got this label wrong, then cap the sample.
        offenders = [rec for rec in error_records if label in rec['incorrect_labels']]
        print("Example songs with mispredictions:")
        for rec in offenders[:examples_per_category]:
            print(f"  Song: {rec['song']}")
            print(f"  Artist: {rec['artist']}")
            print(f"  Genre: {rec['genre']}")
            print(f"  True Categories: {rec['true_labels']}")
            print(f"  Predicted Categories: {rec['pred_labels']}")
            print(f"  Relevant Words: {rec['category_words']}")
            print()

    # Closing hint: show the first five keywords of each reported category.
    print("Recommendation: Add more training songs containing keywords from these categories:")
    for label, _ in worst:
        sample_keywords = categories[label][:5]
        print(f"  - {label}: e.g., {', '.join(sample_keywords)}")

def _parse_category_labels(raw):
    """Turn one stored ``categories`` cell into a plain list of label names.

    Cells read back from CSV are strings such as ``"['Animals', 'Food']"``;
    freshly processed frames already hold lists. The ``['None']`` placeholder
    and anything that is not a list-like value (e.g. NaN) become ``[]``.
    """
    if isinstance(raw, str):
        # literal_eval only accepts Python literals, so a crafted CSV cell
        # cannot execute code the way eval() would allow.
        raw = ast.literal_eval(raw)
    if not isinstance(raw, (list, tuple, set)):
        return []
    labels = list(raw)
    return [] if labels == ['None'] else labels


def train_multi_label_classifier(train_file, test_file, model_output_path, vectorizer_output_path,
                                 processed_train_path="../data/processed_train_dataset.csv"):
    """Train a multi-label lyric classifier on TF-IDF features and save it.

    The training CSV is preprocessed with ``preprocess_dataset``; the test CSV
    is expected to be already processed. After training, evaluation metrics,
    a misprediction report and a few example predictions are printed, and the
    fitted classifier and vectorizer are written with ``joblib``.

    Args:
        train_file: Path of the raw training CSV.
        test_file: Path of the processed test CSV. It must contain ``Song``,
            ``Artist``, ``Genre``, ``categories``, ``category_words`` and
            ``cleaned_lyrics`` columns.
        model_output_path: Destination file for the fitted classifier.
        vectorizer_output_path: Destination file for the fitted vectorizer.
        processed_train_path: Where the processed training CSV is written.

    Returns:
        None. The function returns early, after printing a message, when the
        training data cannot be preprocessed or the test file is missing or
        lacks required columns.
    """
    print("Preprocessing training data...")
    train_df = preprocess_dataset(train_file, processed_train_path)
    if train_df is None:
        return

    print("Loading test data...")
    try:
        test_df = pd.read_csv(test_file)
    except FileNotFoundError:
        print(f"Error: The file {test_file} was not found.")
        return

    required_columns = ['Song', 'Artist', 'Genre', 'categories', 'category_words', 'cleaned_lyrics']
    missing_columns = [col for col in required_columns if col not in test_df.columns]
    if missing_columns:
        print(f"Error: Missing columns {missing_columns} in {test_file}")
        return

    # Create the output directories up front so a bad path fails before the
    # training run rather than after it.
    for target in (model_output_path, vectorizer_output_path):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    # Prepare labels: one indicator column per known category.
    mlb = MultiLabelBinarizer(classes=list(categories.keys()))
    train_df['categories'] = train_df['categories'].apply(_parse_category_labels)
    y_train = mlb.fit_transform(train_df['categories'])

    test_df['categories'] = test_df['categories'].apply(_parse_category_labels)
    y_test = mlb.transform(test_df['categories'])

    # Empty lyrics come back from CSV as NaN, which the vectorizer rejects.
    train_texts = train_df['cleaned_lyrics'].fillna('').astype(str)
    test_texts = test_df['cleaned_lyrics'].fillna('').astype(str)

    # Extract TF-IDF features
    vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2), min_df=15)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Train the classifier: one MLP per category. Logistic regression and a
    # linear SVC (both with balanced class weights) were earlier alternatives.
    print("Training the classifier...")
    base_classifier = MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        max_iter=200,
        random_state=42,
        verbose=True,
        early_stopping=True,
        n_iter_no_change=10,
        validation_fraction=0.1
    )
    classifier = OneVsRestClassifier(base_classifier)
    classifier.fit(X_train, y_train)

    # Evaluate on test set
    print("Evaluating on test set...")
    y_pred = classifier.predict(X_test)
    hamming = hamming_loss(y_test, y_pred)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    f1_macro = f1_score(y_test, y_pred, average='macro')

    print(f"Hamming Loss: {hamming:.4f}")
    print(f"F1 Score (Micro): {f1_micro:.4f}")
    print(f"F1 Score (Macro): {f1_macro:.4f}")

    print("\nPer-label classification report:")
    print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

    print_most_wrongly_predicted_labels(test_df, y_test, y_pred, mlb, top_n=1, examples_per_category=5)

    # Save the model and vectorizer
    joblib.dump(classifier, model_output_path)
    joblib.dump(vectorizer, vectorizer_output_path)
    print(f"Model saved to {model_output_path}")
    print(f"Vectorizer saved to {vectorizer_output_path}")

    # Example predictions for the first few test songs
    print("\nExample predictions on test set:")
    sample_size = min(5, len(test_df))
    for i in range(sample_size):
        song = test_df['Song'].iloc[i]
        true_labels = mlb.inverse_transform(y_test[i:i+1])[0]
        pred_labels = mlb.inverse_transform(y_pred[i:i+1])[0]
        print(f"Song: {song}")
        print(f"True Categories: {true_labels}")
        print(f"Predicted Categories: {pred_labels}")
        print()
    



if __name__ == "__main__":
    # File locations used by this training run.
    train_file = "../data/train_with_additional.csv"
    test_file = "../data/processed_test_dataset_more_categories.csv"
    model_output_path = "../models/multi_label_classifier.pkl"
    vectorizer_output_path = "../models/tfidf_vectorizer.pkl"

    # Stack the base training set and the supplementary songs into one frame
    # and write it to the path the trainer reads from.
    original_train = pd.read_csv("../data/train.csv")
    additional = pd.read_csv("../data/More_song_train.csv")
    merged = pd.concat((original_train, additional), ignore_index=True)
    merged.to_csv(train_file, index=False)

    train_multi_label_classifier(
        train_file=train_file,
        test_file=test_file,
        model_output_path=model_output_path,
        vectorizer_output_path=vectorizer_output_path,
    )

Preprocessing training data...
Processed dataset saved to ../data/processed_train_dataset.csv
Loading test data...
Training the classifier...
Evaluating on test set...
Hamming Loss: 0.0672
F1 Score (Micro): 0.6326
F1 Score (Macro): 0.5182

Per-label classification report:
              precision    recall  f1-score   support

     Animals       0.76      0.34      0.47       343
    Clothing       0.75      0.34      0.47       462
        Food       0.77      0.35      0.48       438
    Emotions       0.74      0.28      0.40       892
  Body Parts       0.84      0.70      0.77      2304

   micro avg       0.82      0.52      0.63      4439
   macro avg       0.77      0.40      0.52      4439
weighted avg       0.80      0.52      0.61      4439
 samples avg       0.24      0.22      0.22      4439


Analyzing most wrongly predicted labels...

Top 3 most frequently mispredicted categories:
Category: Body Parts, Mispredicted 985 times
Example songs with mispredictions:
  Song: mama