In [None]:
import math
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Source locations of the two character tables (Marvel first, then DC).
marvel_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/comic-characters/marvel-wikia-data.csv"
dc_url = "https://raw.githubusercontent.com/fivethirtyeight/data/master/comic-characters/dc-wikia-data.csv"

# Download each CSV into its own DataFrame.
marvel_data = pd.read_csv(marvel_url)
dc_data = pd.read_csv(dc_url)

In [None]:
# Month names (full and abbreviated) mapped to their 1-based month number.
# "Holiday" is not a calendar month; 11.5 places it between November and December.
_MONTH_NUMBERS = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
    'Holiday': 11.5,
}


def convert_first_appearance(row, century_pivot=30):
    """Convert one FIRST APPEARANCE value into a fractional year.

    Two textual layouts are understood:

    * ``'Sep-75'``      -- month abbreviation, dash, two-digit year (Marvel)
    * ``'1999, April'`` -- four-digit year, optional month name (DC);
      a bare ``'1999'`` is treated as January of that year.

    Parameters
    ----------
    row : str, number or missing value
        The raw cell value. Missing values are returned unchanged and values
        that are already numeric are returned as ``float`` so the function can
        safely be applied to a column more than once.
    century_pivot : int, default 30
        Two-digit years greater than this are read as 19xx, all others as 20xx.

    Returns
    -------
    float
        ``year + (month - 1) / 12`` (or the untouched missing value).

    Raises
    ------
    ValueError
        If the year is not an integer or the month name is not recognised.
        The offending text is printed before the error propagates.
    """
    if pd.isna(row):  # Handle NaN cases
        return row
    # Already converted (covers Python and NumPy numeric scalars alike).
    if isinstance(row, (int, float, np.integer, np.floating)):
        return float(row)

    row = str(row)
    try:
        if "-" in row:  # Marvel date format (e.g., 'Sep-75')
            parts = [part.strip() for part in row.split("-")]
            month = _MONTH_NUMBERS.get(parts[0])
            year = int(parts[1])
            if year < 100:
                # Expand a two-digit year arithmetically so '5' and '05' agree.
                year += 1900 if year > century_pivot else 2000
        else:  # DC date format (e.g., '1999, April')
            parts = [part.strip() for part in row.split(",")]
            year = int(parts[0])
            month = 1 if len(parts) == 1 else _MONTH_NUMBERS.get(parts[1])

        if month is None:
            raise ValueError(f"unrecognised month in {row!r}")

        return year + (month - 1) / 12  # Normalize month to a fraction of a year
    except Exception:
        # Show which value failed, then let the error propagate to the caller.
        print("Error parsing date: " + row)
        raise
    

In [None]:
# Replace the textual first-appearance values in both tables with the
# fractional years produced by convert_first_appearance.
for _frame in (marvel_data, dc_data):
    _frame["FIRST APPEARANCE"] = _frame["FIRST APPEARANCE"].apply(convert_first_appearance)

In [None]:
# The Marvel table spells the column "Year" while DC uses "YEAR"; use one
# spelling so the tables line up when concatenated. Reassigning (instead of
# inplace=True) keeps the step explicit and safe to re-run.
marvel_data = marvel_data.rename(columns={"Year": "YEAR"})

In [None]:
# Combine datasets.
# ignore_index=True gives the stacked table a fresh 0..n-1 index; without it
# the Marvel and DC row labels overlap and a label lookup can match two rows.
combined_data = pd.concat([marvel_data, dc_data], ignore_index=True)

In [None]:
# Keep the "name" column of each table (same row order as the table itself)
# before it is dropped from the feature data.
marvel_names, dc_names, combined_names = (
    frame["name"] for frame in (marvel_data, dc_data, combined_data)
)

In [None]:
# Columns removed from all three tables before modelling.
_dropped_columns = ["urlslug", "page_id", "name", "YEAR"]

marvel_data = marvel_data.drop(columns=_dropped_columns)
dc_data = dc_data.drop(columns=_dropped_columns)
combined_data = combined_data.drop(columns=_dropped_columns)

In [None]:
# One (column name, counts table) pair per Marvel column; missing values are
# counted too and the table is indexed by the column's unique values.
marvel_dfs = [
    (
        column,
        pd.DataFrame(
            marvel_data[column].value_counts(dropna=False),
            columns=["count"],
            index=marvel_data[column].unique(),
        ),
    )
    for column in marvel_data.columns
]

# Print every table under a heading naming its column.
for column, df in marvel_dfs:
    print(f"Column: {column}", df, sep="\n", end="\n\n")

In [None]:
# One (column name, counts table) pair per DC column; missing values are
# counted too and the table is indexed by the column's unique values.
dc_dfs = [
    (
        column,
        pd.DataFrame(
            dc_data[column].value_counts(dropna=False),
            columns=["count"],
            index=dc_data[column].unique(),
        ),
    )
    for column in dc_data.columns
]

# Print every table under a heading naming its column.
for column, df in dc_dfs:
    print(f"Column: {column}", df, sep="\n", end="\n\n")

In [None]:
# One (column name, counts table) pair per column of the combined table;
# missing values are counted too and the table is indexed by the column's
# unique values.
combined_dfs = [
    (
        column,
        pd.DataFrame(
            combined_data[column].value_counts(dropna=False),
            columns=["count"],
            index=combined_data[column].unique(),
        ),
    )
    for column in combined_data.columns
]

# Print every table under a heading naming its column.
for column, df in combined_dfs:
    print(f"Column: {column}", df, sep="\n", end="\n\n")


In [None]:
# Columns that are each predicted in turn (and one-hot encoded as inputs when
# another column is the target). The identifier columns "urlslug", "page_id"
# and "name" are deliberately not part of this list.
target_features = [
    "ALIGN",
    "SEX",
    "EYE",
    "HAIR",
    "GSM",
    "ALIVE",
    "ID",
]

In [None]:
# One list per dataset; each later receives one entry per target feature.
rf_models_marvel, rf_models_dc, rf_models_combined = [], [], []

In [None]:
def _prepare_dataset(data, target_feature, categorical_features):
    """Build an untrained classifier plus train/test data for one target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``target_feature``, the ``categorical_features`` and
        any remaining (numeric) feature columns.
    target_feature : str
        Column to predict. Rows where it is missing are discarded.
    categorical_features : list of str
        Columns to one-hot encode.

    Returns
    -------
    tuple
        ``(classifier, X_train, y_train, X_test, y_test, target_feature,
        class_names)`` where ``class_names[i]`` is the original label encoded
        as integer ``i`` in ``y_train`` / ``y_test``.
    """
    # Rows without a label cannot be used for training or evaluation.
    labelled = data.dropna(subset=[target_feature])

    # Encode the target labels as integers.
    encoder = LabelEncoder()
    y = encoder.fit_transform(labelled[target_feature])

    # One-hot encode the categorical inputs.
    onehot_features = pd.get_dummies(
        labelled.drop(target_feature, axis=1), columns=categorical_features
    )

    # Fill the remaining gaps with the column mean.
    # NOTE(review): the imputer is fitted before the train/test split, so test
    # rows contribute to the imputed means; fit on the training rows only if a
    # strictly leak-free evaluation is required.
    imputed = SimpleImputer(strategy="mean").fit_transform(onehot_features)

    # The frame is rebuilt without the original index, so its row labels are
    # positions (0..n-1) within `labelled`.
    X = pd.DataFrame(imputed, columns=onehot_features.columns)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # encoder.classes_ lists the original labels in encoded order.
    return (classifier, X_train, y_train, X_test, y_test, target_feature, encoder.classes_)


# Start from empty lists so that re-running this cell does not append a second
# copy of every entry.
for _model_list in (rf_models_marvel, rf_models_dc, rf_models_combined):
    _model_list.clear()

for target_feature in target_features:
    # Every other target column is used as a one-hot encoded input.
    categorical_features = [tf for tf in target_features if tf != target_feature]

    # Store the untrained model and its data split for each dataset.
    rf_models_marvel.append(_prepare_dataset(marvel_data, target_feature, categorical_features))
    rf_models_dc.append(_prepare_dataset(dc_data, target_feature, categorical_features))
    rf_models_combined.append(_prepare_dataset(combined_data, target_feature, categorical_features))

In [None]:
# Train every Marvel model and report its accuracy on the held-out split.
for rf_model, X_train, y_train, X_test, y_test, target_feature, _ in rf_models_marvel:
    try:
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        print(f"""Accuracy score for Marvel {target_feature} predictions: {accuracy_score(y_test, y_pred)}""")
    except Exception as error:
        # Best effort: move on to the next target, but show the real cause.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        print(f"""Not enough data for Marvel {target_feature} predictions ({type(error).__name__}: {error})""")


In [None]:
# Train every DC model and report its accuracy on the held-out split.
for rf_model, X_train, y_train, X_test, y_test, target_feature, _ in rf_models_dc:
    try:
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        print(f"""Accuracy score for DC {target_feature} predictions: {accuracy_score(y_test, y_pred)}""")
    except Exception as error:
        # Best effort: move on to the next target, but show the real cause.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        print(f"""Not enough data for DC {target_feature} predictions ({type(error).__name__}: {error})""")

In [None]:
# Train every combined-data model and report its accuracy on the held-out split.
for rf_model, X_train, y_train, X_test, y_test, target_feature, _ in rf_models_combined:
    try:
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        print(f"""Accuracy score for combined {target_feature} predictions: {accuracy_score(y_test, y_pred)}""")
    except Exception as error:
        # Best effort: move on to the next target, but show the real cause.
        # Catching Exception (not a bare except) lets KeyboardInterrupt through.
        print(f"""Not enough data for combined {target_feature} predictions ({type(error).__name__}: {error})""")

In [None]:
# SHAP is used in the following cells to explain the random-forest models.
import shap

# Load SHAP's JavaScript so its interactive plots can render in the notebook.
shap.initjs()

In [None]:
# Explain a small slice of every Marvel test set with SHAP.
shap_models_marvel = []

for rf_model, X_train, y_train, X_test, y_test, target_feature, class_names in rf_models_marvel:
    print(f"""Creating SHAP values for Marvel {target_feature} predictions with {len(X_test)} samples""")

    # 10% of a test set larger than 250 rows, otherwise the whole set --
    # and never more than 10 rows.
    if len(X_test) > 250:
        candidate_size = int(len(X_test) * 0.1)
    else:
        candidate_size = len(X_test)
    subset_size = min(candidate_size, 10)
    print(f"""Subset size: {subset_size}""")

    feature_subset = X_test[:subset_size]
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(feature_subset)
    shap_models_marvel.append(
        (explainer, shap_values, feature_subset, target_feature, class_names)
    )
    print(f"""SHAP values for Marvel {target_feature} predictions created""")

print("Marvel SHAP values created")

In [None]:
# Create SHAP values for DC: explain a small slice of every DC test set.

shap_models_dc = []

for rf_model, X_train, y_train, X_test, y_test, target_feature, class_names in rf_models_dc:
    print(f"""Creating SHAP values for DC {target_feature} predictions with {len(X_test)} samples""")

    # 10% of a test set larger than 250 rows, otherwise the whole set --
    # and never more than 10 rows.
    if len(X_test) > 250:
        candidate_size = int(len(X_test) * 0.1)
    else:
        candidate_size = len(X_test)
    subset_size = min(candidate_size, 10)
    print(f"""Subset size: {subset_size}""")

    feature_subset = X_test[:subset_size]
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(feature_subset)
    shap_models_dc.append(
        (explainer, shap_values, feature_subset, target_feature, class_names)
    )
    print(f"""SHAP values for DC {target_feature} predictions created""")

print("DC SHAP values created")

In [None]:
# Create SHAP values for combined: explain a small slice of every combined test set.

shap_models_combined = []

for rf_model, X_train, y_train, X_test, y_test, target_feature, class_names in rf_models_combined:
    print(f"""Creating SHAP values for combined {target_feature} predictions with {len(X_test)} samples""")

    # 10% of a test set larger than 250 rows, otherwise the whole set --
    # and never more than 10 rows.
    if len(X_test) > 250:
        candidate_size = int(len(X_test) * 0.1)
    else:
        candidate_size = len(X_test)
    subset_size = min(candidate_size, 10)
    print(f"""Subset size: {subset_size}""")

    feature_subset = X_test[:subset_size]
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer.shap_values(feature_subset)
    shap_models_combined.append(
        (explainer, shap_values, feature_subset, target_feature, class_names)
    )
    print(f"""SHAP values for combined {target_feature} predictions created""")

print("Combined SHAP values created")

In [None]:
import random
import matplotlib.pyplot as plt

In [None]:
# Create plots for Marvel

# savefig does not create missing folders, so make sure they exist first.
os.makedirs("figs/summary", exist_ok=True)
os.makedirs("figs/waterfall", exist_ok=True)

for explainer, shap_values, feature_subset, target_feature, class_names in shap_models_marvel:
    num_samples = len(feature_subset)
    if num_samples == 0:
        print(f"No samples to plot for Marvel {target_feature} predictions")
        continue
    sample_index = random.randint(0, num_samples - 1)  # randint includes both end points

    # Depending on the SHAP release, shap_values for a classifier is either a
    # list with one (samples, features) array per class or a single
    # (samples, features, classes) array. Normalise to the list layout so the
    # indexing below means the same thing in both cases.
    if isinstance(shap_values, list):
        summary_values = shap_values
        first_class_values = shap_values[0]
    else:
        shap_array = np.asarray(shap_values)
        if shap_array.ndim == 3:
            summary_values = [shap_array[:, :, k] for k in range(shap_array.shape[2])]
            first_class_values = summary_values[0]
        else:
            summary_values = first_class_values = shap_array
    # expected_value holds one base value per class (or a single scalar).
    base_value = np.atleast_1d(explainer.expected_value)[0]

    print(f"Creating Summary plot for Marvel {target_feature} predictions")
    shap.summary_plot(summary_values, feature_subset, feature_names=feature_subset.columns, show=False, class_names=class_names)
    plt.savefig(f"figs/summary/Marvel_{target_feature}_summary.png")
    plt.show()
    print(f"Created Summary plot for Marvel {target_feature} predictions")

    # The waterfall explains one randomly chosen row for the first class.
    print(f"Creating Waterfall plot for Marvel {target_feature} predictions")
    shap.plots.waterfall(shap.Explanation(values=first_class_values[sample_index], base_values=base_value, data=feature_subset.iloc[sample_index]), max_display=10, show=False)
    plt.savefig(f"figs/waterfall/Marvel_{target_feature}_{sample_index}_waterfall.png")
    plt.show()
    print(f"Created Waterfall plot for Marvel {target_feature} predictions")


print("Marvel plots created")
    

In [None]:
# Create plots for DC

# savefig does not create missing folders, so make sure they exist first.
os.makedirs("figs/summary", exist_ok=True)
os.makedirs("figs/waterfall", exist_ok=True)

for explainer, shap_values, feature_subset, target_feature, class_names in shap_models_dc:
    num_samples = len(feature_subset)
    if num_samples == 0:
        print(f"No samples to plot for DC {target_feature} predictions")
        continue
    # randint includes both end points, so the last valid index is num_samples - 1.
    sample_index = random.randint(0, num_samples - 1)

    # Depending on the SHAP release, shap_values for a classifier is either a
    # list with one (samples, features) array per class or a single
    # (samples, features, classes) array. Normalise to the list layout so the
    # indexing below means the same thing in both cases.
    if isinstance(shap_values, list):
        summary_values = shap_values
        first_class_values = shap_values[0]
    else:
        shap_array = np.asarray(shap_values)
        if shap_array.ndim == 3:
            summary_values = [shap_array[:, :, k] for k in range(shap_array.shape[2])]
            first_class_values = summary_values[0]
        else:
            summary_values = first_class_values = shap_array
    # expected_value holds one base value per class (or a single scalar).
    base_value = np.atleast_1d(explainer.expected_value)[0]

    print(f"""Creating Summary plot for DC {target_feature} predictions""")
    shap.summary_plot(summary_values, feature_subset, feature_names=feature_subset.columns, show=False, class_names=class_names)
    plt.savefig(f"""figs/summary/DC_{target_feature}_summary.png""")
    plt.show()
    print(f"""Created Summary plot for DC {target_feature} predictions""")

    # The waterfall explains one randomly chosen row for the first class.
    print(f"Creating Waterfall plot for DC {target_feature} predictions")
    shap.plots.waterfall(shap.Explanation(values=first_class_values[sample_index], base_values=base_value, data=feature_subset.iloc[sample_index]), max_display=10, show=False)
    plt.savefig(f"figs/waterfall/DC_{target_feature}_{sample_index}_waterfall.png")
    plt.show()
    print(f"Created Waterfall plot for DC {target_feature} predictions")


print("DC plots created")

In [None]:
# Create plots for Combined

# savefig does not create missing folders, so make sure they exist first.
os.makedirs("figs/summary", exist_ok=True)
os.makedirs("figs/waterfall", exist_ok=True)

for explainer, shap_values, feature_subset, target_feature, class_names in shap_models_combined:
    num_samples = len(feature_subset)
    if num_samples == 0:
        print(f"No samples to plot for Combined {target_feature} predictions")
        continue
    # randint includes both end points, so the last valid index is num_samples - 1.
    sample_index = random.randint(0, num_samples - 1)

    # Depending on the SHAP release, shap_values for a classifier is either a
    # list with one (samples, features) array per class or a single
    # (samples, features, classes) array. Normalise to the list layout so the
    # indexing below means the same thing in both cases.
    if isinstance(shap_values, list):
        summary_values = shap_values
        first_class_values = shap_values[0]
    else:
        shap_array = np.asarray(shap_values)
        if shap_array.ndim == 3:
            summary_values = [shap_array[:, :, k] for k in range(shap_array.shape[2])]
            first_class_values = summary_values[0]
        else:
            summary_values = first_class_values = shap_array
    # expected_value holds one base value per class (or a single scalar).
    base_value = np.atleast_1d(explainer.expected_value)[0]

    print(f"""Creating Summary plot for Combined {target_feature} predictions""")
    shap.summary_plot(summary_values, feature_subset, feature_names=feature_subset.columns, show=False, class_names=class_names)
    plt.savefig(f"""figs/summary/Combined_{target_feature}_summary.png""")
    plt.show()
    print(f"""Created Summary plot for Combined {target_feature} predictions""")

    # The waterfall explains one randomly chosen row for the first class.
    print(f"Creating Waterfall plot for Combined {target_feature} predictions")
    shap.plots.waterfall(shap.Explanation(values=first_class_values[sample_index], base_values=base_value, data=feature_subset.iloc[sample_index]), max_display=10, show=False)
    plt.savefig(f"figs/waterfall/Combined_{target_feature}_{sample_index}_waterfall.png")
    plt.show()
    print(f"Created Waterfall plot for Combined {target_feature} predictions")

print("Combined plots created")

In [None]:
import math

# savefig does not create missing folders, so make sure the folder exists first.
os.makedirs("figs/prob_charts", exist_ok=True)

# Loop through each Marvel model and chart its predicted class probabilities
# for a few random test characters.
for rf_model, X_train, y_train, X_test, y_test, target_feature, class_names in rf_models_marvel:
    # Select a random subset of instances from the test data
    num_instances = min((int(len(X_test) * 0.1) if len(X_test) > 250 else len(X_test)), 12)
    if num_instances == 0:
        continue  # nothing to chart
    selected_indices = np.random.choice(len(X_test), num_instances, replace=False)
    X_test_subset = X_test.iloc[selected_indices]
    y_test_subset = y_test[selected_indices]

    # Get the predicted probabilities for the selected instances in the test data
    predicted_probabilities = rf_model.predict_proba(X_test_subset)

    # predict_proba has one column per class the model saw during training
    # (rf_model.classes_, as encoded labels); class_names is indexed by encoded
    # label, so pick the names by label rather than taking the first k.
    all_class_names = np.asarray(class_names)
    subset_class_names = all_class_names[rf_model.classes_]

    # X_test row labels are positions within the rows of marvel_data that have
    # a non-missing target (the feature frame was rebuilt with a default index
    # after those rows were selected). Translate them to positions in the full
    # table so the matching name can be read from marvel_names.
    labelled_positions = np.flatnonzero(marvel_data[target_feature].notna().to_numpy())

    # Calculate the number of rows and columns for subplots
    num_classes = predicted_probabilities.shape[1]
    num_columns = min(num_classes, 5)  # Limit the number of columns to avoid too many subplots
    num_rows = math.ceil(num_instances / num_columns)

    # Create a larger plot with subplots; squeeze=False always yields a 2-D array
    fig, axes = plt.subplots(num_rows, num_columns, figsize=(15, 15), squeeze=False)
    fig.suptitle(f"Predicted Probabilities for {target_feature}", fontsize=16)
    flat_axes = axes.flatten()

    # Loop through each instance and its corresponding predicted probabilities
    for i, (probs, ax) in enumerate(zip(predicted_probabilities, flat_axes)):
        source_position = labelled_positions[X_test.index[selected_indices[i]]]
        character_name = str(marvel_names.iloc[source_position])  # Convert to string
        actual_class_name = all_class_names[y_test_subset[i]]  # Name of the true class

        # Plot the predicted probabilities as a bar chart
        ax.bar(subset_class_names, probs, label="Predicted", alpha=0.7)

        # Mark the true class at height 1, using the same category axis as the bars
        ax.scatter([actual_class_name], [1], color="red", label="Actual")

        ax.set_title(character_name)  # Set character name as the title
        ax.set_xlabel("Class")
        ax.set_ylabel("Probability")
        ax.tick_params(axis="x", labelrotation=45)
        ax.legend()  # Show legend

    # Hide the grid cells that did not receive a chart
    for unused_ax in flat_axes[num_instances:]:
        unused_ax.set_visible(False)

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    # Join the indices explicitly; the array's own repr contains brackets and spaces
    index_tag = "-".join(str(index) for index in selected_indices)
    plt.savefig(f"""figs/prob_charts/Marvel_{target_feature}_{index_tag}_probabilities.png""")
    plt.show()


In [None]:
# savefig does not create missing folders, so make sure the folder exists first.
os.makedirs("figs/prob_charts", exist_ok=True)

# Loop through each DC model and chart its predicted class probabilities
# for a few random test characters.
for rf_model, X_train, y_train, X_test, y_test, target_feature, class_names in rf_models_dc:
    # Select a random subset of instances from the test data
    num_instances = min((int(len(X_test) * 0.1) if len(X_test) > 250 else len(X_test)), 12)
    if num_instances == 0:
        continue  # nothing to chart
    selected_indices = np.random.choice(len(X_test), num_instances, replace=False)
    X_test_subset = X_test.iloc[selected_indices]
    y_test_subset = y_test[selected_indices]

    # Get the predicted probabilities for the selected instances in the test data
    predicted_probabilities = rf_model.predict_proba(X_test_subset)

    # predict_proba has one column per class the model saw during training
    # (rf_model.classes_, as encoded labels); class_names is indexed by encoded
    # label, so pick the names by label rather than taking the first k.
    all_class_names = np.asarray(class_names)
    subset_class_names = all_class_names[rf_model.classes_]

    # X_test row labels are positions within the rows of dc_data that have a
    # non-missing target (the feature frame was rebuilt with a default index
    # after those rows were selected). Translate them to positions in the full
    # table so the matching name can be read from dc_names.
    labelled_positions = np.flatnonzero(dc_data[target_feature].notna().to_numpy())

    # Calculate the number of rows and columns for subplots
    num_classes = predicted_probabilities.shape[1]
    num_columns = min(num_classes, 5)  # Limit the number of columns to avoid too many subplots
    num_rows = math.ceil(num_instances / num_columns)

    # Create a larger plot with subplots; squeeze=False always yields a 2-D array
    fig, axes = plt.subplots(num_rows, num_columns, figsize=(15, 15), squeeze=False)
    fig.suptitle(f"Predicted Probabilities for {target_feature}", fontsize=16)
    flat_axes = axes.flatten()

    # Loop through each instance and its corresponding predicted probabilities
    for i, (probs, ax) in enumerate(zip(predicted_probabilities, flat_axes)):
        source_position = labelled_positions[X_test.index[selected_indices[i]]]
        character_name = str(dc_names.iloc[source_position])  # Convert to string
        actual_class_name = all_class_names[y_test_subset[i]]  # Name of the true class

        # Plot the predicted probabilities as a bar chart
        ax.bar(subset_class_names, probs, label="Predicted", alpha=0.7)

        # Mark the true class at height 1, using the same category axis as the bars
        ax.scatter([actual_class_name], [1], color="red", label="Actual")

        ax.set_title(character_name)  # Set character name as the title
        ax.set_xlabel("Class")
        ax.set_ylabel("Probability")
        ax.tick_params(axis="x", labelrotation=45)
        ax.legend()  # Show legend

    # Hide the grid cells that did not receive a chart
    for unused_ax in flat_axes[num_instances:]:
        unused_ax.set_visible(False)

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    # Join the indices explicitly; the array's own repr contains brackets and spaces
    index_tag = "-".join(str(index) for index in selected_indices)
    plt.savefig(f"""figs/prob_charts/DC_{target_feature}_{index_tag}_probabilities.png""")
    plt.show()


In [None]:
# savefig does not create missing folders, so make sure the folder exists first.
os.makedirs("figs/prob_charts", exist_ok=True)

# Loop through each combined-data model and chart its predicted class
# probabilities for a few random test characters.
for rf_model, X_train, y_train, X_test, y_test, target_feature, class_names in rf_models_combined:
    # Select a random subset of instances from the test data
    num_instances = min((int(len(X_test) * 0.1) if len(X_test) > 250 else len(X_test)), 12)
    if num_instances == 0:
        continue  # nothing to chart
    selected_indices = np.random.choice(len(X_test), num_instances, replace=False)
    X_test_subset = X_test.iloc[selected_indices]
    y_test_subset = y_test[selected_indices]

    # Get the predicted probabilities for the selected instances in the test data
    predicted_probabilities = rf_model.predict_proba(X_test_subset)

    # predict_proba has one column per class the model saw during training
    # (rf_model.classes_, as encoded labels); class_names is indexed by encoded
    # label, so pick the names by label rather than taking the first k.
    all_class_names = np.asarray(class_names)
    subset_class_names = all_class_names[rf_model.classes_]

    # X_test row labels are positions within the rows of combined_data that
    # have a non-missing target (the feature frame was rebuilt with a default
    # index after those rows were selected). Translate them to positions in the
    # full table and read the name positionally, which also works when the
    # combined table's own index contains repeated labels.
    labelled_positions = np.flatnonzero(combined_data[target_feature].notna().to_numpy())

    # Calculate the number of rows and columns for subplots
    num_classes = predicted_probabilities.shape[1]
    num_columns = min(num_classes, 5)  # Limit the number of columns to avoid too many subplots
    num_rows = math.ceil(num_instances / num_columns)

    # Create a larger plot with subplots; squeeze=False always yields a 2-D array
    fig, axes = plt.subplots(num_rows, num_columns, figsize=(15, 15), squeeze=False)
    fig.suptitle(f"Predicted Probabilities for {target_feature}", fontsize=16)
    flat_axes = axes.flatten()

    # Loop through each instance and its corresponding predicted probabilities
    for i, (probs, ax) in enumerate(zip(predicted_probabilities, flat_axes)):
        source_position = labelled_positions[X_test.index[selected_indices[i]]]
        character_name = str(combined_names.iloc[source_position])  # Convert to string
        actual_class_name = all_class_names[y_test_subset[i]]  # Name of the true class

        # Plot the predicted probabilities as a bar chart
        ax.bar(subset_class_names, probs, label="Predicted", alpha=0.7)

        # Mark the true class at height 1, using the same category axis as the bars
        ax.scatter([actual_class_name], [1], color="red", label="Actual")

        ax.set_title(character_name)  # Set character name as the title
        ax.set_xlabel("Class")
        ax.set_ylabel("Probability")
        ax.tick_params(axis="x", labelrotation=45)
        ax.legend()  # Show legend

    # Hide the grid cells that did not receive a chart
    for unused_ax in flat_axes[num_instances:]:
        unused_ax.set_visible(False)

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    # Join the indices explicitly; the array's own repr contains brackets and spaces
    index_tag = "-".join(str(index) for index in selected_indices)
    plt.savefig(f"""figs/prob_charts/Combined_{target_feature}_{index_tag}_probabilities.png""")
    plt.show()
