In [1]:
import pandas as pd

data = pd.read_csv("..\..\data\processed_data.csv")

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset



# (Optional) Build a simple knowledge graph for reference.
# Here, the kg dictionary maps each entity (user or movie) to a list of connected entities.
def build_knowledge_graph(data):
    kg = {}
    for _, row in data.iterrows():
        user = row["user_id"]
        movie = row["movie_id"]
        # Add movie to user's neighbors
        kg.setdefault(user, []).append(movie)
        # Add user to movie's neighbors
        kg.setdefault(movie, []).append(user)
    return kg

kg = build_knowledge_graph(data)

# Step 3: Prepare train and test data
def prepare_train_test_data(data):
    train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)
    train_users = train_data["user_id"].values
    train_movies = train_data["movie_id"].values
    train_gender = train_data["gender"].values
    train_age = train_data["age"].values
    train_occupation = train_data["occupation"].values
    train_zip = train_data["zip_code"].values
    train_labels = train_data["rating"].values

    test_users = test_data["user_id"].values
    test_movies = test_data["movie_id"].values
    test_gender = test_data["gender"].values
    test_age = test_data["age"].values
    test_occupation = test_data["occupation"].values
    test_zip = test_data["zip_code"].values
    test_labels = test_data["rating"].values

    return (train_users, train_movies, train_gender, train_age, train_occupation, train_zip, train_labels,
            test_users, test_movies, test_gender, test_age, test_occupation, test_zip, test_labels)

(train_users, train_movies, train_gender, train_age, train_occupation, train_zip, train_labels,
 test_users, test_movies, test_gender, test_age, test_occupation, test_zip, test_labels) = prepare_train_test_data(data)

# -------------------------------
# New: Define the KGCN module
# -------------------------------
class KGCN(torch.nn.Module):
    def __init__(self, num_entities, embedding_dim, kg, num_neighbors=10):
        super(KGCN, self).__init__()
        self.entity_embedding = torch.nn.Embedding(num_entities, embedding_dim)
        self.kg = kg  # Pre-built knowledge graph (a dict mapping entity -> list of neighbor entities)
        self.num_neighbors = num_neighbors
        self.linear = torch.nn.Linear(embedding_dim, embedding_dim)
        # Initialize weights
        torch.nn.init.xavier_uniform_(self.entity_embedding.weight)
        torch.nn.init.xavier_uniform_(self.linear.weight)
        
    def aggregate_neighbors(self, entity_ids):
        # For each entity, retrieve its neighbor embeddings and aggregate via averaging.
        agg_list = []
        for entity in entity_ids:
            entity_int = entity.item()
            neighbors = self.kg.get(entity_int, [])
            if len(neighbors) == 0:
                # If no neighbors, use a zero vector
                agg_list.append(torch.zeros(self.entity_embedding.embedding_dim, device=entity_ids.device))
            else:
                # Sample neighbors if there are too many
                if len(neighbors) > self.num_neighbors:
                    sampled = np.random.choice(neighbors, self.num_neighbors, replace=False)
                else:
                    sampled = neighbors
                sampled_tensor = torch.tensor(sampled, dtype=torch.long, device=entity_ids.device)
                neighbor_embeds = self.entity_embedding(sampled_tensor)
                agg_list.append(neighbor_embeds.mean(dim=0))
        return torch.stack(agg_list)
    
    def forward(self, entity_ids):
        # Get the base entity embeddings
        entity_embeds = self.entity_embedding(entity_ids)
        # Aggregate neighbor information
        aggregated = self.aggregate_neighbors(entity_ids)
        # Combine the entity embedding with a linear transformation of the aggregated neighbors
        combined = entity_embeds + self.linear(aggregated)
        return combined

# -------------------------------
# Step 4: Define the Full Model with KGCN
# -------------------------------
class FullModel(torch.nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim, kg):
        super(FullModel, self).__init__()
        self.num_users = num_users
        # Collaborative Filtering (CF) embeddings
        self.user_embedding = torch.nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = torch.nn.Embedding(num_movies, embedding_dim)
        # KGCN module for knowledge graph based representations
        # Note: entities include both users and movies; movies are offset by num_users.
        self.kgcn = KGCN(num_users + num_movies, embedding_dim, kg, num_neighbors=10)
        # Fully connected layers to merge CF embeddings, KGCN embeddings, and additional features
        self.fc1 = torch.nn.Linear(embedding_dim * 4 + 4, 128)  # 4 extra features: gender, age, occupation, zip
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, 1)
        
    def forward(self, user_ids, movie_ids, gender, age, occupation, zip_code):
        # Get CF embeddings from dedicated embedding layers
        user_cf = self.user_embedding(user_ids)
        movie_cf = self.movie_embedding(movie_ids)
        
        # Get KGCN embeddings.
        # For users, use their indices directly.
        user_kgcn = self.kgcn(user_ids)
        # For movies, offset indices by num_users (because our embedding table covers all entities)
        movie_kgcn = self.kgcn(movie_ids + self.num_users)
        
        # Concatenate CF embeddings, KGCN embeddings, and additional features
        concat = torch.cat([
            user_cf, movie_cf, user_kgcn, movie_kgcn,
            gender.unsqueeze(1), age.unsqueeze(1),
            occupation.unsqueeze(1), zip_code.unsqueeze(1)
        ], dim=1)
        
        # Pass through fully connected layers for final rating prediction
        x = F.relu(self.fc1(concat))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

num_users = data["user_id"].nunique()
num_movies = data["movie_id"].nunique()
embedding_dim = 16

# Initialize the model, optimizer, and loss criterion
model = FullModel(num_users, num_movies, embedding_dim, kg)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# Step 5: Convert data to PyTorch tensors
train_users = torch.tensor(train_users, dtype=torch.long)
train_movies = torch.tensor(train_movies, dtype=torch.long)
train_gender = torch.tensor(train_gender, dtype=torch.float)
train_age = torch.tensor(train_age, dtype=torch.float)
train_occupation = torch.tensor(train_occupation, dtype=torch.float)
train_zip = torch.tensor(train_zip, dtype=torch.float)
train_labels = torch.tensor(train_labels, dtype=torch.float)

test_users = torch.tensor(test_users, dtype=torch.long)
test_movies = torch.tensor(test_movies, dtype=torch.long)
test_gender = torch.tensor(test_gender, dtype=torch.float)
test_age = torch.tensor(test_age, dtype=torch.float)
test_occupation = torch.tensor(test_occupation, dtype=torch.float)
test_zip = torch.tensor(test_zip, dtype=torch.float)
test_labels = torch.tensor(test_labels, dtype=torch.float)

# Create DataLoader for training
train_dataset = TensorDataset(train_users, train_movies, train_gender, train_age, train_occupation, train_zip, train_labels)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# Step 6: Train the Model
for epoch in range(10):
    model.train()
    for batch in train_loader:
        user_ids, movie_ids, gender, age, occupation, zip_code, labels = batch
        optimizer.zero_grad()
        outputs = model(user_ids, movie_ids, gender, age, occupation, zip_code)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Step 7: Evaluate the Model
model.eval()
with torch.no_grad():
    test_outputs = model(test_users, test_movies, test_gender, test_age, test_occupation, test_zip)
    test_loss = criterion(test_outputs.squeeze(), test_labels)
    print(f"Test Loss (MSE): {test_loss.item():.4f}")

# Step 8: Get predictions for both correct and flipped genders (for analysis)
with torch.no_grad():
    predictions_correct = model(test_users, test_movies, test_gender, test_age, test_occupation, test_zip)
    flipped_gender = 1 - test_gender  # Flip gender (0 -> 1, 1 -> 0)
    predictions_flipped = model(test_users, test_movies, flipped_gender, test_age, test_occupation, test_zip)

# Combine predictions with test data into a DataFrame.
test_results = pd.DataFrame({
    "user_id": test_users.numpy(),
    "movie_id": test_movies.numpy(),
    "actual_rating": test_labels.numpy(),
    "predicted_rating_correct_gender": predictions_correct.squeeze().numpy(),
    "predicted_rating_flipped_gender": predictions_flipped.squeeze().numpy()
})

# NEW: Function to get top-N recommendations for a particular user from the test dataset
def get_top_n_recommendations(user_id, n, results_df):
    """
    Given a user ID and the results DataFrame, return the top-N movie recommendations
    based on the predicted rating (using the correct gender predictions).
    """
    # Filter for the specified user
    user_results = results_df[results_df["user_id"] == user_id]
    
    # If no candidates are found, notify and return an empty DataFrame
    if user_results.empty:
        print(f"No candidate movies found for user {user_id}.")
        return pd.DataFrame()
    
    # Sort the candidate movies in descending order by predicted rating
    top_n = user_results.sort_values(by="predicted_rating_correct_gender", ascending=False).head(n)
    return top_n

# Example: Get top 5 movie recommendations for a particular user (e.g. user with encoded id 0)
top_movies = get_top_n_recommendations(user_id=0, n=5, results_df=test_results)
print("Top 5 recommended movies for user 0:")
print(top_movies)


Epoch 1, Loss: 0.897037148475647
Epoch 2, Loss: 0.9936805963516235
Epoch 3, Loss: 1.0258262157440186
Epoch 4, Loss: 0.7688721418380737
Epoch 5, Loss: 0.6220142245292664
Epoch 6, Loss: 0.8994971513748169
Epoch 7, Loss: 0.908700704574585
Epoch 8, Loss: 0.9084241390228271
Epoch 9, Loss: 0.8785103559494019
Epoch 10, Loss: 0.773776113986969
Test Loss (MSE): 0.8965
Top 5 recommended movies for user 0:
       user_id  movie_id  actual_rating  predicted_rating_correct_gender  \
12665        0       271            3.0                         4.653302   
3233         0       169            5.0                         4.631095   
18409        0        97            4.0                         4.556288   
1840         0        22            4.0                         4.436569   
17650        0       167            5.0                         4.382499   

       predicted_rating_flipped_gender  
12665                         4.686843  
3233                          4.682657  
18409                

In [3]:
import numpy as np
import pandas as pd
import os

# Helper function to compute Discounted Cumulative Gain (DCG)
def dcg_at_k(relevances, k=None):
    """Compute DCG for a list of relevance scores.
    If k is None, use all elements."""
    if k is None:
        k = len(relevances)
    relevances = np.array(relevances)[:k]
    # Compute gains: you can use 2^r - 1 (exponential gain) or simply r.
    gains = 2 ** relevances - 1
    # Compute discount factors: log2(rank+1) where rank starts at 1
    discounts = np.log2(np.arange(2, 2 + len(relevances)))
    return np.sum(gains / discounts)

# Function to compute NDCG for one user given a particular prediction column
def ndcg_for_user(user_df, predicted_col, k=None):
    """
    Given a user's DataFrame (with actual ratings and predictions),
    compute NDCG based on sorting by the predicted column.
    """
    # Sort the user data by predicted rating (descending order)
    user_df_sorted = user_df.sort_values(by=predicted_col, ascending=False)
    actual_relevances = user_df_sorted["actual_rating"].values
    dcg = dcg_at_k(actual_relevances, k)
    
    # Compute the ideal DCG by sorting actual ratings in descending order
    ideal_relevances = np.sort(user_df["actual_rating"].values)[::-1]
    idcg = dcg_at_k(ideal_relevances, k)
    
    # If the ideal DCG is zero, return zero to avoid division by zero.
    return dcg / idcg if idcg > 0 else 0.0

# Calculate NDCG for each user in the test set
user_ndcg = {}
for user in test_results_popular["user_id"].unique():
    user_df = test_results_popular[test_results_popular["user_id"] == user]
    ndcg_correct = ndcg_for_user(user_df, "predicted_rating_correct_gender")
    ndcg_flipped = ndcg_for_user(user_df, "predicted_rating_flipped_gender")
    user_ndcg[user] = {"ndcg_correct": ndcg_correct, "ndcg_flipped": ndcg_flipped}

# Convert the dictionary to a DataFrame
ndcg_df = pd.DataFrame.from_dict(user_ndcg, orient="index").reset_index()
ndcg_df = ndcg_df.rename(columns={"index": "user_id"})

# Ensure the "results_top_n" directory exists
# Ensure the "results_top_n" directory exists
output_dir = "results_top_n"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "per_user_ndcg_scores_KGCN_populer.csv")

# Save the DataFrame to CSV in the results_top_n directory
ndcg_df.to_csv(output_file, index=False)
print(f"Per-user NDCG scores saved to {output_file}")

# Optionally, print NDCG for each user to the console
print("Per-user NDCG scores:")
for user, scores in user_ndcg.items():
    print(f"User {user}: NDCG (ground truth gender) = {scores['ndcg_correct']:.4f}, NDCG (flipped gender) = {scores['ndcg_flipped']:.4f}")


NameError: name 'test_results_popular' is not defined

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

# Function to compute Discounted Cumulative Gain (DCG)
def compute_dcg(relevance_scores):
    """
    Computes DCG given a list of relevance scores.
    DCG = sum(rel_i / log2(i + 1)), where rel_i is the rating at position i (1-based index)
    """
    return np.sum((2**relevance_scores - 1) / np.log2(np.arange(1, len(relevance_scores) + 1) + 1))

# Function to compute Normalized Discounted Cumulative Gain (NDCG)
def compute_ndcg(actual_ratings, predicted_ratings, top_n):
    """
    Computes NDCG for a given set of actual and predicted ratings.
    """
    # Get top-N predicted movie indices (sorted by predicted rating)
    top_n_pred_indices = np.argsort(predicted_ratings)[::-1][:top_n]
    
    # Get top-N actual movie ratings sorted in the ideal order
    ideal_ratings = np.sort(actual_ratings)[::-1][:top_n]

    # Retrieve the corresponding actual ratings for the predicted top-N movies
    predicted_ratings_sorted = actual_ratings[top_n_pred_indices]

    # Compute DCG and IDCG
    dcg = compute_dcg(predicted_ratings_sorted)
    idcg = compute_dcg(ideal_ratings)

    # Avoid division by zero
    return dcg / idcg if idcg > 0 else 0

# Function to evaluate NDCG over all users
def evaluate_ndcg(test_results, top_n=10):
    """
    Computes the average NDCG over all users in the test set.
    """
    ndcg_scores = []

    for user_id in test_results["user_id"].unique():
        user_results = test_results[test_results["user_id"] == user_id]

        # If user has fewer than top_n movies, skip
        if len(user_results) < top_n:
            continue

        actual_ratings = user_results["actual_rating"].values
        predicted_ratings = user_results["predicted_rating_correct_gender"].values

        ndcg_score = compute_ndcg(actual_ratings, predicted_ratings, top_n)
        ndcg_scores.append(ndcg_score)

    return np.mean(ndcg_scores) if ndcg_scores else 0

# Compute and print the NDCG score
ndcg_value = evaluate_ndcg(test_results, top_n=10)
print(f"Average NDCG@10: {ndcg_value:.4f}")


In [None]:
import pandas as pd
import numpy as np

# -------------------------------
# Step 9: Create Popular and Unpopular Movie Test Datasets
# -------------------------------

# Compute movie popularity based on the number of ratings
movie_popularity = data.groupby("movie_id")["rating"].count().reset_index()
movie_popularity = movie_popularity.rename(columns={"rating": "num_ratings"})

# Define threshold for popularity (median as the split point)
popularity_threshold = movie_popularity["num_ratings"].median()

# Identify popular and unpopular movies
popular_movies = movie_popularity[movie_popularity["num_ratings"] >= popularity_threshold]["movie_id"]
unpopular_movies = movie_popularity[movie_popularity["num_ratings"] < popularity_threshold]["movie_id"]

# Create new test datasets
test_results_popular = test_results[test_results["movie_id"].isin(popular_movies)]
test_results_unpopular = test_results[test_results["movie_id"].isin(unpopular_movies)]



In [None]:
import numpy as np
import pandas as pd
import os

def reciprocal_rank_at_k(relevances, k=None):
    """
    Compute the reciprocal rank for a list of actual relevance scores.
    The first occurrence of the maximum relevance is considered the correct item.
    """
    if k is None:
        k = len(relevances)
    relevances = np.array(relevances)[:k]
    max_rel = np.max(relevances)
    for idx, rel in enumerate(relevances, start=1):
        if rel == max_rel:
            return 1.0 / idx
    return 0.0

def mrr_for_user(user_df, predicted_col, k=None):
    """
    Given a user's DataFrame (with actual ratings and predictions),
    compute MRR based on sorting by the predicted column.
    """
    sorted_df = user_df.sort_values(by=predicted_col, ascending=False)
    actual_relevances = sorted_df["actual_rating"].values
    return reciprocal_rank_at_k(actual_relevances, 5)

# Calculate MRR for each user using the specified prediction columns
user_mrr = {}
for user in test_results["user_id"].unique():
    user_df = test_results[test_results["user_id"] == user]
    mrr_correct = mrr_for_user(user_df, "predicted_rating_correct_gender")
    mrr_flipped = mrr_for_user(user_df, "predicted_rating_flipped_gender")
    user_mrr[user] = {"mrr_correct": mrr_correct, "mrr_flipped": mrr_flipped}

# Convert the results into a DataFrame
mrr_df = pd.DataFrame.from_dict(user_mrr, orient="index").reset_index()
mrr_df = mrr_df.rename(columns={"index": "user_id"})

# Ensure the output directory exists and save the results
output_dir = "results_top_n_mrr/five"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "per_user_mrr_scores_KGCN_5.csv")
mrr_df.to_csv(output_file, index=False)
print(f"Per-user MRR scores saved to {output_file}")

# Optionally, print MRR for each user to the console
for user, scores in user_mrr.items():
    print(f"User {user}: MRR (ground truth) = {scores['mrr_correct']:.4f}, MRR (flipped) = {scores['mrr_flipped']:.4f}")


In [6]:
import numpy as np
import pandas as pd
import os

# Helper function to compute Discounted Cumulative Gain (DCG)
def dcg_at_k(relevances, k=None):
    """Compute DCG for a list of relevance scores.
    If k is None, use all elements."""
    if k is None:
        k = len(relevances)
    relevances = np.array(relevances)[:k]

    # Compute gains: you can use 2^r - 1 (exponential gain) or simply r.
    gains = 2 ** relevances - 1

    # Compute discount factors: log2(rank+1), where rank starts at 1
    discounts = np.log2(np.arange(2, 2 + len(relevances)))
    return np.sum(gains / discounts)

# Function to compute NDCG for one user given a particular prediction column
def ndcg_for_user(user_df, predicted_col, k=None):
    """
    Given a user's DataFrame (with actual ratings and predictions),
    compute NDCG based on sorting by the predicted column.
    """
    # Sort by predicted rating in descending order
    user_df_sorted = user_df.sort_values(by=predicted_col, ascending=False)
    actual_relevances = user_df_sorted["actual_rating"].values
    dcg = dcg_at_k(actual_relevances, k)

    # Compute the ideal DCG by sorting actual ratings in descending order
    ideal_relevances = np.sort(user_df["actual_rating"].values)[::-1]
    idcg = dcg_at_k(ideal_relevances, k)

    # Avoid division by zero if IDCG is 0
    return dcg / idcg if idcg > 0 else 0.0


 
genre_columns = [
     "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
     "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
     "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

genre_test_results = {}
for genre in genre_columns:
     genre_test_results[genre] = test_results_with_genre[test_results_with_genre[genre] == 1]


# Directory for saving the results
output_dir = "results_top_n"
os.makedirs(output_dir, exist_ok=True)

# Loop through each genre and compute per-user NDCG
for genre, genre_df in genre_test_results.items():
    user_ndcg = {}

    # Iterate over each user in this particular genre
    for user_id in genre_df["user_id"].unique():
        user_df = genre_df[genre_df["user_id"] == user_id]

        # Calculate NDCG for correct vs. flipped gender predictions
        ndcg_correct = ndcg_for_user(user_df, "predicted_rating_correct_gender")
        ndcg_flipped = ndcg_for_user(user_df, "predicted_rating_flipped_gender")

        user_ndcg[user_id] = {
            "ndcg_correct": ndcg_correct,
            "ndcg_flipped": ndcg_flipped
        }

    # Convert user_ndcg to a DataFrame
    ndcg_df = pd.DataFrame.from_dict(user_ndcg, orient="index").reset_index()
    ndcg_df.rename(columns={"index": "user_id"}, inplace=True)

    # Construct an output filename that includes the genre name
    output_dir = "results_top_n_genre/KGCN"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(
        output_dir, f"per_user_ndcg_scores_KGCN_{genre.lower()}.csv"
    )
    ndcg_df.to_csv(output_file, index=False)
    print(f"Per-user NDCG scores saved for '{genre}' to: {output_file}")

    # (Optional) Print some results to the console
    print(f"--- NDCG for {genre} ---")
    for u_id, scores in user_ndcg.items():
        print(f"User {u_id}: "
              f"NDCG (ground truth gender) = {scores['ndcg_correct']:.4f}, "
              f"NDCG (flipped gender) = {scores['ndcg_flipped']:.4f}")
    print("------\n")


Per-user NDCG scores saved for 'Action' to: results_top_n_genre/KGCN\per_user_ndcg_scores_KGCN_action.csv
--- NDCG for Action ---
User 258: NDCG (ground truth gender) = 1.0000, NDCG (flipped gender) = 1.0000
User 652: NDCG (ground truth gender) = 0.9082, NDCG (flipped gender) = 0.9080
User 599: NDCG (ground truth gender) = 0.9559, NDCG (flipped gender) = 0.9559
User 372: NDCG (ground truth gender) = 0.8203, NDCG (flipped gender) = 0.8203
User 486: NDCG (ground truth gender) = 0.9419, NDCG (flipped gender) = 0.9566
User 386: NDCG (ground truth gender) = 0.8459, NDCG (flipped gender) = 0.8497
User 935: NDCG (ground truth gender) = 0.8406, NDCG (flipped gender) = 0.8221
User 216: NDCG (ground truth gender) = 0.8730, NDCG (flipped gender) = 0.8720
User 269: NDCG (ground truth gender) = 0.8512, NDCG (flipped gender) = 0.8500
User 301: NDCG (ground truth gender) = 0.7592, NDCG (flipped gender) = 1.0000
User 895: NDCG (ground truth gender) = 0.9747, NDCG (flipped gender) = 0.9750
User 449: ND

In [4]:
# Define the genre columns as they appear in your original data
genre_columns = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Create a DataFrame with unique movie genre information by dropping duplicates
movie_genres = data[['movie_id'] + genre_columns].drop_duplicates(subset='movie_id')

# Merge the unique movie genres into test_results using movie_id as the key
test_results_with_genre = test_results.merge(movie_genres, on='movie_id', how='left')

print(test_results)

       user_id  movie_id  actual_rating  predicted_rating_correct_gender  \
0           57       247            4.0                         3.910340   
1          363       324            4.0                         3.297034   
2          258       404            3.0                         3.294059   
3          497       521            3.0                         3.271795   
4          279       101            5.0                         3.032650   
...        ...       ...            ...                              ...   
19995      617         3            2.0                         3.051993   
19996       59       162            4.0                         3.835208   
19997      726       277            2.0                         3.049197   
19998      167       923            2.0                         3.174374   
19999      576       364            5.0                         3.384022   

       predicted_rating_flipped_gender  
0                             3.990082  
1    

In [5]:
test_results_with_genre

Unnamed: 0,user_id,movie_id,actual_rating,predicted_rating_correct_gender,predicted_rating_flipped_gender,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,57,247,4.0,3.910340,3.990082,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,363,324,4.0,3.297034,3.371068,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,258,404,3.0,3.294059,3.387242,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,497,521,3.0,3.271795,3.316890,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,279,101,5.0,3.032650,2.958055,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,617,3,2.0,3.051993,2.991374,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,59,162,4.0,3.835208,3.992004,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,726,277,2.0,3.049197,3.111559,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
19998,167,923,2.0,3.174374,3.248888,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Assuming test_result is your prediction DataFrame with genre columns

# Define the genre columns you want to split by (excluding "unknown" if not needed)
genre_columns = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Create a dictionary to hold the filtered DataFrames for each genre
genre_test_results = {}

for genre in genre_columns:
    # Filter rows where the genre flag is 1 (i.e., the movie belongs to that genre)
    genre_test_results[genre] = test_results_with_genre[test_results[genre] == 1]

# Example: access the test results for Action movies
test_result_action = genre_test_results["Adventure"]
