In [23]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import stats

def load_data(ratings_path='extracted_ratings.csv',
              users_path='user_details.csv',
              movies_path='movie_details.csv'):
    """Load the ratings, user and movie CSVs and join them into one frame.

    Args:
        ratings_path: CSV of ratings; must contain ``user_id`` and ``movie_id``.
        users_path: CSV of user details; must contain ``user_id``.
        movies_path: CSV of movie details; must contain ``movie_id``.

    The defaults are the file names the analysis was originally written
    against, so calling ``load_data()`` with no arguments behaves as before.

    Returns:
        pandas.DataFrame: the ratings rows with user and movie columns
        attached. Both joins are left joins, so ratings with no matching
        user or movie are kept and get NaN in the joined columns.
    """
    # Read CSV files
    ratings = pd.read_csv(ratings_path)
    users = pd.read_csv(users_path)
    movies = pd.read_csv(movies_path)

    # Merge datasets: ratings is the left table in both joins
    with_users = ratings.merge(users, on='user_id', how='left')
    analysis_df = with_users.merge(movies, on='movie_id', how='left')

    return analysis_df

def analyze_demographic_representation():
    """Summarise the gender and age make-up of the users in the ratings data.

    All figures are per distinct user, not per rating row.

    Returns:
        tuple: ``(gender_dist, gender_percentages, age_stats, missing_age)``

        - gender_dist: number of distinct ``user_id`` values per gender.
        - gender_percentages: gender_dist as a percentage of its own total,
          rounded to 2 decimals (users with no gender value are not part of
          the total because ``groupby`` drops NaN keys).
        - age_stats: count/mean/std/min/max of one age value per user.
        - missing_age: number of users for whom no age value is present.
    """
    df = load_data()

    # Calculate gender distribution
    gender_dist = df.groupby('gender')['user_id'].nunique()
    total_users = gender_dist.sum()
    gender_percentages = (gender_dist / total_users * 100).round(2)

    # Calculate age distribution on one value per user (first non-null age),
    # so heavy raters do not dominate the statistics.
    age_per_user = df.groupby('user_id')['age'].first()
    age_stats = age_per_user.agg(['count', 'mean', 'std', 'min', 'max'])

    # Count users, not rating rows: the previous row-level count reported a
    # user with k ratings and no age k times, which disagreed with age_stats.
    missing_age = age_per_user.isnull().sum()

    return gender_dist, gender_percentages, age_stats, missing_age

def analyze_rating_bias():
    """Compare ratings given by male ('M') and female ('F') users.

    Returns:
        tuple: ``(rating_by_gender, t_stat, p_value)``

        - rating_by_gender: mean, std and count of ``rating`` per gender.
        - t_stat, p_value: result of an independent two-sample t-test of the
          'M' ratings against the 'F' ratings.
    """
    df = load_data()

    # Calculate rating statistics by gender (these aggregations skip NaN)
    rating_by_gender = df.groupby('gender')['rating'].agg(['mean', 'std', 'count'])

    # Drop missing ratings before testing: ttest_ind propagates NaN by
    # default, so a single missing rating would make both results NaN while
    # the summary table above silently ignored it.
    male_ratings = df.loc[df['gender'] == 'M', 'rating'].dropna()
    female_ratings = df.loc[df['gender'] == 'F', 'rating'].dropna()

    # NOTE(review): this is the pooled-variance (Student) test; with very
    # unequal group sizes Welch's test (equal_var=False) may be preferable.
    t_stat, p_value = stats.ttest_ind(male_ratings, female_ratings)

    return rating_by_gender, t_stat, p_value

def generate_enhanced_fairness_report():
    """Assemble the plain-text fairness report.

    Runs ``analyze_demographic_representation`` and ``analyze_rating_bias``
    and formats their results into one report string. The gender ratio, the
    underage finding and the rating-disparity statements are derived from the
    computed statistics instead of being fixed text, so they stay correct
    when the underlying data changes.

    Returns:
        str: the formatted multi-line report.
    """
    # Assumed age threshold for the "underage" finding -- TODO confirm policy.
    adult_age = 18
    margin = '    '

    def as_table(table):
        # to_string() yields several lines; re-indent the continuation lines
        # so the table lines up with the report's left margin.
        return table.to_string().replace('\n', '\n' + margin)

    # Perform analyses (the missing-age count is not shown in this report)
    gender_dist, gender_pct, age_stats, _missing_age = analyze_demographic_representation()
    rating_stats, t_stat, p_value = analyze_rating_bias()

    # .get() keeps the report usable when one gender is absent from the data.
    male_users = gender_dist.get('M', 0)
    female_users = gender_dist.get('F', 0)
    male_pct = gender_pct.get('M', 0.0)
    female_pct = gender_pct.get('F', 0.0)
    if male_users and female_users:
        ratio_text = f"{male_users / female_users:.1f}:1 male:female ratio"
    else:
        ratio_text = "one gender has no users"
    # NOTE(review): the "Significant"/"Severe" wording below is still not
    # conditional on how large the imbalance actually is.

    users_with_age = int(age_stats['count'])
    if users_with_age == 0:
        age_finding = "No age data available - unable to evaluate youth protection measures"
        age_limitation = "Missing age data prevents evaluation of youth protection measures"
    elif age_stats['min'] < adult_age:
        age_finding = (f"Underage users detected (youngest is {age_stats['min']:.0f})"
                       " - youth protection measures should be evaluated")
        age_limitation = "Underage users are present; youth protection measures need dedicated evaluation"
    else:
        age_finding = "No underage users detected - unable to evaluate youth protection measures"
        age_limitation = "Lack of underage users prevents evaluation of youth protection measures"

    mean_by_gender = rating_stats['mean']
    rating_gap = mean_by_gender.get('M', float('nan')) - mean_by_gender.get('F', float('nan'))
    if np.isnan(p_value):
        # A NaN p-value means the test could not be computed, which is not
        # the same thing as "no significant difference".
        rating_finding = "Insufficient data to test the difference in ratings between genders"
        rating_limitation = "Rating difference could not be tested; more data is required"
    elif p_value < 0.05:
        rating_finding = "Statistically significant difference in ratings between genders"
        rating_limitation = "Rating disparity requires further investigation into potential recommendation bias"
    else:
        rating_finding = "No significant difference in ratings between genders"
        rating_limitation = "Observed rating difference is not statistically significant; interpret it with caution"

    report = f"""
    Enhanced Fairness Analysis Report
    Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

    1. Demographic Representation (Negative Findings)
    ---------------------------------------------
    Gender Distribution:
    {as_table(gender_dist)}

    Gender Representation (%):
    {as_table(gender_pct)}

    FINDING: Significant gender imbalance - {male_pct:.1f}% male vs {female_pct:.1f}% female users

    2. Age Data Analysis
    ------------------
    Total Users with Age Data: {users_with_age}
    Average Age: {age_stats['mean']:.1f}
    Age Range: {age_stats['min']:.0f} to {age_stats['max']:.0f}

    FINDING: {age_finding}

    3. Rating Bias Analysis
    --------------------
    Rating Statistics by Gender:
    {as_table(rating_stats)}

    Statistical Analysis:
    - Rating Difference (M-F): {rating_gap:.2f}
    - T-statistic: {t_stat:.3f}
    - P-value: {p_value:.3f}

    FINDING: {rating_finding}

    Limitations and Concerns:
    1. Severe gender imbalance ({ratio_text}) limits reliability of gender-based analysis
    2. {age_limitation}
    3. {rating_limitation}

    Recommendations:
    1. Implement targeted outreach to improve gender balance in user base
    2. Develop specific testing protocols for youth protection measures
    3. Investigate potential sources of gender-based rating differences
    4. Consider collecting additional demographic data to enable intersectional analysis
    """

    return report

# Script entry point: build the enhanced report, then write it to stdout
if __name__ == "__main__":
    report_text = generate_enhanced_fairness_report()
    print(report_text)


    Enhanced Fairness Analysis Report
    Generated on: 2024-12-08 03:45:01
    
    1. Demographic Representation (Negative Findings)
    ---------------------------------------------
    Gender Distribution:
    gender
F    10
M    82
    
    Gender Representation (%):
    gender
F    10.87
M    89.13
    
    FINDING: Significant gender imbalance - 89.1% male vs 10.9% female users
    
    2. Age Data Analysis
    ------------------
    Total Users with Age Data: 92.0
    Average Age: 29.8
    Age Range: 19 to 57
    
    FINDING: No underage users detected - unable to evaluate youth protection measures
    
    3. Rating Bias Analysis
    --------------------
    Rating Statistics by Gender:
                mean       std  count
gender                           
F       3.900000  0.737865     10
M       4.170732  0.699292     82
    
    Statistical Analysis:
    - Rating Difference (M-F): 0.27
    - T-statistic: 1.149
    - P-value: 0.253
    
    FINDING: No significant differe

In [24]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import stats

def load_data(ratings_path='extracted_ratings.csv',
              users_path='user_details.csv',
              movies_path='movie_details.csv'):
    """Load the ratings, user and movie CSVs, join them, and tag each row
    with a ``demographic_group``.

    Args:
        ratings_path: CSV of ratings; must contain ``user_id`` and ``movie_id``.
        users_path: CSV of user details; must contain ``user_id``, ``gender``
            and ``occupation``.
        movies_path: CSV of movie details; must contain ``movie_id``.

    The defaults are the file names the analysis was originally written
    against, so calling ``load_data()`` with no arguments behaves as before.

    Returns:
        pandas.DataFrame: the ratings rows with user and movie columns
        attached (left joins, so unmatched ratings are kept with NaN in the
        joined columns) plus ``demographic_group`` = ``"<gender>_<occupation>"``.
    """
    # Read CSV files
    ratings = pd.read_csv(ratings_path)
    users = pd.read_csv(users_path)
    movies = pd.read_csv(movies_path)

    # Merge datasets: ratings is the left table in both joins
    with_users = ratings.merge(users, on='user_id', how='left')
    analysis_df = with_users.merge(movies, on='movie_id', how='left')

    # Create demographic group column. A missing occupation becomes
    # 'Unknown'; a missing gender is NOT filled, so such rows get a NaN group
    # and drop out of any groupby on this column.
    occupation = analysis_df['occupation'].fillna('Unknown')
    analysis_df['demographic_group'] = analysis_df['gender'] + '_' + occupation

    return analysis_df

def analyze_demographic_representation():
    """Summarise the gender, occupation and age make-up of the users.

    All figures are per distinct user, not per rating row.

    Returns:
        tuple: ``(gender_dist, gender_percentages, occupation_dist,
        age_stats, missing_age)``

        - gender_dist: number of distinct ``user_id`` values per gender.
        - gender_percentages: gender_dist as a percentage of its own total,
          rounded to 2 decimals (users with no gender value are not part of
          the total because ``groupby`` drops NaN keys).
        - occupation_dist: DataFrame with columns ``gender``, ``occupation``
          and ``user_count`` (distinct users per gender/occupation pair).
        - age_stats: count/mean/std/min/max of one age value per user.
        - missing_age: number of users for whom no age value is present.
    """
    df = load_data()

    # Calculate gender distribution
    gender_dist = df.groupby('gender')['user_id'].nunique()
    total_users = gender_dist.sum()
    gender_percentages = (gender_dist / total_users * 100).round(2)

    # Calculate occupation distribution
    occupation_dist = df.groupby(['gender', 'occupation'])['user_id'].nunique().reset_index()
    occupation_dist.columns = ['gender', 'occupation', 'user_count']

    # Calculate age distribution on one value per user (first non-null age),
    # so heavy raters do not dominate the statistics.
    age_per_user = df.groupby('user_id')['age'].first()
    age_stats = age_per_user.agg(['count', 'mean', 'std', 'min', 'max'])

    # Count users, not rating rows: the previous row-level count reported a
    # user with k ratings and no age k times, which disagreed with age_stats.
    missing_age = age_per_user.isnull().sum()

    return gender_dist, gender_percentages, occupation_dist, age_stats, missing_age

def analyze_genre_diversity():
    """Compute a normalised genre-diversity score per demographic group.

    The ``genres`` column is split on ``', '`` and exploded to one row per
    (rating, genre) pair. For each ``demographic_group`` the score is the
    number of distinct genres divided by the number of exploded rows,
    rounded to 6 decimals.

    Returns:
        pandas.Series: score indexed by ``demographic_group``.
    """
    df = load_data()

    # Split genres string and explode to separate rows. Missing genres are
    # filled with '' first so that .str.split works on every row.
    df['genres'] = df['genres'].fillna('').str.split(', ')
    df_exploded = df.explode('genres', ignore_index=True)

    # The '' placeholder is not a real genre: turn it back into NaN so that
    # nunique() (which skips NaN) does not count it as an extra genre. The
    # row itself is kept, so the denominator below is unchanged.
    is_placeholder = df_exploded['genres'] == ''
    df_exploded['genres'] = df_exploded['genres'].mask(is_placeholder)

    # Calculate genre diversity by demographic group
    genre_diversity = df_exploded.groupby('demographic_group')['genres'].nunique()
    # NOTE(review): this counts exploded (rating, genre) rows, not ratings.
    total_recommendations = df_exploded.groupby('demographic_group').size()
    normalized_diversity = (genre_diversity / total_recommendations).round(6)

    return normalized_diversity

def analyze_rating_bias():
    """Compare ratings across genders and demographic groups.

    Returns:
        tuple: ``(rating_by_gender, rating_by_demo, t_stat, p_value)``

        - rating_by_gender: mean, std and count of ``rating`` per gender.
        - rating_by_demo: the same statistics per ``demographic_group``.
        - t_stat, p_value: result of an independent two-sample t-test of the
          'M' ratings against the 'F' ratings.
    """
    df = load_data()

    # Calculate rating statistics by gender (these aggregations skip NaN)
    rating_by_gender = df.groupby('gender')['rating'].agg(['mean', 'std', 'count'])

    # Calculate rating statistics by demographic group
    rating_by_demo = df.groupby('demographic_group')['rating'].agg(['mean', 'std', 'count'])

    # Drop missing ratings before testing: ttest_ind propagates NaN by
    # default, so a single missing rating would make both results NaN while
    # the summary tables above silently ignored it.
    male_ratings = df.loc[df['gender'] == 'M', 'rating'].dropna()
    female_ratings = df.loc[df['gender'] == 'F', 'rating'].dropna()

    # NOTE(review): this is the pooled-variance (Student) test; with very
    # unequal group sizes Welch's test (equal_var=False) may be preferable.
    t_stat, p_value = stats.ttest_ind(male_ratings, female_ratings)

    return rating_by_gender, rating_by_demo, t_stat, p_value

def generate_enhanced_fairness_report():
    """Assemble the plain-text fairness report.

    Runs ``analyze_demographic_representation``, ``analyze_rating_bias`` and
    ``analyze_genre_diversity`` and formats their results into one report
    string. The gender ratio, the underage finding and the rating-disparity
    statements are derived from the computed statistics instead of being
    fixed text, so they stay correct when the underlying data changes.

    Returns:
        str: the formatted multi-line report.
    """
    # Assumed age threshold for the "underage" finding -- TODO confirm policy.
    adult_age = 18
    margin = '    '

    def as_table(table):
        # to_string() yields several lines; re-indent the continuation lines
        # so the table lines up with the report's left margin.
        return table.to_string().replace('\n', '\n' + margin)

    # Perform analyses (the missing-age count is not shown in this report)
    gender_dist, gender_pct, occupation_dist, age_stats, _missing_age = analyze_demographic_representation()
    rating_stats, rating_by_demo, t_stat, p_value = analyze_rating_bias()
    genre_diversity = analyze_genre_diversity()

    # .get() keeps the report usable when one gender is absent from the data.
    male_users = gender_dist.get('M', 0)
    female_users = gender_dist.get('F', 0)
    male_pct = gender_pct.get('M', 0.0)
    female_pct = gender_pct.get('F', 0.0)
    if male_users and female_users:
        ratio_text = f"{male_users / female_users:.1f}:1 male:female ratio"
    else:
        ratio_text = "one gender has no users"
    # NOTE(review): the "Significant"/"Severe" gender wording and the genre
    # diversity statements below are still not conditional on the data.

    users_with_age = int(age_stats['count'])
    if users_with_age == 0:
        age_finding = "No age data available - unable to evaluate youth protection measures"
        age_limitation = "Missing age data prevents evaluation of youth protection measures"
    elif age_stats['min'] < adult_age:
        age_finding = (f"Underage users detected (youngest is {age_stats['min']:.0f})"
                       " - youth protection measures should be evaluated")
        age_limitation = "Underage users are present; youth protection measures need dedicated evaluation"
    else:
        age_finding = "No underage users detected - unable to evaluate youth protection measures"
        age_limitation = "Lack of underage users prevents evaluation of youth protection measures"

    mean_by_gender = rating_stats['mean']
    rating_gap = mean_by_gender.get('M', float('nan')) - mean_by_gender.get('F', float('nan'))
    if np.isnan(p_value):
        # A NaN p-value means the test could not be computed, which is not
        # the same thing as "no significant difference".
        rating_finding = "Insufficient data to test the difference in ratings between genders"
        rating_limitation = "Rating difference could not be tested; more data is required"
    elif p_value < 0.05:
        rating_finding = "Statistically significant difference in ratings between genders"
        rating_limitation = "Rating disparity requires further investigation into potential recommendation bias"
    else:
        rating_finding = "No significant difference in ratings between genders"
        rating_limitation = "Observed rating difference is not statistically significant; interpret it with caution"

    report = f"""
    Enhanced Fairness Analysis Report
    Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

    1. Demographic Representation (Negative Findings)
    ---------------------------------------------
    Gender Distribution:
    {as_table(gender_dist)}

    Gender Representation (%):
    {as_table(gender_pct)}

    FINDING: Significant gender imbalance - {male_pct:.1f}% male vs {female_pct:.1f}% female users

    Occupation Distribution by Gender:
    {as_table(occupation_dist)}

    2. Age Data Analysis
    ------------------
    Total Users with Age Data: {users_with_age}
    Average Age: {age_stats['mean']:.1f}
    Age Range: {age_stats['min']:.0f} to {age_stats['max']:.0f}

    FINDING: {age_finding}

    3. Genre Diversity Analysis
    ------------------------
    Normalized Genre Diversity by Demographic Group:
    {as_table(genre_diversity)}

    FINDING: Varying levels of genre diversity across demographic groups

    4. Rating Bias Analysis
    --------------------
    Rating Statistics by Gender:
    {as_table(rating_stats)}

    Rating Statistics by Demographic Group:
    {as_table(rating_by_demo)}

    Statistical Analysis:
    - Rating Difference (M-F): {rating_gap:.2f}
    - T-statistic: {t_stat:.3f}
    - P-value: {p_value:.3f}

    FINDING: {rating_finding}

    Limitations and Concerns:
    1. Severe gender imbalance ({ratio_text}) limits reliability of gender-based analysis
    2. {age_limitation}
    3. {rating_limitation}
    4. Some demographic groups have limited representation, affecting analysis reliability
    5. Genre diversity varies significantly across demographic groups

    Recommendations:
    1. Implement targeted outreach to improve gender balance in user base
    2. Develop specific testing protocols for youth protection measures
    3. Investigate potential sources of gender-based rating differences
    4. Consider collecting additional demographic data for intersectional analysis
    5. Monitor and adjust recommendation algorithms to ensure equal genre diversity across groups
    """

    return report

# Script entry point: build the enhanced report, then write it to stdout
if __name__ == "__main__":
    report_text = generate_enhanced_fairness_report()
    print(report_text)


    Enhanced Fairness Analysis Report
    Generated on: 2024-12-08 03:48:51
    
    1. Demographic Representation (Negative Findings)
    ---------------------------------------------
    Gender Distribution:
    gender
F    10
M    82
    
    Gender Representation (%):
    gender
F    10.87
M    89.13
    
    FINDING: Significant gender imbalance - 89.1% male vs 10.9% female users
    
    Occupation Distribution by Gender:
       gender              occupation  user_count
0       F       academic/educator           1
1       F                  artist           1
2       F    college/grad student           2
3       F               homemaker           1
4       F  other or not specified           2
5       F         sales/marketing           2
6       F           self-employed           1
7       M            K-12 student           2
8       M       academic/educator           2
9       M          clerical/admin           1
10      M    college/grad student          27
11      M  