In [118]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import random

In [119]:
import pandas as pd

# Load the complete show catalogue; every later cell works off this frame.
df_shows = pd.read_csv(filepath_or_buffer='../data/all_data.csv')

In [120]:
# Inspect which columns the show catalogue provides.
df_shows.columns

Index(['category', 'title', 'tags', 'age_rating', 'rating_description',
       'description', 'representation', 'duration_txt', 'duration_sec',
       'first_broadcast', 'synopsis_small', 'synopsis_medium',
       'synopsis_large', 'image', 'showId'],
      dtype='object')

In [121]:
# Distinct values of the 'category' column, in order of first appearance.
genres = pd.unique(df_shows['category'])

In [122]:
# Display the distinct categories found in the catalogue.
genres

array(['CBBC', 'Films', 'Documentaries', 'From the Archives', 'Sports',
       'Entertainment', 'Comedy', 'Signed', 'Science & Nature'],
      dtype=object)

In [130]:
def generate_user_ratings_with_profile(df_shows, personas, min_history=5, max_history=15):
    """Generate synthetic show ratings for every persona in ``personas``.

    For each persona, ``n_users`` users named ``"<persona>_<i>"`` (1-based)
    are created. Every user watches between ``min_history`` and
    ``max_history`` shows (inclusive), sampled with replacement from the
    persona's preferred genres plus one or two other randomly chosen genres.
    Each watched show is rated with probability ``rating_probability`` using
    a uniform integer in 1..5; shows that end up unrated are not recorded.

    Side effect: for each of the hard-coded user prefixes ("zang",
    "michelle", "asha", "sine", "zane") a file ``<prefix>_ratings.csv`` is
    written to the current working directory containing the rows whose
    ``userId`` starts with that prefix.

    Parameters
    ----------
    df_shows : pandas.DataFrame
        Show catalogue; must provide the columns ``category`` and ``showId``.
    personas : dict
        Maps persona name to a dict with the keys ``preferred_genres``
        (list of category names), ``rating_probability`` (float compared
        against ``random.random()``) and ``n_users`` (int). The mapping and
        its lists are not modified.
    min_history, max_history : int
        Inclusive bounds for the number of shows sampled per user.

    Returns
    -------
    pandas.DataFrame
        One row per rated show with the columns ``userId``, ``showId``,
        ``rating`` and ``date_watched``. The columns are present even when
        no rows were generated.
    """
    # The set of categories does not change while generating, so compute it once.
    all_genres = set(df_shows['category'].unique())
    user_data = []

    for persona, preferences in personas.items():
        # Generating user IDs for each persona
        user_ids = [f"{persona}_{i+1}" for i in range(preferences['n_users'])]

        for user_id in user_ids:
            # Shuffle a *copy*: random.shuffle works in place and would
            # otherwise reorder the caller's persona definition.
            shuffled_genres = list(preferences['preferred_genres'])
            random.shuffle(shuffled_genres)

            # Genres outside the persona's preferences; 'CBBC' is never
            # offered as an additional genre to the children_viewer persona.
            additional_genres = all_genres - set(shuffled_genres)
            if persona == 'children_viewer':
                additional_genres -= {'CBBC'}
            additional_genres = list(additional_genres)
            random.shuffle(additional_genres)
            selected_additional_genres = additional_genres[:random.randint(1, 2)]
            genres_to_watch = shuffled_genres + selected_additional_genres

            # Determine the number of shows the user has seen within the specified range
            num_shows = random.randint(min_history, max_history)

            # Select shows to add to user's viewing history (duplicates allowed)
            candidate_shows = df_shows[df_shows['category'].isin(genres_to_watch)]
            shows_watched = candidate_shows.sample(n=num_shows, replace=True)

            for show_id in shows_watched['showId']:
                # The user leaves this show unrated, so it is not recorded.
                if random.random() > preferences['rating_probability']:
                    continue
                rating = random.randint(1, 5)
                # Generating a random date within the past year
                date_watched = datetime.now() - timedelta(days=random.randint(0, 365))
                user_data.append({
                    "userId": user_id,
                    "showId": show_id,
                    "rating": rating,
                    "date_watched": date_watched
                })

    # Declare the columns explicitly so an empty result still has the expected
    # schema and the per-user filtering below does not raise a KeyError.
    user_ratings_df = pd.DataFrame(
        user_data, columns=["userId", "showId", "rating", "date_watched"]
    )

    # Save data for specific users to separate CSV files
    specific_users = ["zang", "michelle", "asha", "sine", "zane"]
    for userId in specific_users:
        user_df = user_ratings_df[user_ratings_df['userId'].str.startswith(userId)]
        user_df.to_csv(f"{userId}_ratings.csv", index=False)

    return user_ratings_df

In [131]:
# Synthetic audience segments. Each persona defines:
#   preferred_genres   - catalogue categories the users mainly watch
#   rating_probability - chance that a watched show receives a rating
#   n_users            - number of synthetic users generated for the persona
personas = {
    "sports_fan": {
        "preferred_genres": ["Sports"],
        "rating_probability": 0.8,
        "n_users": 100
    },
    "documentary_enthusiast": {
        "preferred_genres": ["Documentaries", "From the Archives", "Science & Nature"],
        "rating_probability": 0.7,
        "n_users": 130
    },
    "comedy_lover": {
        "preferred_genres": ["Comedy"],
        "rating_probability": 0.7,
        "n_users": 120
    },
    "entertainment_buff": {
        "preferred_genres": ["Entertainment"],
        "rating_probability": 0.7,
        "n_users": 120
    },
    # Fixed copy-paste slip: this persona previously preferred "Entertainment",
    # making it indistinguishable from entertainment_buff.
    "films_freak": {
        "preferred_genres": ["Films"],
        "rating_probability": 0.7,
        "n_users": 90
    },
    # Fixed copy-paste slip: this persona previously preferred "Entertainment".
    "signed_fav": {
        "preferred_genres": ["Signed"],
        "rating_probability": 0.7,
        "n_users": 70
    },
    "children_viewer": {
        "preferred_genres": ["CBBC"],
        "rating_probability": 0.8,
        "n_users": 80
    }
}

# Our users
personas.update({
    "zang": {
        "preferred_genres": ["Entertainment", "Comedy", "From the Archives"],
        "rating_probability": 0.6,
        "n_users": 1
    },
    "michelle": {
        "preferred_genres": ["CBBC"],
        "rating_probability": 0.7,
        "n_users": 1
    },
    "asha": {
        "preferred_genres": ["Documentaries", "Entertainment", "CBBC", "Films"],
        "rating_probability": 0.7,
        "n_users": 1
    },
    "sine": {
        "preferred_genres": ["Comedy", "Sports", "Films"],
        "rating_probability": 0.8,
        "n_users": 1
    },
    "zane": {
        "preferred_genres": ["Sports", "Science & Nature", "Comedy", "Signed"],
        "rating_probability": 0.6,
        "n_users": 1
    }
})

# Seed both random sources used by the generator (the `random` module and
# NumPy's global state, which DataFrame.sample draws from by default) so the
# selected shows and ratings are reproducible across runs. The watch dates
# remain relative to the time of execution.
random.seed(42)
np.random.seed(42)

# Generate user ratings with updated personas
user_ratings = generate_user_ratings_with_profile(df_shows, personas)
user_ratings.head()

Unnamed: 0,userId,showId,rating,date_watched
0,sports_fan_1,134,3,2024-02-23 17:55:10.181891
1,sports_fan_1,226,5,2023-04-21 17:55:10.181983
2,sports_fan_1,224,4,2023-08-21 17:55:10.182056
3,sports_fan_1,140,1,2023-04-01 17:55:10.182125
4,sports_fan_1,121,5,2023-11-09 17:55:10.182191


In [132]:
# Spot-check the generated ratings of a single synthetic sports fan.
user_ratings.loc[user_ratings["userId"].eq("sports_fan_2")]

Unnamed: 0,userId,showId,rating,date_watched
13,sports_fan_2,149,5,2024-02-28 17:55:10.183677
14,sports_fan_2,125,5,2023-06-25 17:55:10.183759
15,sports_fan_2,121,3,2023-12-13 17:55:10.183831
16,sports_fan_2,143,4,2024-02-04 17:55:10.183898
17,sports_fan_2,206,3,2023-06-19 17:55:10.183963
18,sports_fan_2,191,1,2024-01-11 17:55:10.184074


In [133]:
# Spot-check the generated ratings of one of the hand-defined users.
user_ratings.loc[user_ratings["userId"].eq("michelle_1")]

Unnamed: 0,userId,showId,rating,date_watched
5157,michelle_1,172,5,2023-08-28 17:55:10.880599
5158,michelle_1,167,2,2023-04-09 17:55:10.880651
5159,michelle_1,171,1,2023-04-08 17:55:10.880696
5160,michelle_1,19,3,2023-11-27 17:55:10.880739
5161,michelle_1,10,5,2024-01-18 17:55:10.880781


In [134]:
# Destination for the complete ratings table (all personas combined).
file_path = "user_ratings.csv"

# Write the table without the positional index column.
user_ratings.to_csv(path_or_buf=file_path, index=False)