### install packages

In [None]:
pip install pandas numpy matplotlib polars pyarrow plotly nbformat kaleido

In [None]:
import pandas as pd
import numpy as np
import polars as pl
import gc
import pyarrow
from pathlib import Path #to keep importing datasets from relative paths consistent across OS
import plotly
import plotly.express as px
import nbformat
import plotly.io as pio
import kaleido
import matplotlib.pyplot as plt


### define path

In [4]:
# Directory of the running script when `__file__` is defined, otherwise the
# current working directory (the case inside a notebook kernel).
if "__file__" in globals():
    current_dir = Path(__file__).parent
else:
    current_dir = Path.cwd()

# The data folder lives two directory levels above that location.
data_path = current_dir.parent.parent.joinpath("data")


## read data

In [5]:
# Load the sampled CSV from the data folder into a polars DataFrame.
df = pl.read_csv(data_path.joinpath('random_state158.csv'))

### data cleaning

In [6]:
# The `_y`-suffixed userId/timestamp columns originate from the tags data and
# are not needed; discard them here. The `_x`-suffixed counterparts still need
# their names corrected afterwards.
df = df.drop(
    ["userId_y", "timestamp_y"]
)


In [7]:
# Remove the `_x` suffix from the remaining userId and timestamp columns.
df = df.rename({"userId_x": "userId", "timestamp_x": "timestamp"})

In [8]:
# Preview the first rows of the frame.
df.head()

userId,movieId,rating,timestamp,tag,title,genres,imdbId,tmdbId
i64,i64,f64,i64,str,str,str,i64,f64
17682,31433,3.5,1544094795,"""hot guys""","""Wedding Date, The (2005)""","""Comedy|Romance""",372532,6961.0
17682,31433,3.5,1544094795,"""family""","""Wedding Date, The (2005)""","""Comedy|Romance""",372532,6961.0
17682,31433,3.5,1544094795,"""romance""","""Wedding Date, The (2005)""","""Comedy|Romance""",372532,6961.0
17682,31433,3.5,1544094795,"""watched 2007""","""Wedding Date, The (2005)""","""Comedy|Romance""",372532,6961.0
17682,31433,3.5,1544094795,"""Debra Messing""","""Wedding Date, The (2005)""","""Comedy|Romance""",372532,6961.0


In [9]:
# Display the frame's dimensions as (rows, columns).
df.shape

(103531029, 9)

### statistics: totals

In [10]:
# Tally the distinct users and distinct movies present in the frame.
unique_userId = df.get_column("userId").n_unique()
unique_movieId = df.get_column("movieId").n_unique()

print(f"Total number of unique userId: {unique_userId}, unique movieId: {unique_movieId}")

Total number of unique userId: 75598, unique movieId: 12301


## Remove "tag" column for fast summaries

In [11]:
# Discard the 'tag' column when it is present, then collapse rows that are
# identical across all remaining columns into a single row.
df = (df.drop("tag") if "tag" in df.columns else df).unique()


In [12]:
# Run a garbage-collection pass, then display the frame's dimensions
# as (rows, columns).
gc.collect()
df.shape

(149177, 8)

### convert to pandas df

In [11]:
# Convert the polars DataFrame into a pandas DataFrame; the cells below use
# the pandas API (groupby, value_counts, ...).
df = df.to_pandas()

In [None]:
# Write the tag-free sample to CSV for future use.
# index=False keeps the pandas row index out of the file, so reading it back
# with pd.read_csv does not add an extra unnamed column.
df.to_csv(data_path / 'sample_no_tags.csv', index=False)

In [None]:
# Optional shortcut: load the previously saved tag-free sample from disk.
df = pd.read_csv(data_path.joinpath("sample_no_tags.csv"))

### statistics

In [12]:
# Make plotly open figures in the web browser by default.
pio.renderers.default = "browser"  # set to "notebook" to render figures inline in the notebook instead

In [52]:

# 4a. Distribution of the number of distinct users who rated each movie
user_per_movie = (
    df.groupby('movieId')['userId']
    .nunique()
    .reset_index(name='unique_user_count')
)
# Cap the counts at 10 so every movie with 10 or more raters lands in the last bar.
user_per_movie['unique_user_count'] = user_per_movie['unique_user_count'].clip(upper=10)

fig2 = px.histogram(
    user_per_movie,
    x='unique_user_count',
    nbins=10,
    title='Distribution of Number of Ratings among movieIds',
    labels={"unique_user_count": "Ratings"},
)

# Outline each bar so adjacent bars are easy to tell apart.
fig2.update_traces(marker_line_width=1, marker_line_color='darkslategrey')

# One tick per count from 1 to 10; the last label marks the capped bucket.
tick_values = list(range(1, 11))
tick_labels = [str(value) for value in tick_values[:-1]] + ["10 =<"]

# Axis titles, custom tick positions/labels, and outward-facing tick marks.
fig2.update_xaxes(
    title_text="Number of Ratings",
    tickvals=tick_values,
    ticktext=tick_labels,
    ticks="outside",
    ticklen=10,
)
fig2.update_yaxes(title_text="Number of movieIds", ticks="outside", ticklen=10)

# Larger fonts for general text and for the main title.
fig2.update_layout(font_size=16, title_font_size=24)

fig2.show()


Python(29359) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [54]:
# 4b. Distribution of the number of distinct movies rated by each user
movie_per_user = (
    df.groupby('userId')['movieId']
    .nunique()
    .reset_index(name='unique_movie_count')
)
# Cap the counts at 10 so every user with 10 or more rated movies lands in the last bar.
movie_per_user['unique_movie_count'] = movie_per_user['unique_movie_count'].clip(upper=10)

fig2 = px.histogram(
    movie_per_user,
    x='unique_movie_count',
    nbins=10,
    title='Distribution of Number of Ratings among userIds',
    labels={"unique_movie_count": "Ratings"},
)

# Outline each bar so adjacent bars are easy to tell apart.
fig2.update_traces(marker_line_width=1, marker_line_color='darkslategrey')

# One tick per count from 1 to 10; the last label marks the capped bucket.
tick_values = list(range(1, 11))
tick_labels = [str(value) for value in tick_values[:-1]] + ["10 =<"]

# Axis titles, custom tick positions/labels, and outward-facing tick marks.
fig2.update_xaxes(
    title_text="Number of Ratings",
    tickvals=tick_values,
    ticktext=tick_labels,
    ticks="outside",
    ticklen=10,
)
fig2.update_yaxes(title_text="Number of userIds", ticks="outside", ticklen=10)

# Larger fonts for general text and for the main title.
fig2.update_layout(font_size=16, title_font_size=24)

fig2.show()


Python(29432) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [71]:
# Tabulate how often each rating value occurs.
rating_counts_df = df['rating'].value_counts().reset_index()
# Give the two columns descriptive names and order the rows by rating value.
rating_counts_df.columns = ['Rating', 'Count']
rating_counts_df = rating_counts_df.sort_values('Rating')

fig2 = px.histogram(
    rating_counts_df,
    x='Rating',
    y="Count",
    nbins=10,
    title='Number of Occurrences of each Unique Rating Value',
    labels={"Rating": "Rating"},
)

# Outline each bar so adjacent bars are easy to tell apart.
fig2.update_traces(marker_line_width=1, marker_line_color='darkslategrey')

# Ticks at 0.5, 1.0, ..., 5.0, each labelled with its own value.
tick_values = [half_steps / 2 for half_steps in range(1, 11)]
tick_labels = [str(value) for value in tick_values]

# Axis titles, custom tick positions/labels, and outward-facing tick marks.
fig2.update_xaxes(
    title_text="Rating",
    tickvals=tick_values,
    ticktext=tick_labels,
    ticks="outside",
    ticklen=10,
)
fig2.update_yaxes(title_text="Count", ticks="outside", ticklen=10)

# Larger fonts for general text and for the main title.
fig2.update_layout(font_size=16, title_font_size=24)

fig2.show()

Python(30138) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [73]:

# Mean rating given by each user
user_avg_rating = (
    df.groupby("userId")["rating"]
    .mean()
    .reset_index(name="avg_rating")
)

# Histogram of those per-user means
fig = px.histogram(
    user_avg_rating,
    x="avg_rating",
    nbins=10,
    title="Distribution of Each User's Average Ratings",
)
fig.show()


Python(30277) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


### convert timestamp to datetime

In [None]:
# Interpret the `timestamp` values as seconds since the Unix epoch and store
# the resulting datetimes in a new `datetime` column.
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')

In [None]:

# Count the ratings submitted in each calendar month (bins labelled by month end).
# The frequency is given as a MonthEnd offset object instead of the 'M' string
# alias: pandas deprecated that alias in 2.2 in favour of 'ME', while the
# offset object does not depend on either spelling.
user_activity = (
    df.groupby(pd.Grouper(key='datetime', freq=pd.offsets.MonthEnd()))['userId']
    .count()
    .reset_index(name='rating_count')
)

# Line chart of the monthly rating volume over time
fig = px.line(user_activity, x='datetime', y='rating_count',
              title='User Rating Activity Over Time',
              labels={'rating_count': 'Number of Ratings per Month'})
fig.show()


In [78]:

# Mean rating received by each movie
movie_avg = (
    df.groupby("movieId")["rating"]
    .mean()
    .reset_index(name="avg_rating")
)

# Histogram of those per-movie means
fig2 = px.histogram(
    movie_avg,
    x='avg_rating',
    nbins=10,
    title='Average Rating Value per movieId',
    labels={"avg_rating": "Average Rating"},
)

# Outline each bar so adjacent bars are easy to tell apart.
fig2.update_traces(marker_line_width=1, marker_line_color='darkslategrey')

# Ticks at 0.5, 1.0, ..., 5.0, each labelled with its own value.
tick_values = [half_steps / 2 for half_steps in range(1, 11)]
tick_labels = [str(value) for value in tick_values]

# Axis titles, custom tick positions/labels, and outward-facing tick marks.
fig2.update_xaxes(
    title_text="Rating",
    tickvals=tick_values,
    ticktext=tick_labels,
    ticks="outside",
    ticklen=10,
)
fig2.update_yaxes(title_text="Number of movieIds", ticks="outside", ticklen=10)

# Larger fonts for general text and for the main title.
fig2.update_layout(font_size=16, title_font_size=24)

fig2.show()


Python(30700) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
