<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/Notebooks/03_Exploratory_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## 1. Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.preprocessing import LabelEncoder

# Attach Google Drive to the Colab filesystem; the dataset is read from
# a path under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
## 2. Load Preprocessed Data
# Read the preprocessed training set from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Recommender_Systems/train_preprocessed.csv")

# Columns that are label-encoded and then stored with pandas' category dtype.
categorical_cols = ['genre_id', 'context_type', 'platform_name', 'platform_family', 'listen_type', 'user_gender']

# One encoder per column, kept so the integer labels can be mapped back later.
label_encoders = {column: LabelEncoder() for column in categorical_cols}

for column, encoder in label_encoders.items():
    # fit_transform yields integer labels; wrapping them in a Categorical
    # gives the column the category dtype.
    integer_labels = encoder.fit_transform(df[column])
    df[column] = pd.Categorical(integer_labels)

# Summarise the frame: dtypes and memory first, then the first rows.
print("🔹 Dataset Overview:")
df.info()
display(df.head())


🔹 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7123897 entries, 0 to 7123896
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   genre_id          category
 1   media_id          int64   
 2   album_id          int64   
 3   context_type      category
 4   platform_name     category
 5   platform_family   category
 6   media_duration    float64 
 7   listen_type       category
 8   user_gender       category
 9   user_id           int64   
 10  artist_id         int64   
 11  user_age          float64 
 12  is_listened       bool    
 13  listen_hour       int64   
 14  listen_dayofweek  int64   
 15  song_age          float64 
dtypes: bool(1), category(6), float64(3), int64(6)
memory usage: 543.6 MB


Unnamed: 0,genre_id,media_id,album_id,context_type,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,listen_hour,listen_dayofweek,song_age
0,1282,222606,41774,12,1,0,0.491379,0,0,9241,55164,0.916667,False,13,3,4533.0
1,1290,250467,43941,0,2,1,0.267241,0,0,16547,55830,1.0,True,22,2,3927.0
2,14,305197,48078,1,2,1,0.172414,1,1,7665,2704,0.916667,True,13,5,859.0
3,6,900502,71521,0,0,0,0.564655,0,1,1580,938,1.0,False,9,5,5871.0
4,6,542335,71718,0,0,0,0.176724,0,1,1812,2939,0.5,True,18,5,3186.0


In [7]:
## 3. Checking for Missing Values & Duplicates
# Per-column count of null entries; only columns with at least one are printed.
print("\n🔹 Checking for missing values:")
missing_values = df.isna().sum()
columns_with_gaps = missing_values.gt(0)
print(missing_values.loc[columns_with_gaps])

# A row counts as a duplicate when every column matches an earlier row.
print("\n🔹 Checking for duplicate rows:")
is_repeat = df.duplicated()
duplicate_rows = is_repeat.sum()
print(f"Number of duplicate rows: {duplicate_rows}")


🔹 Checking for missing values:
Series([], dtype: int64)

🔹 Checking for duplicate rows:
Number of duplicate rows: 251159


In [None]:
## 4. Distribution of Numerical Features
num_cols = ['media_duration', 'user_age', 'listen_hour', 'listen_dayofweek', 'song_age']

fig = plt.figure(figsize=(12, 8))
for i, col in enumerate(num_cols, 1):
    ax = fig.add_subplot(2, 3, i)
    if pd.api.types.is_integer_dtype(df[col]):
        # Integer-valued features (listen_hour, listen_dayofweek) get one bar
        # per value. A fixed bins=30 would place bin edges between integers,
        # producing empty or uneven bars that misrepresent the distribution.
        sns.histplot(df[col], discrete=True, ax=ax)
    else:
        # Continuous features keep the 30-bin histogram with a KDE overlay.
        # NOTE(review): the KDE is fitted on every row of df; consider
        # sampling if this cell turns out to be slow.
        sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
plt.show()


In [None]:
## 5. Categorical Feature Counts
cat_cols = ['genre_id', 'context_type', 'platform_name', 'platform_family', 'listen_type', 'user_gender', 'is_listened']

# Columns with more levels than this show only their most frequent levels.
# Without the cap, a high-cardinality column such as genre_id would draw one
# bar per level in a small subplot, which is unreadable and slow to render.
MAX_LEVELS_SHOWN = 15

fig = plt.figure(figsize=(12, 10))
for i, col in enumerate(cat_cols, 1):
    ax = fig.add_subplot(3, 3, i)

    # value_counts() is sorted by frequency, so its leading index entries are
    # the most common levels.
    level_counts = df[col].value_counts()
    if len(level_counts) > MAX_LEVELS_SHOWN:
        shown_levels = list(level_counts.index[:MAX_LEVELS_SHOWN])
        title = f'Count of {col} (top {MAX_LEVELS_SHOWN})'
    else:
        # Low-cardinality columns keep seaborn's default level ordering.
        shown_levels = None
        title = f'Count of {col}'

    sns.countplot(x=df[col], order=shown_levels, ax=ax)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
## 6. Correlation Heatmap
# Request numeric (int/float/bool) columns explicitly. The default for
# numeric_only is not the same across pandas versions, so leaving it implicit
# makes the set of columns in the heatmap depend on the installed pandas.
# The label-encoded category columns are excluded on purpose: their integer
# codes are arbitrary, so a linear correlation on them is not meaningful.
# NOTE(review): media_id, album_id, user_id and artist_id are int64 and are
# still included; they look like identifiers, so read their coefficients
# with care.
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
## 7. Listening Behavior by Hour & Day
# Readable names for the boolean is_listened flag. Mapping the hue values
# before plotting lets seaborn build the legend from the data itself, rather
# than relabelling legend handles by position with plt.legend([...]), which
# does not check which hue level each handle belongs to.
LISTEN_LABELS = {False: "Not Listened", True: "Listened"}


def plot_listen_counts(frame, column, title, xlabel):
    """Draw a count plot of `column`, split by whether the track was listened to.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `column` and a boolean `is_listened` column.
    column : str
        Name of the column placed on the x-axis.
    title : str
        Figure title.
    xlabel : str
        Label for the x-axis.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.countplot(
        x=frame[column],
        hue=frame['is_listened'].map(LISTEN_LABELS),
        # Fix the legend/bar order to "Not Listened" then "Listened".
        hue_order=list(LISTEN_LABELS.values()),
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    plt.show()


plot_listen_counts(
    df,
    'listen_hour',
    "Listening Behavior by Hour",
    "Hour of the Day",
)

plot_listen_counts(
    df,
    'listen_dayofweek',
    "Listening Behavior by Day of the Week",
    "Day of the Week (0=Monday, 6=Sunday)",
)

In [None]:
## 8. Listening Behavior by Genre
# The ten most frequent genre labels (value_counts is sorted by frequency).
top_genres = df['genre_id'].value_counts().index[:10]

# Use one boolean mask for both x and hue so the two series cover exactly the
# same rows, instead of relying on implicit index alignment between a
# filtered x and an unfiltered hue.
in_top_genres = df['genre_id'].isin(top_genres)
listen_labels = {False: "Not Listened", True: "Listened"}

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(
    x=df.loc[in_top_genres, 'genre_id'],
    hue=df.loc[in_top_genres, 'is_listened'].map(listen_labels),
    # genre_id is categorical, so the filtered series still carries every
    # original level; an explicit order restricts the axis to the top 10.
    order=list(top_genres),
    # Mapping the hue values up front means the legend labels come from the
    # data rather than from a positional plt.legend([...]) override.
    hue_order=list(listen_labels.values()),
    ax=ax,
)
ax.set_title("Top 10 Genres - Listened vs Not Listened")
ax.set_xlabel("Genre ID")
ax.set_ylabel("Count")
plt.show()

print("\n✅ EDA Complete! Insights have been visualized.")