<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/Notebooks/01_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
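# Data Preparation
# Loads the raw training data from Google Drive, drops rows with missing values,
# converts column dtypes, adds time-of-listen features, plots a few distributions,
# and saves the result as cleaned_data.csv.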


In [None]:
## 1. Setup
# Libraries for tabular data handling (pandas, numpy) and plotting (matplotlib, seaborn).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Google Colab helper: mounts Google Drive at /content/drive so the CSV paths
# used by the later cells resolve. This cell therefore needs the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## 2. Data Loading
# Location of the raw training set inside the mounted Drive folder.
train_csv_path = "/content/drive/MyDrive/Recommender_Systems/train.csv"
data = pd.read_csv(train_csv_path)

# Quick overview: column dtypes and non-null counts, then the first rows.
data.info()
display(data.head())

In [None]:
## 3. Handling Missing Values
# Count the missing values in every column before removing anything.
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values)

# Drop every row that has a missing value in ANY column (no threshold is
# applied and no columns are dropped). The frame is reassigned instead of being
# mutated with inplace=True, and the number of removed rows is reported so the
# data loss is visible in the notebook output.
rows_before = len(data)
data = data.dropna()
print(f"Dropped {rows_before - len(data)} of {rows_before} rows containing missing values.")

In [None]:
## 4. Convert Data Types
# ts_listen is interpreted as a number of seconds (unit='s') and converted to datetime.
data['ts_listen'] = pd.to_datetime(data['ts_listen'], unit='s')

# release_date is parsed as YYYYMMDD. errors='coerce' replaces every value that
# cannot be parsed with NaT instead of raising. Because this runs after the
# missing-value handling in section 3, those NaT values would otherwise reach
# the saved dataset unnoticed, so count and report them here. The rows are kept.
release_missing_before = data['release_date'].isna().sum()
data['release_date'] = pd.to_datetime(data['release_date'], format='%Y%m%d', errors='coerce')
release_coerced = data['release_date'].isna().sum() - release_missing_before
print(f"release_date values coerced to NaT by this conversion: {release_coerced}")

# Ensure categorical columns are stored with the pandas 'category' dtype.
categorical_columns = [
    'platform_name',
    'platform_family',
    'listen_type',
    'user_gender',
    'context_type',
]
for column in categorical_columns:
    data[column] = data[column].astype('category')

# Target flag as boolean. NOTE(review): assumes is_listened only holds 0/1;
# any other non-zero value would also become True — confirm against the data.
data['is_listened'] = data['is_listened'].astype(bool)

display(data.dtypes)

In [None]:
## 5. Feature Engineering
# Derive time-of-listen features from the listen timestamp.
# NOTE(review): ts_listen appears to be timezone-naive after the conversion in
# section 4, so the hour is probably not the listener's local hour — confirm.
listen_time = data['ts_listen'].dt
data['listen_hour'] = listen_time.hour
data['listen_dayofweek'] = listen_time.dayofweek  # pandas convention: Monday=0 ... Sunday=6

# Show the source timestamp next to the derived columns as a sanity check.
time_feature_columns = ['ts_listen', 'listen_hour', 'listen_dayofweek']
display(data[time_feature_columns].head())

In [None]:
## 6. Data Visualization
# Class balance of the target variable.
target_ax = sns.countplot(x='is_listened', data=data)
target_ax.set_title('Distribution of is_listened')
plt.show()

# Histogram of user ages (30 bins).
age_ax = sns.histplot(data['user_age'], bins=30)
age_ax.set_title('User Age Distribution')
plt.show()

# The ten most frequent genre ids, as a bar chart.
genre_counts = data['genre_id'].value_counts()
top_genres = genre_counts.head(10)
top_genres.plot.bar(title='Top 10 Genres')
plt.show()

# Persist the prepared frame for the following notebooks (row index omitted).
# NOTE(review): CSV stores plain text, so the category/datetime/bool dtypes set
# earlier will likely have to be re-applied when this file is read back.
cleaned_csv_path = '/content/drive/MyDrive/Recommender_Systems/cleaned_data.csv'
data.to_csv(cleaned_csv_path, index=False)

print("Data preparation complete. Cleaned dataset saved.")