# Deep Learning Midterm - Song Release Year Prediction

## *Nfal Rifky Atsilah Maulana - 1103223106*

## Objective
Build an end-to-end deep learning regression model to predict song release years from audio features using PyTorch.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Pin the NumPy and PyTorch global random generators to the same fixed
# seed so that repeated runs draw identical random numbers.
np.random.seed(seed=42)
torch.manual_seed(seed=42)

## 2. Load Dataset

In [None]:
print("Loading dataset...")
# The file is read without a header row, so columns are addressed by position.
df = pd.read_csv('midterm-regresi-dataset.csv', header=None)

# Column 0 is the target (release year); every remaining column is a feature.
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()

# Quick sanity summary of what was loaded.
print(
    f"Dataset shape: {df.shape}",
    f"Features: {X.shape[1]}",
    f"Target range: {y.min()} - {y.max()}",
    f"Sample target values: {y[:5]}",
    sep="\n",
)

## 3. Data Exploration

In [None]:
print("\n" + "="*50)
print("DATA EXPLORATION")
print("="*50)

# 1. Summary statistics of the target (release year)
print(f"\n1. Target Statistics:")
print(f"   Mean: {np.mean(y):.2f}")
print(f"   Std: {np.std(y):.2f}")
print(f"   Min: {np.min(y)}")
print(f"   Max: {np.max(y)}")
print(f"   Median: {np.median(y)}")

# Target distribution: histogram on the left, boxplot on the right
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(y, bins=30, edgecolor='black', alpha=0.7, color='skyblue')
plt.title('Distribution of Song Release Years', fontsize=14, fontweight='bold')
plt.xlabel('Release Year')
plt.ylabel('Frequency')
plt.grid(alpha=0.3)

plt.subplot(1, 2, 2)
sns.boxplot(y=y, color='lightcoral')
plt.title('Boxplot of Release Years', fontsize=14, fontweight='bold')
# The data is passed as y=, so the boxplot is vertical and the years run
# along the y-axis; label that axis (the x-axis carries no quantity here).
plt.ylabel('Release Year')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# 2. Per-feature statistics (only the first five columns are printed)
print(f"\n2. Feature Statistics:")
print(f"   Number of features: {X.shape[1]}")
print(f"   Feature means (first 5): {np.mean(X, axis=0)[:5]}")
print(f"   Feature stds (first 5): {np.std(X, axis=0)[:5]}")

# 3. Count NaN entries in features and target
print(f"\n3. Missing Values:")
print(f"   Features: {np.isnan(X).sum()}")
print(f"   Target: {np.isnan(y).sum()}")

## 4. Data Preprocessing

In [None]:
print("\n" + "="*50)
print("DATA PREPROCESSING")
print("="*50)

# Rows without a target cannot be used for supervised training. Filling the
# missing year with 0 (what np.nan_to_num would do) injects wildly wrong
# labels, so such rows are dropped instead.
missing_target = np.isnan(y)
if missing_target.any():
    print("Handling missing values in target...")
    X = X[~missing_target]
    y = y[~missing_target]

# Decide the 80/20 split on row indices BEFORE computing any statistics, so
# that imputation means and scaler parameters are learned from training rows
# only and nothing about the validation rows leaks into training.
print("\nSplitting data into train and validation sets...")
train_idx, val_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

# Impute missing feature values with the per-column mean of the training rows.
if np.isnan(X).any():
    print("Handling missing values in features...")
    col_means = np.nanmean(X[train_idx], axis=0)
    X = np.where(np.isnan(X), col_means, X)

# Standardise features: fit on the training rows, then apply to all rows.
print("\nScaling features...")
scaler_X = StandardScaler().fit(X[train_idx])
X_scaled = scaler_X.transform(X)

# Standardise the target the same way (StandardScaler expects a 2-D column).
scaler_y = StandardScaler().fit(y[train_idx].reshape(-1, 1))
y_scaled = scaler_y.transform(y.reshape(-1, 1)).flatten()

# Materialise the split from the already-scaled arrays.
X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
y_train, y_val = y_scaled[train_idx], y_scaled[val_idx]

print(f"   Training set: {X_train.shape[0]} samples")
print(f"   Validation set: {X_val.shape[0]} samples")
print(f"   Number of features: {X_train.shape[1]}")

## 5. Prepare PyTorch DataLoaders

In [None]:
print("\n" + "="*50, "PREPARING PYTORCH DATALOADERS", "="*50, sep="\n")

# Features become float32 tensors; targets are additionally reshaped into a
# single column so they line up with the model's one-value-per-row output.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)

# Pair each feature row with its target.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Mini-batch iterators: training batches are reshuffled every epoch,
# validation batches keep their order.
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

print(
    f"   Batch size: {batch_size}",
    f"   Training batches: {len(train_loader)}",
    f"   Validation batches: {len(val_loader)}",
    sep="\n",
)

## 6. Build Deep Learning Model

In [None]:
# Section banner: blank line, rule, title, rule.
print("\n" + "="*50, "BUILDING DEEP LEARNING MODEL", "="*50, sep="\n")

class DeepRegressionModel(nn.Module):
    """Fully connected network that maps a feature vector to one regression value.

    The hidden widths narrow 256 -> 128 -> 64 -> 32. Every hidden stage is
    Linear -> BatchNorm1d -> ReLU; the first three stages are followed by
    dropout (p = 0.3, 0.3, 0.2) and the last one is not. A final
    ``Linear(32, 1)`` produces the prediction.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # (output width, dropout probability or None) for each hidden stage
        stages = [(256, 0.3), (128, 0.3), (64, 0.2), (32, None)]

        layers = []
        in_width = input_dim
        for out_width, drop_p in stages:
            layers.append(nn.Linear(in_width, out_width))
            layers.append(nn.BatchNorm1d(out_width))
            layers.append(nn.ReLU())
            if drop_p is not None:
                layers.append(nn.Dropout(drop_p))
            in_width = out_width

        # Output layer (single value for regression)
        layers.append(nn.Linear(in_width, 1))

        # Kept as one nn.Sequential under the attribute name `model` so the
        # parameter names (model.0.weight, ...) are unchanged.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the sequential stack and return its output."""
        return self.model(x)

# Size the network's input layer to the number of feature columns.
input_dim = X_train.shape[1]
model = DeepRegressionModel(input_dim)

print("Model Architecture:")
print(model)
print(f"\nTotal parameters: {sum(map(torch.numel, model.parameters())):,}")

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)
print(f"\nUsing device: {device}")

## 7. Training Setup

In [None]:
print("\n" + "="*50)
print("TRAINING SETUP")
print("="*50)

# Loss function (Mean Squared Error for regression)
criterion = nn.MSELoss()

# Adam with a small L2 penalty (weight_decay) on the weights
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Halve the learning rate when the monitored value (validation loss, lower is
# better) has not improved for 5 consecutive scheduler steps.
# The `verbose` argument is intentionally not passed: it is deprecated in
# recent PyTorch releases and, as far as I can tell, rejected outright by the
# newest ones (confirm against the installed version). The current rate can
# be read from optimizer.param_groups[0]['lr'] if it needs to be logged.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Early stopping
class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the current validation loss. A call
    counts as an improvement when the loss is at least ``min_delta`` below the
    best value seen so far; this stores the new best and resets the counter.
    Any other call increments the counter, and once the counter reaches
    ``patience`` the ``early_stop`` flag is set to True.

    A NaN loss is always treated as "no improvement". NaN compares False
    against everything, so without this it would be taken as the new best
    value and no later loss would ever register as worse.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Minimum decrease of the loss that counts as an improvement.
    """

    def __init__(self, patience=10, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0          # consecutive calls without improvement
        self.best_loss = None     # lowest loss accepted so far
        self.early_stop = False   # becomes True once patience is exhausted

    def __call__(self, val_loss):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        if val_loss != val_loss:  # only NaN is unequal to itself
            improved = False
        elif self.best_loss is None:
            improved = True
        else:
            improved = val_loss <= self.best_loss - self.min_delta

        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# Stop once the validation loss has gone 10 epochs without dropping by at
# least 0.001 below the best value seen.
early_stopper = EarlyStopping(min_delta=0.001, patience=10)

# Upper bound on the number of epochs, plus the per-epoch loss histories.
n_epochs = 100
train_losses, val_losses = [], []

## 8. Training Loop

In [None]:
import copy  # stdlib; used to snapshot the best-performing weights

print("\n" + "="*50)
print("TRAINING MODEL")
print("="*50)

# Remember the epoch with the lowest validation loss. Early stopping only
# fires after several non-improving epochs, so without this the model would
# end up with weights from well past its best epoch.
best_val_loss = float('inf')
best_state = None

for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    train_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the epoch figure is a
        # per-sample average even when the final batch is smaller.
        train_loss += loss.item() * batch_X.size(0)

    train_loss /= len(train_loader.dataset)
    train_losses.append(train_loss)

    # ---- Validation phase (eval mode, no gradient tracking) ----
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            val_loss += loss.item() * batch_X.size(0)

    val_loss /= len(val_loader.dataset)
    val_losses.append(val_loss)

    # Snapshot the weights whenever validation loss reaches a new minimum.
    # deepcopy is required because state_dict() returns references to the
    # live tensors, which later optimizer steps would overwrite.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = copy.deepcopy(model.state_dict())

    # Learning rate scheduling
    scheduler.step(val_loss)

    # Early stopping check
    early_stopper(val_loss)
    if early_stopper.early_stop:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

    # Print progress every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:3d}/{n_epochs} | "
              f"Train Loss: {train_loss:.6f} | "
              f"Val Loss: {val_loss:.6f}")

# Roll back to the best validation epoch so evaluation and saving downstream
# use the strongest weights rather than the last ones.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"\nRestored best weights (Val Loss: {best_val_loss:.6f})")

print("\nTraining completed!")

## 9. Training Visualization

In [None]:
print("\n" + "="*50, "TRAINING VISUALIZATION", "="*50, sep="\n")

# Predict on the full validation set once, in eval mode and without
# gradient tracking, before any plotting starts.
model.eval()
with torch.no_grad():
    val_predictions_scaled = model(X_val_tensor.to(device)).cpu().numpy()

# Undo the target scaling so both arrays are back in the original units.
val_actual = scaler_y.inverse_transform(y_val_tensor.numpy())
val_predictions = scaler_y.inverse_transform(val_predictions_scaled)

plt.figure(figsize=(12, 5))

# Left panel: per-epoch training and validation loss curves.
plt.subplot(1, 2, 1)
plt.plot(train_losses, linewidth=2, label='Training Loss')
plt.plot(val_losses, linewidth=2, label='Validation Loss')
plt.title('Training and Validation Loss', fontsize=14, fontweight='bold')
plt.xlabel('Epoch', fontsize=12, fontweight='bold')
plt.ylabel('Loss (MSE)', fontsize=12, fontweight='bold')
plt.grid(alpha=0.3)
plt.legend()

# Right panel: predicted vs. actual year, with the y = x diagonal marking
# where a perfect prediction would fall.
plt.subplot(1, 2, 2)
plt.scatter(val_actual, val_predictions, color='steelblue', alpha=0.6)
plt.plot(
    [val_actual.min(), val_actual.max()],
    [val_actual.min(), val_actual.max()],
    'r--',
    label='Perfect Prediction',
    linewidth=2,
)
plt.title('Predictions vs Actual', fontsize=14, fontweight='bold')
plt.xlabel('Actual Release Year', fontsize=12, fontweight='bold')
plt.ylabel('Predicted Release Year', fontsize=12, fontweight='bold')
plt.grid(alpha=0.3)
plt.legend()

plt.tight_layout()
plt.show()

## 10. Model Evaluation

In [None]:
print("\n" + "="*50, "MODEL EVALUATION", "="*50, sep="\n")

# Forward passes only: eval mode, autograd disabled.
model.eval()
with torch.no_grad():
    val_predictions_scaled = model(X_val_tensor.to(device)).cpu().numpy()
    train_predictions_scaled = model(X_train_tensor.to(device)).cpu().numpy()

# Map the scaled predictions and scaled targets back to the original units.
# Validation set
val_predictions = scaler_y.inverse_transform(val_predictions_scaled)
val_actual = scaler_y.inverse_transform(y_val_tensor.numpy())

# Training set
train_predictions = scaler_y.inverse_transform(train_predictions_scaled)
train_actual = scaler_y.inverse_transform(y_train_tensor.numpy())

# Calculate metrics
def calculate_metrics(actual, predicted):
    """Return ``(mse, rmse, mae, r2)`` comparing ``predicted`` against ``actual``.

    RMSE is the square root of the MSE; the other three values come straight
    from the corresponding scikit-learn metric functions.
    """
    squared_error = mean_squared_error(actual, predicted)
    return (
        squared_error,
        np.sqrt(squared_error),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )

# Score both splits with the same helper.
(train_mse, train_rmse, train_mae, train_r2) = calculate_metrics(
    actual=train_actual, predicted=train_predictions
)
(val_mse, val_rmse, val_mae, val_r2) = calculate_metrics(
    actual=val_actual, predicted=val_predictions
)

# Side-by-side table: one row per metric, one column per split.
print("\n1. Performance Metrics:", "-" * 40, sep="\n")
print(f"{'Metric':<10} {'Training':>12} {'Validation':>12}")
print("-" * 40)
print(
    f"{'MSE':<10} {train_mse:>12.4f} {val_mse:>12.4f}",
    f"{'RMSE':<10} {train_rmse:>12.4f} {val_rmse:>12.4f}",
    f"{'MAE':<10} {train_mae:>12.4f} {val_mae:>12.4f}",
    f"{'R²':<10} {train_r2:>12.4f} {val_r2:>12.4f}",
    sep="\n",
)

# Plain-language reading of the validation metrics.
print("\n2. Interpretation of Results:", "-" * 40, sep="\n")
print(f"• Mean Absolute Error (MAE): {val_mae:.2f} years")
print(f"  → On average, predictions are off by {val_mae:.2f} years")
print(f"• Root Mean Squared Error (RMSE): {val_rmse:.2f} years")
print(f"• R² Score: {val_r2:.4f}")

# Bucket R² into good (> 0.7), moderate (> 0.5) or poor (everything else).
if val_r2 > 0.7:
    print(f"  → Good fit (explains {val_r2*100:.1f}% of variance)")
elif val_r2 > 0.5:
    print(f"  → Moderate fit (explains {val_r2*100:.1f}% of variance)")
else:
    print(f"  → Poor fit (explains only {val_r2*100:.1f}% of variance)")

# Signed residuals (actual minus predicted) on the validation set.
errors = val_actual - val_predictions
print("• Error Statistics:")
print(
    f"  Mean error: {np.mean(errors):.2f} years",
    f"  Std of errors: {np.std(errors):.2f} years",
    sep="\n",
)

## 11. Error Analysis

In [None]:
print("\n" + "="*50)
print("ERROR ANALYSIS")
print("="*50)

plt.figure(figsize=(12, 10))

# Top-left: histogram of signed errors, with a reference line at zero
plt.subplot(2, 2, 1)
plt.hist(errors, bins=30, edgecolor='black', alpha=0.7, color='lightcoral')
plt.xlabel('Prediction Error (Years)', fontsize=12, fontweight='bold')
plt.ylabel('Frequency', fontsize=12, fontweight='bold')
plt.title('Distribution of Prediction Errors', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.grid(alpha=0.3)

# Top-right: error as a function of the true year
plt.subplot(2, 2, 2)
plt.scatter(val_actual, errors, alpha=0.6, color='steelblue')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Actual Release Year', fontsize=12, fontweight='bold')
plt.ylabel('Prediction Error', fontsize=12, fontweight='bold')
plt.title('Errors vs Actual Values', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)

# Bottom-left: first 100 samples, each with a vertical segment joining the
# perfect-prediction diagonal (y = actual) to the predicted value.
plt.subplot(2, 2, 3)
# One vlines call draws every segment at once instead of issuing a separate
# plot call per sample. np.ravel flattens the column vectors to the 1-D
# sequences vlines expects.
plt.vlines(
    np.ravel(val_actual[:100]),
    np.ravel(val_actual[:100]),
    np.ravel(val_predictions[:100]),
    colors='gray',
    alpha=0.5,
)
plt.scatter(val_actual[:100], val_predictions[:100], alpha=0.7, color='green')
plt.plot([val_actual.min(), val_actual.max()],
         [val_actual.min(), val_actual.max()],
         'r--', linewidth=2)
plt.xlabel('Actual Release Year', fontsize=12, fontweight='bold')
plt.ylabel('Predicted Release Year', fontsize=12, fontweight='bold')
plt.title('Predictions with Error Bars (First 100 samples)', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)

# Bottom-right: residuals against the predicted year
plt.subplot(2, 2, 4)
plt.scatter(val_predictions, errors, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Release Year', fontsize=12, fontweight='bold')
plt.ylabel('Residuals', fontsize=12, fontweight='bold')
plt.title('Residual Plot', fontsize=14, fontweight='bold')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 12. Save Model and Results

In [None]:
print("\n" + "="*50, "SAVING MODEL AND RESULTS", "="*50, sep="\n")

# Bundle the weights, optimizer state, input width and both fitted scalers
# into a single checkpoint file.
# NOTE(review): the scalers are stored as whole Python objects, so reading
# this file back presumably needs full unpickling (for example
# torch.load(..., weights_only=False)) from a trusted source only - confirm.
torch.save(
    obj=dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        input_dim=input_dim,
        scaler_X=scaler_X,
        scaler_y=scaler_y,
    ),
    f='deep_regression_model.pth',
)
print("✓ Model saved as 'deep_regression_model.pth'")

# Per-sample validation results: actual year, predicted year, |error|.
predictions_df = pd.DataFrame(
    dict(
        Actual_Year=val_actual.flatten(),
        Predicted_Year=val_predictions.flatten(),
        Absolute_Error=np.abs(errors.flatten()),
    )
)
predictions_df.to_csv('predictions_results.csv', index=False)
print("✓ Predictions saved as 'predictions_results.csv'")

# One row per split with the four summary metrics.
metrics_df = pd.DataFrame(
    dict(
        Dataset=['Training', 'Validation'],
        MSE=[train_mse, val_mse],
        RMSE=[train_rmse, val_rmse],
        MAE=[train_mae, val_mae],
        R2=[train_r2, val_r2],
    )
)
metrics_df.to_csv('model_metrics.csv', index=False)
print("✓ Metrics saved as 'model_metrics.csv'")

# Closing banner with the author details.
print("\n" + "="*50, "END OF DEEP LEARNING MIDTERM PROJECT", "="*50, sep="\n")
print(
    "Student: Nfal Rifky Atsilah Maulana",
    "NIM: 1103223106",
    "="*50,
    sep="\n",
)