In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import requests
import zipfile
import io
import os

In [5]:
def download_movielens(
    url="https://files.grouplens.org/datasets/movielens/ml-latest.zip",
    dest_dir="data/raw",
    timeout=60,
):
    """Download a MovieLens zip archive and extract it under ``dest_dir``.

    Parameters
    ----------
    url : str
        Location of the zip archive. Defaults to the ``ml-latest`` archive.
    dest_dir : str
        Directory the archive is extracted into (created by ``extractall``
        if it does not exist).
    timeout : float
        Seconds to wait for the server to connect / send data before
        ``requests`` gives up. Without a timeout the call can hang forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to ZipFile,
    # which would otherwise surface as a misleading BadZipFile.
    response.raise_for_status()
    # The whole archive is held in memory before extraction.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall(dest_dir)

def load_data(data_dir="data/raw/ml-latest", like_threshold=3.5, top_n_movies=None):
    """Load MovieLens ratings and build a binary user x movie like-matrix.

    Parameters
    ----------
    data_dir : str
        Directory containing ``ratings.csv`` and ``movies.csv``.
    like_threshold : float
        A rating ``>= like_threshold`` is encoded as 1 (like), anything
        lower as 0 (dislike).
    top_n_movies : int or None
        If given, only the ``top_n_movies`` movies with the most ratings are
        kept *before* pivoting. The pivot is dense (users x movies), so this
        is the knob that keeps memory bounded on large datasets. Users with
        no rating among the kept movies do not appear in the result.
        ``None`` (default) keeps every movie.

    Returns
    -------
    interaction_matrix : pandas.DataFrame
        Indexed by ``userId`` with one column per ``movieId``. Cells are 1
        for a like and 0 otherwise; note that "not rated" and "disliked"
        are both encoded as 0.
    movies : pandas.DataFrame
        The contents of ``movies.csv``, unmodified.
    """
    ratings = pd.read_csv(
        os.path.join(data_dir, "ratings.csv"),
        # Only these columns are used; skipping the rest saves memory.
        usecols=["userId", "movieId", "rating"],
    )
    movies = pd.read_csv(os.path.join(data_dir, "movies.csv"))

    if top_n_movies is not None:
        most_rated = ratings["movieId"].value_counts().nlargest(top_n_movies).index
        ratings = ratings[ratings["movieId"].isin(most_rated)]

    # Convert ratings to binary (like/dislike) without mutating the raw frame
    likes = ratings.assign(
        binary_rating=(ratings["rating"] >= like_threshold).astype(int)
    )

    # Create user-movie interaction matrix; unrated pairs become 0
    interaction_matrix = likes.pivot(
        index="userId",
        columns="movieId",
        values="binary_rating",
    ).fillna(0)

    return interaction_matrix, movies


In [6]:

class IsingModel:
    """Sparse Ising model over binary (0/1) indicators, fitted by pseudo-likelihood.

    The exact Ising likelihood needs the partition function, which is
    intractable beyond a handful of variables. Instead, each variable is
    modelled conditionally on all the others,

        P(x_i = 1 | x_-i) = sigmoid(h_i + sum_{j != i} J_ij * x_j),

    and the sum of the negative conditional log-likelihoods is minimised with
    an L1 penalty on the couplings.

    Attributes
    ----------
    lambda_reg : float
        L1 penalty weight on the couplings. The data term is a *sum* over
        samples, so the effective strength shrinks as the sample count grows.
    J : numpy.ndarray or None
        After ``fit``: symmetric ``(n, n)`` coupling matrix with zero diagonal.
    h : numpy.ndarray or None
        After ``fit``: ``(n,)`` external field (per-variable bias).

    Notes
    -----
    The number of parameters is quadratic in the number of columns of ``X``;
    restrict the data to a manageable set of movies before fitting.
    """

    def __init__(self, lambda_reg=0.1):
        self.lambda_reg = lambda_reg
        self.J = None  # Coupling matrix
        self.h = None  # External field

    @staticmethod
    def _pseudo_nll(J, h, X):
        """Return (loss, d loss/dJ, d loss/dh) of the negative log pseudo-likelihood.

        The diagonal of ``J`` is ignored (a variable must not predict itself),
        and the returned gradient has a zero diagonal accordingly.
        """
        J_off = J - np.diag(np.diag(J))
        # field[s, i] = h_i + sum_{j != i} J_ij * x_sj
        field = X @ J_off.T + h
        # log(1 + exp(f)) - x * f, evaluated without overflow
        loss = np.sum(np.logaddexp(0.0, field) - X * field)
        # sigmoid(f) - x, with the sigmoid written via tanh for stability
        residual = 0.5 * (1.0 + np.tanh(0.5 * field)) - X
        grad_J = residual.T @ X
        np.fill_diagonal(grad_J, 0.0)
        grad_h = residual.sum(axis=0)
        return loss, grad_J, grad_h

    def negative_log_likelihood(self, params, X):
        """Compute the L1-regularised negative log pseudo-likelihood.

        Parameters
        ----------
        params : array-like, length ``n**2 + n``
            Row-major flattened ``J`` followed by ``h`` (``n = X.shape[1]``).
        X : array-like, shape ``(n_samples, n)``
            Binary (0/1) observations.

        Returns
        -------
        float
            Data term plus ``lambda_reg * sum(|J|)``. Unlike a bare energy
            sum, the data term is non-negative, so the objective is bounded
            below and has a well-defined minimiser.
        """
        X = np.asarray(X, dtype=float)
        params = np.asarray(params, dtype=float)
        n_movies = X.shape[1]
        J = params[:n_movies**2].reshape(n_movies, n_movies)
        h = params[n_movies**2:]

        loss, _, _ = self._pseudo_nll(J, h, X)

        # Add regularization
        reg_term = self.lambda_reg * np.sum(np.abs(J))

        return loss + reg_term

    def fit(self, X, max_iter=1000):
        """Fit the Ising model to the data.

        The L1 penalty is not differentiable at zero, so ``J`` is written as
        ``J_pos - J_neg`` with both parts constrained to be non-negative.
        The penalty then becomes the linear term ``lambda * sum(J_pos + J_neg)``
        and the bound constraints of L-BFGS-B produce exact zeros.

        Parameters
        ----------
        X : array-like, shape ``(n_samples, n)``
            Binary (0/1) observations.
        max_iter : int
            Iteration cap passed to the optimiser.

        Returns
        -------
        IsingModel
            ``self``, with ``J`` (symmetrised, zero diagonal) and ``h`` set.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_movies)")
        n_movies = X.shape[1]
        n_pairs = n_movies**2
        lam = self.lambda_reg

        def objective(z):
            J_pos = z[:n_pairs].reshape(n_movies, n_movies)
            J_neg = z[n_pairs:2 * n_pairs].reshape(n_movies, n_movies)
            h = z[2 * n_pairs:]
            loss, grad_J, grad_h = self._pseudo_nll(J_pos - J_neg, h, X)
            value = loss + lam * (J_pos.sum() + J_neg.sum())
            # Diagonal entries see only the +lam term, so they stay pinned
            # at their lower bound of zero.
            grad = np.concatenate(
                [(grad_J + lam).ravel(), (lam - grad_J).ravel(), grad_h]
            )
            return value, grad

        # Initialize parameters
        initial_params = np.zeros(2 * n_pairs + n_movies)
        bounds = [(0.0, None)] * (2 * n_pairs) + [(None, None)] * n_movies

        # Optimize parameters (jac=True: objective returns value and gradient)
        result = minimize(
            objective,
            initial_params,
            jac=True,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': max_iter}
        )

        # Store optimized parameters
        J_pos = result.x[:n_pairs].reshape(n_movies, n_movies)
        J_neg = result.x[n_pairs:2 * n_pairs].reshape(n_movies, n_movies)
        J_hat = J_pos - J_neg
        np.fill_diagonal(J_hat, 0.0)
        # Each row is estimated by its own conditional regression, so J_ij and
        # J_ji can differ slightly; average them into one symmetric coupling.
        self.J = 0.5 * (J_hat + J_hat.T)
        self.h = result.x[2 * n_pairs:].copy()

        return self

    def predict(self, X):
        """Predict each entry of ``X`` from the other entries in the same row.

        Parameters
        ----------
        X : array-like, shape ``(n_samples, n)``
            Binary (0/1) observations.

        Returns
        -------
        numpy.ndarray
            Same shape and dtype as ``X``; 1 where the local field
            ``J @ x + h`` is positive, 0 otherwise.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.J is None or self.h is None:
            raise ValueError("Model has not been fitted yet")

        X = np.asarray(X)
        # Row s, column k: sum_j J[k, j] * X[s, j] + h[k]
        local_field = X @ self.J.T + self.h
        # Predict based on sign of local field
        return (local_field > 0).astype(X.dtype)

In [7]:

def plot_coupling_matrix(J, movies, interaction_matrix, top_n=10):
    """Plot the coupling sub-matrix for the ``top_n`` most-liked movies.

    Parameters
    ----------
    J : array-like, shape ``(n, n)``
        Coupling matrix whose rows/columns follow the column order of
        ``interaction_matrix``.
    movies : pandas.DataFrame
        Must contain ``movieId`` and ``title`` columns.
    interaction_matrix : pandas.DataFrame
        Binary user x movie matrix; its column labels are movie ids.
    top_n : int
        Number of movies to show.

    Raises
    ------
    ValueError
        If the shape of ``J`` does not match the number of columns of
        ``interaction_matrix``.
    """
    J = np.asarray(J)
    n_movies = interaction_matrix.shape[1]
    if J.shape != (n_movies, n_movies):
        raise ValueError(
            f"J has shape {J.shape} but interaction_matrix has {n_movies} movie columns"
        )

    # Column sums of the 0/1 matrix give the number of likes per movie
    movie_counts = interaction_matrix.sum(axis=0)
    top_movies = movie_counts.nlargest(top_n).index

    # top_movies holds movieId *labels*; J must be indexed by column *position*
    positions = interaction_matrix.columns.get_indexer(top_movies)

    # Look titles up id by id so the tick labels follow the same order as the
    # rows/columns of the plotted sub-matrix; fall back to the id if unknown.
    title_by_id = dict(zip(movies['movieId'], movies['title']))
    movie_titles = [title_by_id.get(movie_id, str(movie_id)) for movie_id in top_movies]

    # Plot coupling matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        J[np.ix_(positions, positions)],
        xticklabels=movie_titles,
        yticklabels=movie_titles,
        cmap='RdBu_r',
        center=0
    )
    plt.title('Movie Coupling Matrix')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

def plot_model_performance(results):
    """Draw MSE and sparsity as functions of the regularization strength.

    ``results`` maps each lambda value to a dict holding at least the keys
    ``'mse'`` and ``'sparsity'``. Both curves share one set of axes.
    """
    lambdas = list(results.keys())
    # (metric key, matplotlib format string, legend label)
    series = (
        ('mse', 'bo-', 'MSE'),
        ('sparsity', 'ro-', 'Sparsity'),
    )

    plt.figure(figsize=(10, 6))
    for metric, fmt, legend_label in series:
        values = [entry[metric] for entry in results.values()]
        plt.plot(lambdas, values, fmt, label=legend_label)
    plt.xlabel('Regularization Strength (λ)')
    plt.ylabel('Value')
    plt.title('Model Performance vs Regularization')
    plt.legend()
    plt.show()

: 

In [None]:


# Configuration: everything a reader might want to tune lives here.
N_TOP_MOVIES = 50    # the model has O(n_movies^2) parameters, so keep this small
MAX_USERS = 5000     # cap on sampled users so a full re-run stays fast
SEED = 42
RATINGS_PATH = "data/raw/ml-latest/ratings.csv"

# Load the data in this cell so it runs on a fresh kernel
# (Restart Kernel -> Run All); download the archive only if it is missing.
if not os.path.exists(RATINGS_PATH):
    download_movielens()
interaction_matrix, movies = load_data()

# Restrict to the most-liked movies and a reproducible sample of users.
top_movie_ids = interaction_matrix.sum(axis=0).nlargest(N_TOP_MOVIES).index
interaction_subset = interaction_matrix[top_movie_ids]
if len(interaction_subset) > MAX_USERS:
    interaction_subset = interaction_subset.sample(n=MAX_USERS, random_state=SEED)

# Prepare data for training
X = interaction_subset.values
X_train, X_test = train_test_split(X, test_size=0.2, random_state=SEED)

# Train model with different regularization strengths
lambda_values = [0.01, 0.1, 1.0]
models = {}
results = {}

for lambda_reg in lambda_values:
    print(f"Training model with lambda={lambda_reg}")
    model = IsingModel(lambda_reg=lambda_reg)
    model.fit(X_train)
    models[lambda_reg] = model

    # Evaluate model; on 0/1 targets and predictions the MSE is the
    # fraction of mismatched entries.
    predictions = model.predict(X_test)
    mse = mean_squared_error(X_test, predictions)
    results[lambda_reg] = {
        'mse': mse,
        'sparsity': np.mean(np.abs(model.J) < 1e-6)
    }

# Visualize results
plot_model_performance(results)

# Plot coupling matrix for the best model; pass the same subset the model
# was trained on so its columns line up with the rows/columns of J.
best_lambda = min(results.keys(), key=lambda x: results[x]['mse'])
plot_coupling_matrix(models[best_lambda].J, movies, interaction_subset)

# Print analysis summary
print("\nAnalysis Summary:")
print("----------------")
print(f"Best regularization strength: {best_lambda}")
print(f"Best model MSE: {results[best_lambda]['mse']:.4f}")
print(f"Best model sparsity: {results[best_lambda]['sparsity']:.2%}")
print("\nThe Ising model has been trained to capture the dependencies between movies")
print("based on user ratings. The coupling matrix J shows how strongly different")
print("movies are related to each other, while the external field h represents")
print("the overall popularity of each movie.")

