# Setup

## Download the MovieLens dataset

In [None]:
import requests
import zipfile
import os

# Step 0: Navigate to the project root
## a): Change working directory to "Movie-Recommendation-Engine".
##     Only move up when we are not already there: an unconditional chdir('..')
##     climbs one more level on every re-run of this cell and then fails the
##     validation below.
expected_directory = 'Movie-Recommendation-Engine'
current_directory = os.getcwd()
if not current_directory.endswith(expected_directory):
    parent_directory = os.path.abspath(os.path.join(current_directory, '..'))
    os.chdir(parent_directory)

## b): Validate that we're actually in the correct working directory
current_directory = os.getcwd()
if not current_directory.endswith(expected_directory):
    raise FileNotFoundError(f"You are not in the '{expected_directory}' directory. Please navigate to the correct directory and try again.")

# Step I: Choose the dataset
## 'ml-latest-small' is for discovery and development; set to 'ml-32m' for the
## large version. Every path below is derived from this single switch.
dataset_name = 'ml-latest-small'
url_small = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
url_large = 'https://files.grouplens.org/datasets/movielens/ml-32m.zip'
dataset_url = url_large if dataset_name == 'ml-32m' else url_small

# Step II: Download the ZIP file and save it in the data directory
extract_path = './raw_data/'
## Create the data directory on a fresh checkout instead of failing in open()
os.makedirs(extract_path, exist_ok=True)
zip_file_path = os.path.join(extract_path, f'{dataset_name}.zip')
## stream=True writes the archive chunk by chunk rather than holding it all in
## memory; timeout prevents the cell from hanging forever on a stalled connection.
with requests.get(dataset_url, stream=True, timeout=60) as response:
    ## Fail here on 4xx/5xx rather than saving an error page as a ".zip"
    response.raise_for_status()
    with open(zip_file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)

# Step III: Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Step IV: Verify the files
## a) Print data main directory content
extracted_files = os.listdir(extract_path)
print(f'Data main directory content: {extracted_files}')
## b) Print unzipped dataset directory content
## NOTE(review): assumes the archive unpacks into a folder named after the
## dataset (true for 'ml-latest-small'; confirm for 'ml-32m').
data_folder_path = os.path.join(extract_path, dataset_name)
files_in_data_folder = os.listdir(data_folder_path)
print(f'Unzipped dataset directory content: {files_in_data_folder}')

## Load the dataset

In [2]:
import pandas as pd

In [3]:
# Read the four MovieLens tables from the extracted dataset folder.
# The file names are listed in the same order as the target variables they
# are unpacked into, so each name gets the DataFrame for its own CSV.
csv_names = ('links.csv', 'ratings.csv', 'tags.csv', 'movies.csv')
links_df, ratings_df, tags_df, movies_df = [
    pd.read_csv(os.path.join(data_folder_path, csv_name))
    for csv_name in csv_names
]

### Ratings

In [4]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Summarize the ratings table: row count, column dtypes and non-null counts
ratings_df.info()

### Movies

In [5]:
# Preview the first rows of the movies table; genres are packed into one
# '|'-separated string per movie
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Summarize the movies table: row count, column dtypes and non-null counts
movies_df.info()

### Links

In [6]:
# Preview the first rows of the links table (movieId, imdbId, tmdbId)
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
# Summarize the links table. In the small dataset tmdbId has fewer non-null
# entries than there are rows, which is why pandas loads it as float64.
links_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB
