In [None]:
import pandas as pd
import os

## MovieLens - 1M 2003

#### DAT to CSV / Parquet

In [None]:
# Column names taken from the MovieLens README.
# The .dat files ship without a header row, so the names are supplied explicitly.
ratings_cols = ["UserID", "MovieID", "Rating", "Timestamp"]
users_cols = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
movies_cols = ["MovieID", "Title", "Genres"]

folder = "../ml-2003-1m"


def read_movielens_dat(directory, name, columns, dtype=None):
    """Load one headerless, '::'-delimited ``.dat`` file into a DataFrame.

    Parameters
    ----------
    directory : str
        Folder containing the ``.dat`` files. Passed explicitly (instead of
        reading a notebook global) so the result does not depend on which
        cells were executed before the call.
    name : str
        File stem, e.g. ``"ratings"`` for ``ratings.dat``.
    columns : list of str
        Column names to assign, since the file has no header row.
    dtype : dict, optional
        Per-column dtype overrides forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(
        f"{directory}/{name}.dat",
        # engine="python" is required because the separator "::" is more than
        # one character.
        sep="::",
        names=columns,
        engine="python",
        encoding="latin-1",
        dtype=dtype,
    )


print("Reading ratings...")
df_ratings = read_movielens_dat(folder, "ratings", ratings_cols)

print("Reading users...")
# ZIP codes are identifiers, not quantities: pin the column to str so values
# are never parsed as integers (which would drop leading zeros).
df_users = read_movielens_dat(folder, "users", users_cols, dtype={"Zip-code": str})

print("Reading movies...")
df_movies = read_movielens_dat(folder, "movies", movies_cols)

In [None]:
# Display df_ratings as the cell's output (visual check of the parsed ratings).
df_ratings

##### to CSV

In [None]:
# Write each ML-1M frame next to its source file as <stem>.csv.
print("Converting to CSV...")
for stem, frame in (
    ("ratings", df_ratings),
    ("users", df_users),
    ("movies", df_movies),
):
    # index=False keeps pandas from emitting its row-number index as a column.
    frame.to_csv(f"{folder}/{stem}.csv", index=False)

##### to PARQUET

In [None]:
# Parquet copies of the same three frames (recommended for speed and compression).
print("Converting to Parquet...")
for stem, frame in (
    ("ratings", df_ratings),
    ("users", df_users),
    ("movies", df_movies),
):
    # index=False: the row index is not written to the file.
    frame.to_parquet(f"{folder}/{stem}.parquet", index=False)

## MovieLens - Latest Small

##### Silver version - CSV to PARQUET

In [None]:
# Tables usually found in ml-latest-small; each one becomes a Parquet file.
# NOTE(review): `folder` is rebound here, so the ML-1M export cells above would
# target this directory if they were re-run after this cell.
files = ["ratings", "movies", "tags", "links"]

folder = "../ml-latest-small"

for name in files:
    csv_file = f"{folder}/{name}.csv"
    parquet_file = f"{folder}/{name}.parquet"

    # Guard clause: report a missing table and move on to the next one.
    if not os.path.exists(csv_file):
        print(f"Warning: {csv_file} not found.")
        continue

    print(f"Converting {csv_file}...")

    # Unlike ML-1M, these CSVs have a header row and use plain commas,
    # so the pandas defaults are sufficient.
    df = pd.read_csv(csv_file)

    # index=False: the row index is not written to the Parquet file.
    df.to_parquet(parquet_file, index=False)
    print(f"Saved {parquet_file}")

print("All done!")