In [1]:
import pandas as pd
import os
import urllib.request
import zipfile

# 1. Create the directory that holds the dataset
data_dir = "movielens"
os.makedirs(data_dir, exist_ok=True)

# 2. Download the latest-small zip archive (only when a valid copy is missing)
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
zip_path = os.path.join(data_dir, "ml-latest-small.zip")
# zipfile.is_zipfile() returns False both when the file does not exist and
# when it is not a valid ZIP (e.g. a truncated earlier download), so either
# case triggers a fresh download while a good cached copy is reused.
# NOTE: delete the cached archive to pick up a newer upstream release.
if not zipfile.is_zipfile(zip_path):
    # Download to a temporary name first so an interrupted transfer never
    # leaves a partial file at zip_path.
    partial_zip_path = zip_path + ".part"
    urllib.request.urlretrieve(url, partial_zip_path)
    os.replace(partial_zip_path, zip_path)

# 3. Extract the archive into data_dir
with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(data_dir)

# 4. Load the movies and ratings tables from the extracted archive folder
dataset_dir = os.path.join(data_dir, "ml-latest-small")
movies = pd.read_csv(os.path.join(dataset_dir, "movies.csv"))
ratings = pd.read_csv(os.path.join(dataset_dir, "ratings.csv"))

# 5. Inspect the DataFrames
print(movies.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after each summary.
movies.info()
print(ratings.head())
ratings.info()

# 6. Check for missing values (number of nulls per column)
print(movies.isnull().sum())
print(ratings.isnull().sum())

# 7. Handle missing data if needed
# (the small dataset is expected to have few or no missing values in
# movies.csv and ratings.csv; this is shown as an example.)
# Drop every row that contains at least one missing value.
movies, ratings = movies.dropna(), ratings.dropna()

# 8. Join ratings with movie metadata on movieId.
# A left join keeps every rating row, even when no matching movie row exists.
movies_ratings = ratings.merge(movies, on="movieId", how="left")
print(movies_ratings.head())

# 9. Filtering example: keep only rating rows with a score of 4.5 or higher
is_high_rating = movies_ratings["rating"] >= 4.5
high_rated = movies_ratings.loc[is_high_rating]
print(high_rated.head())

# 10. Summary statistic: mean rating per movie title, highest first.
# No minimum rating count is applied, so titles with very few ratings can
# rank at the top.
# NOTE(review): grouping is by title string, so distinct movieIds that share
# a title would be averaged together -- confirm whether that is intended.
mean_rating_by_title = movies_ratings.groupby("title")["rating"].mean()
avg_rating_per_movie = mean_rating_by_title.sort_values(ascending=False)
print(avg_rating_per_movie.head(10))

# 11. Parse timestamps: interpret the raw values as seconds since the Unix
# epoch and replace the column with datetime values.
parsed_timestamps = pd.to_datetime(movies_ratings["timestamp"], unit="s")
movies_ratings["timestamp"] = parsed_timestamps
preview_columns = ["userId", "title", "rating", "timestamp"]
print(movies_ratings[preview_columns].head())

# 12. Rating statistics over time: mean rating per calendar year
# (the year is taken from the parsed timestamp column).
rating_years = movies_ratings["timestamp"].dt.year
movies_ratings["year"] = rating_years
ratings_per_year = movies_ratings.groupby("year")["rating"].mean()
print(ratings_per_year)



   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
   userId  movieId  rating  timestamp
0       1        1 