In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error

Phân tích tổng quan

In [2]:
file_path = "fashion_trend_data.csv"
df = pd.read_csv(file_path)

In [5]:
df_info = df.info
df_head = df.head

df_types = df.dtypes
df_names = df.columns.tolist()

df_null = df.isnull().sum()
df_duplicate = df.duplicated().sum()

In [6]:
df_info, df_head

(<bound method DataFrame.info of                                    User_ID  Age  Gender              Location  \
 0     0e9a893c-69a7-466b-90e2-a9e81734f5eb   26  Female               Eritrea   
 1     3d7e7bbf-692b-4eed-950e-95178ff22d12   49    Male             Venezuela   
 2     8f63e022-0599-424c-94d8-82c3e17ebdc2   50  Female  Syrian Arab Republic   
 3     d77cf66b-a57c-492a-ba4c-c8334c1467a8   49  Female              Paraguay   
 4     cc6106d6-d474-4bdb-9fb9-b2118efb230a   27    Male                 Italy   
 ...                                    ...  ...     ...                   ...   
 2995  e4a37dec-e35a-4e00-b7af-5b50de4917bf   30  Female                Sweden   
 2996  e5fcd895-6466-47d6-afda-ebb180849aa1   36    Male                  Cuba   
 2997  76eb1a44-ceb4-4179-bec7-cd25d40ed7d5   45  Female          Cook Islands   
 2998  7a4a18c0-9c85-479f-b678-aa8aeaf8ab88   42  Female               Ukraine   
 2999  b2f175e2-2019-4902-8a7d-0ce3655da92f   18  Female          

In [None]:
df_types, df_names

(User_ID               object
 Age                    int64
 Gender                object
 Location              object
 Fashion_Item          object
 Brand                 object
 Price                float64
 Rating                 int64
 Review_Text           object
 Sentiment             object
 Purchase_Decision     object
 Trendy_Score           int64
 Review_Date           object
 dtype: object,
 ['User_ID',
  'Age',
  'Gender',
  'Location',
  'Fashion_Item',
  'Brand',
  'Price',
  'Rating',
  'Review_Text',
  'Sentiment',
  'Purchase_Decision',
  'Trendy_Score',
  'Review_Date'])

In [7]:
df_null, df_duplicate

(User_ID                0
 Age                    0
 Gender                 0
 Location               0
 Fashion_Item           0
 Brand                  0
 Price                276
 Rating                 0
 Review_Text            0
 Sentiment              0
 Purchase_Decision      0
 Trendy_Score           0
 Review_Date          306
 dtype: int64,
 np.int64(0))

Xử lý, mã hóa dữ liệu phân loại

In [8]:
data = df.copy()
data

Unnamed: 0,User_ID,Age,Gender,Location,Fashion_Item,Brand,Price,Rating,Review_Text,Sentiment,Purchase_Decision,Trendy_Score,Review_Date
0,0e9a893c-69a7-466b-90e2-a9e81734f5eb,26,Female,Eritrea,Jeans,Uniqlo,143.0,3,Care central fish tend top term so address ear...,Positive,Yes,93,2020-08-21
1,3d7e7bbf-692b-4eed-950e-95178ff22d12,49,Male,Venezuela,T-shirt,Zara,414.0,2,Common seat want only she hair purpose option ...,Neutral,Yes,89,2023-11-14
2,8f63e022-0599-424c-94d8-82c3e17ebdc2,50,Female,Syrian Arab Republic,Jacket,Nike,250.0,3,Total pay ever guess if culture message better...,Positive,No,75,2023-10-31
3,d77cf66b-a57c-492a-ba4c-c8334c1467a8,49,Female,Paraguay,Shoes,H&M,172.0,3,System certainly person data their turn by be ...,Positive,Yes,12,
4,cc6106d6-d474-4bdb-9fb9-b2118efb230a,27,Male,Italy,Dress,H&M,256.0,4,Hold instead down candidate analysis too effec...,Negative,Yes,37,2022-02-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,e4a37dec-e35a-4e00-b7af-5b50de4917bf,30,Female,Sweden,Bag,Adidas,205.0,4,Design discussion writer possible expert free ...,Positive,Yes,88,2023-07-08
2996,e5fcd895-6466-47d6-afda-ebb180849aa1,36,Male,Cuba,Dress,H&M,259.0,1,Letter break again season interest front herse...,Positive,Yes,74,2022-06-04
2997,76eb1a44-ceb4-4179-bec7-cd25d40ed7d5,45,Female,Cook Islands,Jacket,Nike,60.0,4,Attack lose marriage teach study major challen...,Neutral,Yes,76,2024-08-09
2998,7a4a18c0-9c85-479f-b678-aa8aeaf8ab88,42,Female,Ukraine,Dress,Uniqlo,395.0,3,Company find black herself attack bank perform...,Positive,Yes,20,2020-08-17


In [9]:
brand_price = data.groupby("Brand")["Price"].mean()
brand_price

for index, row in df.iterrows():
    if pd.isnull(row["Price"]):
        data.at[index, "Price"] = brand_price[row["Brand"]]

data.isnull().sum()["Price"]

np.int64(0)

In [10]:
le = LabelEncoder()
data["Gender"] = le.fit_transform(data["Gender"])
data["Sentiment"] = le.fit_transform(data["Sentiment"])
data["Purchase_Decision"] = le.fit_transform(data["Purchase_Decision"])  # Yes=1, No=0

Biến đổi cột Brand và Fashion_Item bằng one-hot encoding

In [11]:
data = pd.get_dummies(data, columns = ["Brand", "Fashion_Item"], drop_first = True)

Chuẩn hóa cột số

In [12]:
scaler = StandardScaler()
data[["Price", "Rating", "Trendy_Score", "Age"]] = scaler.fit_transform(data[["Price", "Rating", "Trendy_Score", "Age"]])

Chia dữ liệu cho mô hình phân loại (dự đoán quyết định mua hàng)

In [13]:
X_class = data.drop(columns = ["Purchase_Decision", "Review_Text", "Location", "Review_Date", "User_ID"])
y_class = data["Purchase_Decision"]
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size = 0.2, random_state = 42)

Chia dữ liệu cho mô hình hồi quy (dự đoán điểm xu hướng)

In [16]:
X_reg = data.drop(columns = ["Trendy_Score", "Review_Text", "Location", "Review_Date", "User_ID"])
y_reg = data["Trendy_Score"]
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_reg, y_reg, test_size = 0.2, random_state = 42)

Huấn luyện mô hình phân loại

In [14]:
clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
clf.fit(X_train_c, y_train_c)
y_pred_c = clf.predict(X_test_c)
acc = accuracy_score(y_test_c, y_pred_c)

print(f"Accuracy (Purchase Decision): {acc:.2%}")

Accuracy (Purchase Decision): 65.00%


Huấn luyện mô hình hồi quy

In [17]:
reg = RandomForestRegressor(n_estimators = 100, random_state = 42)
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)
mae = mean_absolute_error(y_test_r, y_pred_r)

print(f"Mean Absolute Error (Trendy Score): {mae:.4f}")

Mean Absolute Error (Trendy Score): 0.8636
