In [1]:
pip install pandas numpy scikit-learn




Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv('Crawl/Data/Perfume_Data.csv')

In [4]:
print(df.head())


   Unnamed: 0                                               name  \
0           0             Nước Hoa Nữ Diamond Femme 45ml (Trắng)   
1           1  Nước Hoa Vùng Kín Foellie Hương Hoa Hồng Mạnh ...   
2           2         Nước Hoa Nữ Diamond Femme Pink 45ml (Hồng)   
3           3              Nước Hoa Nam Diamond Homme 45ml (Đen)   
4           4       Nước Hoa Nữ Diamond Femme Ruby Red 45ml (Đỏ)   

                         brand   pricenew sale   priceold  
0                Eau De Parfum  301.000 ₫  45%  549.000 ₫  
1  Inner Perfume #Eau De Bijou  144.000 ₫  59%  350.000 ₫  
2                Eau De Parfum  275.000 ₫  50%  549.000 ₫  
3                Eau De Parfum  275.000 ₫  50%  549.000 ₫  
4                Eau De Parfum  301.000 ₫  45%  549.000 ₫  


In [5]:
df = df[['brand', 'pricenew']]

In [6]:
def clean_price(price):
    if isinstance(price, str):
        price = price.replace('₫', '').replace(',', '').replace('.', '').strip()
        return float(price)
    return np.nan

In [7]:
df['pricenew'] = df['pricenew'].apply(clean_price)

ValueError: could not convert string to float: '2250000 VND'

In [None]:
print(df)

                           brand   pricenew
0                  Eau De Parfum   301000.0
1    Inner Perfume #Eau De Bijou   144000.0
2                  Eau De Parfum   275000.0
3                  Eau De Parfum   275000.0
4                  Eau De Parfum   301000.0
..                           ...        ...
258                    Valentino  3270000.0
259                    Valentino  4131000.0
260                    Valentino  4131000.0
261                    Valentino  4131000.0
262                    Valentino  4132000.0

[263 rows x 2 columns]


In [None]:
df = df.dropna()

In [None]:
t = 0
dic = {}
df['brand_encoded'] = -1
for index, row in df.iterrows():
    brand = row['brand']
    if brand in dic:
        df.at[index, 'brand_encoded'] = dic[brand]
    else:
        dic[brand] = t
        t += 1
        df.at[index, 'brand_encoded'] = dic[brand]


In [None]:
print(df)

                           brand   pricenew  brand_encoded
0                  Eau De Parfum   301000.0              0
1    Inner Perfume #Eau De Bijou   144000.0              1
2                  Eau De Parfum   275000.0              0
3                  Eau De Parfum   275000.0              0
4                  Eau De Parfum   301000.0              0
..                           ...        ...            ...
258                    Valentino  3270000.0             88
259                    Valentino  4131000.0             88
260                    Valentino  4131000.0             88
261                    Valentino  4131000.0             88
262                    Valentino  4132000.0             88

[230 rows x 3 columns]


In [None]:
encoder = OneHotEncoder()
brand_encoded = encoder.fit_transform(df[['brand_encoded']]).toarray()


In [None]:
brand_encoded_df = pd.DataFrame(brand_encoded, columns=encoder.get_feature_names_out(['brand_encoded']))


In [None]:
X = brand_encoded_df
y = df['pricenew']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 1.4105022970290151e+19


In [None]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

In [None]:
print(results)

        Actual     Predicted
168  1509000.0  1.321557e+19
234  1957000.0  3.111424e+06
143  2932000.0  1.014272e+06
219  3408000.0  4.273152e+06
172  1930000.0  4.775606e+19
113  1160000.0  1.474560e+06
250  2668000.0  2.228736e+06
92    134000.0  1.172480e+05
254  3410000.0  3.392000e+06
203  2745000.0  3.570688e+06
177    45000.0  1.172480e+05
128  1750000.0  1.474560e+06
78    160000.0  3.327880e+18
8     134000.0  1.423360e+05
152   212000.0  8.893165e+18
167   156000.0  1.418240e+05
118   989000.0  1.474560e+06
5     146000.0  2.914335e+19
48    134000.0  1.172480e+05
192  3628000.0  4.273152e+06
211  2802000.0  3.327880e+18
125  1521000.0  3.327880e+18
241  3657000.0  2.604544e+06
245  2688000.0  3.366400e+06
126   813000.0  1.474560e+06
76    134000.0  1.172480e+05
186    44000.0  9.472000e+04
40    208000.0  2.121944e+19
14    148000.0  1.384806e+19
174   148000.0  3.327894e+18
213  2896000.0  3.392000e+06
162   203000.0 -1.367065e+19
223  4626000.0  4.618240e+05
64    149000.0