In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the cleaned product export and keep only the rows whose `type` is 'laptop'.
# NOTE(review): this is an absolute, machine-specific path — consider reading it
# from a configurable data directory so the notebook runs elsewhere.
DATA_PATH = r'C:\Users\ADMIN\Downloads\laptop_da\cleaned_asin_added.csv'

df = pd.read_csv(DATA_PATH)

# .copy() makes df_laptop an independent frame. Later cells assign columns on
# df_laptop; without the copy those writes target a slice of `df` and emit
# SettingWithCopyWarning.
df_laptop = df.loc[df['type'] == 'laptop'].copy()

# Column dtypes and non-null counts of the laptop subset.
df_laptop.info()

In [None]:
# Pairwise correlation of the float64 / int64 columns, drawn as an annotated heatmap
# with the colour scale pinned to the full [-1, 1] range.
numeric_cols = df_laptop.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = df_laptop.loc[:, numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Keep only the coefficients whose magnitude lies strictly between the threshold
# and 1 (which excludes the diagonal); every other cell becomes NaN.
threshold = 0.5
abs_corr = corr_matrix.abs()
is_high = abs_corr.gt(threshold) & abs_corr.lt(1)
high_corr_vars = corr_matrix.where(is_high)
print(high_corr_vars)

In [None]:
# Standardise the numeric columns into a SEPARATE frame.
#
# df_laptop itself is left in its original units on purpose: numeric_cols includes
# the target `price` and every model feature, so overwriting them in place would
# (a) make all later error metrics and brand price averages z-scores instead of
#     prices,
# (b) feed the modelling pipelines inputs that were already scaled using the
#     full data set, test rows included, and
# (c) make predictions for hand-written rows in raw units (e.g. ram=16) meaningless.
# The modelling cells scale inside their own Pipeline, fitted on the training split.
scaler = StandardScaler()
df_laptop_scaled = df_laptop.copy()
df_laptop_scaled[numeric_cols] = scaler.fit_transform(df_laptop[numeric_cols])

# Pearson correlation is invariant under per-column standardisation, so this
# heatmap is expected to match the unscaled one; it is kept as a sanity check.
corr_matrix = df_laptop_scaled[numeric_cols].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Print column dtypes and non-null counts for the laptop frame.
df_laptop.info()

In [None]:
# Baseline model: median-impute and standardise six numeric predictors, fit a
# 100-tree random forest on an 80/20 split, and report hold-out error metrics.
target = 'price'
features = [
    'lmsales_converted',
    'reviews',
    'rating',
    'wait_days',
    'ram',
    'storage_capacity',
]
# Every predictor is numeric, so the numeric branch handles all of them.
numeric_features = list(features)

X, y = df_laptop[features], df_laptop[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (('MAE', mae), ('MSE', mse), ('RMSE', rmse), ('R2 Score', r2)):
    print(f'{label}: {value}')

In [None]:
# Score two hand-written laptop configurations with the fitted `model` pipeline
# and show the inputs next to the predicted price.
new_data = dict(
    lmsales_converted=[200, 150],
    reviews=[50, 30],
    rating=[4.5, 4.0],
    wait_days=[7, 10],
    ram=[16, 8],
    storage_capacity=[512, 256],
)
df_new_data = pd.DataFrame(new_data)

y_real_pred = model.predict(df_new_data)
df_new_data = df_new_data.assign(predicted_price=y_real_pred)

print(df_new_data)

In [None]:
# Compare three regressors on the same preprocessing and the same 80/20 split:
# 5-fold cross-validated R2 on the training split, then error metrics on the
# held-out test split.
features = ['lmsales_converted', 'reviews', 'rating', 'wait_days', 'ram', 'storage_capacity']
target = 'price'

X = df_laptop[features]
y = df_laptop[target]

# Every predictor is numeric, so the numeric branch handles all of them.
numeric_features = list(features)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# The forest and the tree are stochastic; a fixed random_state makes the printed
# scores reproducible across kernel restarts.
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42)
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The loop variable is deliberately not called `model`, so a fitted pipeline
# bound to that name by another cell is not overwritten by a bare estimator.
for model_name, estimator in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', estimator)
    ])

    # cross_val_score fits clones, so `pipeline` is still unfitted afterwards.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    print(f"Model: {model_name}")
    print(f"Cross-Validation R2 Scores: {cv_scores}")
    print(f"Mean CV R2 Score: {cv_scores.mean()}")

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")
    print("\n")

# Tune the random forest with an exhaustive 5-fold grid search on the training
# split, then score the refitted best pipeline on the held-out test split.
#
# max_features: 1.0 means "use every feature at each split". It replaces the
# string 'auto', which meant the same thing for regressors but was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every 'auto' candidate fails to fit.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_features': [1.0, 'sqrt', 'log2'],
    'regressor__max_depth': [10, 20, 30, None]
}

grid_search = GridSearchCV(estimator=Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
]), param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)

grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation R2 score: {grid_search.best_score_}")

# best_estimator_ is the winning pipeline refitted on the whole training split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Final Model MAE: {mae}")
print(f"Final Model MSE: {mse}")
print(f"Final Model RMSE: {rmse}")
print(f"Final Model R2 Score: {r2}")

In [None]:
# Fresh, unfitted candidate regressors keyed by display name.
# The forest and the tree are stochastic; a fixed random_state keeps their
# results reproducible across kernel restarts.
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42)
}

In [None]:
# Fit every candidate in `models` on the training split, report hold-out metrics,
# and append each model's price prediction for two hand-written laptops.
new_data = {
    'lmsales_converted': [200, 150],
    'reviews': [50, 30],
    'rating': [4.5, 4.0],
    'wait_days': [7, 10],
    'ram': [16, 8],
    'storage_capacity': [512, 256]
}

df_new_data = pd.DataFrame(new_data)

# df_new_data gains one prediction column per model inside the loop. Predicting
# on this fixed list of input columns keeps earlier models' predictions from
# being passed back in as if they were features.
feature_columns = list(new_data)

# The loop variable is deliberately not called `model`, so a fitted pipeline
# bound to that name by another cell is not overwritten by a bare estimator.
for model_name, estimator in models.items():
    print(f"Testing Model: {model_name}")

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', estimator)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2 Score: {r2}")

    y_real_pred = pipeline.predict(df_new_data[feature_columns])

    df_new_data[f'predicted_price_{model_name}'] = y_real_pred

    print(df_new_data)
    print("\n")

In [None]:
# Correlation between last-month sales and review count, followed by a scatter
# plot of the two columns.
pair_corr = df_laptop[['lmsales_converted', 'reviews']].corr()
corr_lmsales_reviews = pair_corr.iloc[0, 1]  # off-diagonal entry of the 2x2 matrix
print(f"Hệ số tương quan giữa 'lmsales_converted' và 'reviews': {corr_lmsales_reviews}")

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_laptop, x='lmsales_converted', y='reviews', ax=ax)
ax.set_title('Biểu đồ phân tán giữa lmsales_converted và reviews')
ax.set_xlabel('lmsales_converted')
ax.set_ylabel('reviews')
plt.show()

In [None]:
# Mean price per brand as a two-column table (brand, avg_price), then a bar chart.
avg_price_by_brand = (
    df_laptop.groupby('brand')['price']
    .mean()
    .rename('avg_price')
    .reset_index()
)

print(avg_price_by_brand)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=avg_price_by_brand, x='brand', y='avg_price', palette='viridis', ax=ax)
ax.set_title('Giá tiền trung bình theo thương hiệu')
ax.set_xlabel('Thương hiệu')
ax.set_ylabel('Giá tiền trung bình')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Mean price per (brand, cpu_brand) pair, drawn as horizontal bars grouped by
# brand and coloured by CPU brand.
avg_price_by_brand_cpu = (
    df_laptop.groupby(['brand', 'cpu_brand'])['price']
    .mean()
    .rename('avg_price')
    .reset_index()
)

print(avg_price_by_brand_cpu)

fig, ax = plt.subplots(figsize=(14, 10))
sns.barplot(
    data=avg_price_by_brand_cpu,
    x='avg_price',
    y='brand',
    hue='cpu_brand',
    palette='viridis',
    ax=ax,
)
ax.set_title('Giá tiền trung bình theo thương hiệu và loại CPU')
ax.set_xlabel('Giá tiền trung bình')
ax.set_ylabel('Thương hiệu')
ax.legend(title='CPU Brand')
plt.show()

In [None]:
# Number of laptop rows per brand: printed as a table, then drawn as a bar chart.
brand_counts = df_laptop['brand'].value_counts()
print(brand_counts)

fig, ax = plt.subplots(figsize=(10, 6))
brand_counts.plot.bar(ax=ax)
ax.set_title('Số lượng laptop theo nhãn hiệu')
ax.set_xlabel('Nhãn hiệu')
ax.set_ylabel('Số lượng')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of laptop rows per CPU brand: printed as a table, then drawn as a bar chart.
cpu_counts = df_laptop['cpu_brand'].value_counts()
# A bare expression is only rendered when it is the last statement of a cell,
# so the counts are printed explicitly.
print(cpu_counts)

plt.figure(figsize=(10, 6))
cpu_counts.plot(kind='bar')
plt.title('Số lượng laptop theo nhãn hiệu CPU')
plt.xlabel('Nhãn hiệu CPU')
plt.ylabel('Số lượng')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of laptop rows per RAM size: printed as a table, then drawn as a bar chart.
ram_counts = df_laptop['ram'].value_counts()
# A bare expression is only rendered when it is the last statement of a cell,
# so the counts are printed explicitly.
print(ram_counts)

plt.figure(figsize=(10, 6))
ram_counts.plot(kind='bar')
# The title names RAM capacity; it previously repeated the brand chart's title.
plt.title('Số lượng laptop theo dung lượng RAM')
plt.xlabel('Dung lượng RAM')
plt.ylabel('Số lượng')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of laptop rows per storage capacity: printed as a table, then drawn as
# a bar chart.
capacity_counts = df_laptop['storage_capacity'].value_counts()
# A bare expression is only rendered when it is the last statement of a cell,
# so the counts are printed explicitly.
print(capacity_counts)

plt.figure(figsize=(10, 6))
capacity_counts.plot(kind='bar')
plt.title('Số lượng laptop theo dung lượng bộ nhớ')
plt.xlabel('Dung lượng')
plt.ylabel('Số lượng')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of laptop rows per storage type: printed as a table, then drawn as a
# bar chart.
# The result has its own name so it does not overwrite `capacity_counts`, which
# holds the storage-capacity counts.
storage_type_counts = df_laptop['storage_type'].value_counts()
# A bare expression is only rendered when it is the last statement of a cell,
# so the counts are printed explicitly.
print(storage_type_counts)

plt.figure(figsize=(10, 6))
storage_type_counts.plot(kind='bar')
plt.title('Số lượng laptop theo kiểu bộ nhớ')
plt.xlabel('Loại')
plt.ylabel('Số lượng')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Per-brand means of rating, last-month sales and review count, drawn as one
# group of three side-by-side bars per brand.
metric_cols = ['rating', 'lmsales_converted', 'reviews']

# Coerce the metric columns to numeric; values that cannot be parsed become NaN.
# assign() builds a new frame and rebinds df_laptop, so the conversion never
# writes into a slice of another frame (no SettingWithCopyWarning).
df_laptop = df_laptop.assign(
    **{col: pd.to_numeric(df_laptop[col], errors='coerce') for col in metric_cols}
)

print(df_laptop.dtypes)

grouped_df = df_laptop.groupby('brand')[metric_cols].mean()

print(grouped_df.head())

brands = grouped_df.index
avg_values = grouped_df.values.T  # one row per metric, one column per brand

fig, ax = plt.subplots(figsize=(15, 8))

bar_width = 0.2
index = np.arange(len(brands))

# Bar i of each group is shifted right by i bar widths from the group's base position.
for i, (metric, color) in enumerate(zip(metric_cols, ['blue', 'green', 'red'])):
    ax.bar(index + i * bar_width, avg_values[i], bar_width, label=metric, color=color)

ax.set_title('Trung bình các chỉ số theo Brand')
ax.set_xlabel('Brand')
ax.set_ylabel('Giá trị trung bình')
# Bars are centred at offsets 0, 1, ..., n-1 bar widths, so the middle of a group
# is at (n - 1) / 2 bar widths. The previous 1.5 is the centre of a four-bar
# group and put the labels off-centre for three bars.
group_centre = bar_width * (len(metric_cols) - 1) / 2
plt.xticks(index + group_centre, brands, rotation=45)
ax.legend()

plt.tight_layout()
plt.show()