In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Load the raw sales data. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='sales.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'sales.csv'

In [None]:
# Normalize the inconsistent spellings in 'Item_Fat_Content' using a single
# alias -> canonical label mapping. Values that are not keys are left as-is.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
})

In [4]:
# Remove fully duplicated rows.
df = df.drop_duplicates()

# Drop 'Item_Identifier'. Because `df` is reassigned in this cell, the column
# is already gone on a second run; errors='ignore' keeps the cell re-runnable
# instead of raising KeyError.
df = df.drop(columns=['Item_Identifier'], errors='ignore')

# Separate the features (X) from the target (y).
X = df.drop(columns=['Item_Outlet_Sales'])
y = df['Item_Outlet_Sales']

NameError: name 'df' is not defined

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [6]:
# Preprocessing: each column group, selected by dtype, gets its own
# imputation/encoding sub-pipeline.

# Numeric columns: fill missing values with the column mean.
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Object (string) columns: fill missing values with the most frequent label,
# then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each dtype group of X to its transformer.
preprocessor = ColumnTransformer(transformers=[
    (
        'num',
        numeric_transformer,
        X.select_dtypes(include=['float64', 'int64']).columns,
    ),
    (
        'cat',
        categorical_transformer,
        X.select_dtypes(include=['object']).columns,
    ),
])

In [7]:
# Linear regression: chain the preprocessor with the regressor so that
# imputation and encoding are fitted together with the model.
lr = LinearRegression()
model_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lr),
])

# Fit on the training split. As the last expression in the cell, the fitted
# pipeline is also what the cell displays.
model_lr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size',
       'Outlet_Location_

In [1]:
# Score the fitted linear regression pipeline on the held-out test split.
r2_lr = model_lr.score(X_test, y_test)
print(f"R-squared for linear regression: {r2_lr:.3f}")

# RMSE: square root of the mean squared difference between the predictions
# and the true targets.
y_pred_lr = model_lr.predict(X_test)
rmse_lr = np.sqrt(np.square(y_pred_lr - y_test).mean())
print(f"RMSE for linear regression: {rmse_lr:.3f}")

# Hyperparameter grid for the decision tree. The 'regressor__' prefix routes
# each parameter to the pipeline step named 'regressor'.
# 4 x 4 x 4 = 64 candidate combinations.
param_grid = {
    'regressor__max_depth': [5, 10, 15, 20],
    'regressor__min_samples_split': [2, 5, 10, 15],
    'regressor__min_samples_leaf': [1, 2, 5, 10]
}

# Decision tree pipeline that the search tunes.
dtr = DecisionTreeRegressor(random_state=0)
model_dtr = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', dtr)])

# 5-fold cross-validated grid search using all available cores.
# verbose=1 keeps the output short; with 64 candidates x 5 folds, a higher
# verbosity logs every individual fit and floods the cell output.
grid_search_dtr = GridSearchCV(model_dtr, param_grid, cv=5, n_jobs=-1, verbose=1)

# Run the search on the training split only.
grid_search_dtr.fit(X_train, y_train)

# Report the best hyperparameter combination found.
print("Best hyperparameters for decision tree regressor: ", grid_search_dtr.best_params_)

# Evaluate the tuned model on the held-out test split.
r2_dtr_tuned = grid_search_dtr.score(X_test, y_test)
print(f"R-squared for tuned decision tree regression: {r2_dtr_tuned:.3f}")

# RMSE: square root of the mean squared difference between the predictions
# and the true targets.
y_pred_dtr_tuned = grid_search_dtr.predict(X_test)
rmse_dtr_tuned = np.sqrt(((y_pred_dtr_tuned - y_test) ** 2).mean())
print(f"RMSE for tuned decision tree regression: {rmse_dtr_tuned:.3f}")


NameError: name 'model_lr' is not defined

In [10]:
# Decision tree regression with default hyperparameters; random_state=0 keeps
# the fitted tree reproducible.
dtr = DecisionTreeRegressor(random_state=0)
model_dtr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', dtr),
])

# Fit on the training split. As the last expression in the cell, the fitted
# pipeline is also what the cell displays.
model_dtr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer())]),
                                                  Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size',
       'Outlet_Location_

In [11]:
# Score the fitted decision tree pipeline on the held-out test split.
r2_dtr = model_dtr.score(X_test, y_test)
print(f"R-squared for decision tree regression: {r2_dtr:.3f}")

# RMSE: square root of the mean squared difference between the predictions
# and the true targets.
y_pred_dtr = model_dtr.predict(X_test)
rmse_dtr = np.sqrt(np.square(y_pred_dtr - y_test).mean())
print(f"RMSE for decision tree regression: {rmse_dtr:.3f}")

R-squared for decision tree regression: 0.187
RMSE for decision tree regression: 1573.714


In [None]:
# According to the evaluation metrics, the linear regression model outperforms
# the decision tree regression model. The linear regression model has a lower
# RMSE, which means its predictions deviate less from the actual values (in the
# root-mean-square sense), and a higher R-squared, which means it explains more
# of the variance in the target variable.