In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
DATA_PATH = os.path.join("..", "data", "Sales-Data-Analysis.csv")

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print("Dataset loaded successfully ✅")
print(df.head())

Dataset loaded successfully ✅
   Order ID        Date             Product  Price  Quantity Purchase Type  \
0     10452  07-11-2022               Fries   3.49    573.07       Online    
1     10453  07-11-2022           Beverages   2.95    745.76       Online    
2     10454  07-11-2022       Sides & Other   4.99    200.40     In-store    
3     10455  08-11-2022             Burgers  12.99    569.67     In-store    
4     10456  08-11-2022  Chicken Sandwiches   9.95    201.01     In-store    

  Payment Method             Manager    City  
0      Gift Card    Tom      Jackson  London  
1      Gift Card         Pablo Perez  Madrid  
2      Gift Card       Joao    Silva  Lisbon  
3    Credit Card       Walter Muller  Berlin  
4    Credit Card       Walter Muller  Berlin  


In [5]:
df.columns = [c.strip() for c in df.columns]  

for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].astype(str).str.strip()

# Convert Date column if it exists
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day

# Try converting object columns to numeric where possible
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='ignore')

print("Data cleaned successfully ✅")

Data cleaned successfully ✅


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [6]:
if 'Price' in df.columns and 'Quantity' in df.columns:
    df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
    df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')
    df['Revenue'] = df['Price'] * df['Quantity']

# Drop rows with missing values
df = df.dropna()

print("Feature engineering complete ✅")
print(df.head())

Feature engineering complete ✅
   Order ID       Date             Product  Price  Quantity Purchase Type  \
0     10452 2022-07-11               Fries   3.49    573.07        Online   
1     10453 2022-07-11           Beverages   2.95    745.76        Online   
2     10454 2022-07-11       Sides & Other   4.99    200.40      In-store   
3     10455 2022-08-11             Burgers  12.99    569.67      In-store   
4     10456 2022-08-11  Chicken Sandwiches   9.95    201.01      In-store   

  Payment Method           Manager    City    Year  Month   Day    Revenue  
0      Gift Card  Tom      Jackson  London  2022.0    7.0  11.0  2000.0143  
1      Gift Card       Pablo Perez  Madrid  2022.0    7.0  11.0  2199.9920  
2      Gift Card     Joao    Silva  Lisbon  2022.0    7.0  11.0   999.9960  
3    Credit Card     Walter Muller  Berlin  2022.0    8.0  11.0  7400.0133  
4    Credit Card     Walter Muller  Berlin  2022.0    8.0  11.0  2000.0495  


In [7]:
if 'Revenue' not in df.columns:
    raise ValueError("Revenue column not found. Please check feature engineering step.")

X = df[['Price', 'Quantity']]
y = df['Revenue']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Data split into training and testing sets ✅")

Data split into training and testing sets ✅


In [8]:
model = LinearRegression()
model.fit(X_train, y_train)
print("Model trained successfully ✅")


Model trained successfully ✅


In [9]:
MODEL_PATH = os.path.join("..", "model.pkl")
with open(MODEL_PATH, "wb") as file:
    pickle.dump(model, file)

print(f"Model saved to {MODEL_PATH} ✅")

Model saved to ..\model.pkl ✅


In [10]:
with open(MODEL_PATH, "rb") as file:
    loaded_model = pickle.load(file)

print("Loaded model from disk ✅")
sample_input = [[100, 2]]  # Price=100, Quantity=2
print(f"Sample prediction: {loaded_model.predict(sample_input)}")

Loaded model from disk ✅
Sample prediction: [48688.42025254]


