# 04_regression_analysis.ipynb

## **Objective:**
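Train a linear regression model that predicts the log-transformed purchase amount (`purchase_amount_log`) from transaction order, evaluate it on a held-out test split, and save the actual vs. predicted values for later analysis.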

---

## **1️⃣ Import Necessary Libraries**

In [12]:
import pandas as pd
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## **2️⃣ Set Up Logging**

In [13]:
# Configure logging for the whole notebook: show INFO and above, and prefix
# every message with a timestamp and its severity level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## **3️⃣ Define File Paths**

In [None]:
# DATA_PATH is the input CSV passed to load_data(); RESULTS_PATH is the CSV that
# train_regression() writes. Both are relative paths, so they resolve against
# the kernel's current working directory.
DATA_PATH = "data/customer_data_clean.csv"
RESULTS_PATH = "data/regression_results.csv"

## **4️⃣ Load Dataset**

In [15]:
def load_data(file_path):
    """Read a CSV file into a DataFrame without raising.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The parsed DataFrame, or None when the path does not exist or
        the file cannot be read (the reason is logged as an error).
    """
    if os.path.exists(file_path):
        try:
            frame = pd.read_csv(file_path)
        except Exception as exc:
            # Any read/parse failure is reported and turned into a None result.
            logging.error(f"Error loading data: {exc}")
            return None
        else:
            logging.info("Data loaded successfully.")
            return frame

    logging.error(f"File not found: {file_path}")
    return None

# Load data (df is None if the file is missing or cannot be read; the reason is logged)
df = load_data(DATA_PATH)

2025-02-24 23:26:38,884 - INFO - Data loaded successfully.


## **5️⃣ Train Regression Model**

In [16]:
def train_regression(df, results_path=None, test_size=0.2, random_state=42):
    """Fit a linear regression of log purchase amount on transaction order.

    A 1-based ``transaction_id`` feature is derived from row order, the rows
    are split into train and test subsets, a ``LinearRegression`` model is
    fitted on the training subset, the test-set mean squared error is logged,
    and the test-set actual vs. predicted values are written to a CSV file.

    The input frame is never modified: the feature is built in a separate
    frame, so the caller's ``df`` keeps exactly its original columns (and an
    existing ``transaction_id`` column, if any, is not overwritten).

    Args:
        df: DataFrame with a ``purchase_amount_log`` column, or None.
        results_path: Destination CSV for the ``Actual``/``Predicted`` table.
            Defaults to the module-level ``RESULTS_PATH``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 42).

    Returns:
        None. Problems are logged as errors instead of being raised, so the
        notebook keeps running.
    """
    if df is None:
        logging.error("No data available for training.")
        return

    target_col = 'purchase_amount_log'

    try:
        # Fail with a readable message instead of a bare KeyError repr.
        if target_col not in df.columns:
            logging.error(
                f"Error during model training: required column '{target_col}' not found."
            )
            return

        # Build the transaction-order feature in its own frame (aligned to
        # df's index) rather than adding a column to the caller's DataFrame.
        X = pd.DataFrame({'transaction_id': range(1, len(df) + 1)}, index=df.index)
        y = df[target_col]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Train model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Make predictions
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        logging.info(f"Model trained successfully. Mean Squared Error: {mse}")

        # Save results
        if results_path is None:
            results_path = RESULTS_PATH
        output_dir = os.path.dirname(results_path)
        if output_dir:
            # to_csv does not create missing parent directories.
            os.makedirs(output_dir, exist_ok=True)
        results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
        results.to_csv(results_path, index=False)
        logging.info(f"Regression results saved: {results_path}")

    except Exception as e:
        logging.error(f"Error during model training: {e}")

# Train model: logs the test-set mean squared error and saves actual vs. predicted values to RESULTS_PATH
train_regression(df)

2025-02-24 23:26:38,925 - INFO - Model trained successfully. Mean Squared Error: 0.023012271112997723
2025-02-24 23:26:38,975 - INFO - Regression results saved: C:\Users\mahin\CustomerPurchaseAnalysis\customer_purchase_analysis\data\regression_results.csv


## **6️⃣ Summary & Next Steps**
✅ Regression model trained and evaluated on a 20% hold-out split using mean squared error.  
✅ Results saved for further analysis.  
➡️ Next, proceed to `05_visualization_export.ipynb` to visualize insights.