In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Set up logging: records at INFO level and above are appended to
# <log_dir>/project.log with a timestamp and level prefix.
log_dir = '../logs'
os.makedirs(log_dir, exist_ok=True)  # the log directory must exist before the file handler opens the file
# NOTE: logging.basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    # Build the file path from log_dir so the directory is defined in one place only.
    filename=os.path.join(log_dir, 'project.log'),
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [None]:
# Read the three raw CSV files from ../data into DataFrames (train, test, store).
logger.info("Loading datasets...")
train, test, store = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv', '../data/store.csv')
)

In [None]:
# Initial inspection: record the shape of each loaded table.
logger.info("Inspecting datasets...")
# Pass the values as logging arguments (lazy %-style) instead of pre-building
# f-strings: the message is only formatted if the record is actually emitted.
logger.info("Train set shape: %s", train.shape)
logger.info("Test set shape: %s", test.shape)
logger.info("Store set shape: %s", store.shape)

In [None]:
# Record the column names of each table as a plain Python list.
logger.info(
    "Train columns: %s",
    train.columns.to_list(),
)
logger.info(
    "Test columns: %s",
    test.columns.to_list(),
)
logger.info(
    "Store columns: %s",
    store.columns.to_list(),
)

In [None]:
# Attach the store table's columns to every train/test row via a left join on
# 'Store'; rows with no matching store are kept and get NaN in the store columns.
logger.info("Merging train and store datasets...")
# validate='many_to_one' makes pandas raise MergeError if 'Store' is not unique
# in `store`; without it, duplicated store keys would silently multiply rows.
train = train.merge(store, on='Store', how='left', validate='many_to_one')
test = test.merge(store, on='Store', how='left', validate='many_to_one')


In [None]:
# Log the per-column count of missing values, first for train, then for test.
logger.info("Checking for missing values...")
for frame in (train, test):
    logger.info(frame.isna().sum())

In [None]:
# Fill missing values: impute CompetitionDistance with the median of the same frame.
logger.info("Handling missing values...")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection. The chained in-place form operates on an intermediate
# object: pandas 2.x emits a FutureWarning for it, and under Copy-on-Write the
# frame itself is left unchanged.
train['CompetitionDistance'] = train['CompetitionDistance'].fillna(train['CompetitionDistance'].median())
test['CompetitionDistance'] = test['CompetitionDistance'].fillna(test['CompetitionDistance'].median())
# NOTE(review): test is imputed with its own median rather than train's — confirm this is intended.

In [None]:
# Replace every remaining missing value in both frames with 0.
train = train.fillna(value=0)
test = test.fillna(value=0)

In [None]:
# Feature engineering, step 1: convert the 'Date' column of both frames with pd.to_datetime.
logger.info("Extracting date features...")
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])


In [None]:
# Derive calendar features from the parsed 'Date' column.
# Applied to test as well as train so both frames end up with the same feature
# columns (previously only train received them).
# NOTE(review): .dt.dayofweek numbers Monday as 0; if the raw data already has a
# 'DayOfWeek' column it is overwritten here — confirm that is intended.
for frame in (train, test):
    date_parts = frame['Date'].dt
    frame['Year'] = date_parts.year
    frame['Month'] = date_parts.month
    frame['Day'] = date_parts.day
    frame['DayOfWeek'] = date_parts.dayofweek
    frame['WeekOfYear'] = date_parts.isocalendar().week

In [None]:
# Histogram of train['Sales'] (50 bins, KDE overlay) on a 10x6 figure.
logger.info("Visualizing sales and promotions distribution...")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train['Sales'], bins=50, kde=True, ax=ax)
ax.set_title('Sales Distribution')
plt.show()

In [None]:
# Bar chart counting train rows per 'Promo' value on a 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='Promo', ax=ax)
ax.set_title('Promotion Count')
plt.show()

In [None]:
# Correlation matrix of the Sales and Customers columns (DataFrame.corr with
# its default settings), written to the log.
logger.info("Checking correlation between Sales and Customers...")
correlation = train.loc[:, ['Sales', 'Customers']].corr()
logger.info(f"Correlation: \n{correlation}")

In [None]:
# Scatter plot of Sales against Customers for the train frame on a 10x6 figure.
plt.figure(figsize=(10, 6))  # fixed typo: was `lt.figure`, which raised NameError
sns.scatterplot(x='Customers', y='Sales', data=train)
plt.title('Sales vs. Customers')
plt.show()

In [None]:
# Mean Sales for each StateHoliday value in train, shown as a bar chart.
logger.info("Exploring sales around holidays...")
holiday_sales = train.groupby('StateHoliday', as_index=False)['Sales'].mean()
ax = sns.barplot(data=holiday_sales, x='StateHoliday', y='Sales')
ax.set_title('Sales during Holidays')
plt.show()