In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the retail sales CSV (path is relative to the notebook's working directory)
file_path = "indian_retailsales_data.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows to sanity-check the load
print(data.head(n=5))


In [None]:
# Report how many values are missing in each column before any cleaning
print(data.isnull().sum())

# Fill missing values in NUMERIC columns only.
# A blanket data.fillna(0) would also write the integer 0 into 'Order Date'
# (which is parsed as a date just below; pandas treats bare numbers passed to
# to_datetime as offsets from the Unix epoch) and into text columns, where 0
# would show up as a bogus category in every later groupby.
numeric_cols = data.select_dtypes(include='number').columns
data = data.fillna(dict.fromkeys(numeric_cols, 0))

# Convert 'Order Date' to datetime; missing dates stay missing (NaT)
data['Order Date'] = pd.to_datetime(data['Order Date'])

# Drop exact duplicate rows (re-assignment instead of inplace keeps the cell re-runnable)
data = data.drop_duplicates()

# Confirm the resulting column types
print(data.dtypes)


In [None]:
# Total sales per region, flattened back to a two-column frame for plotting
sales_per_region = data.groupby('Region')['Sales'].sum()
region_sales = sales_per_region.reset_index()
print(region_sales)

# Bar chart: one bar per region
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=region_sales, x='Region', y='Sales', ax=ax)
ax.set_title('Total Sales by Region')
plt.show()


In [None]:
# Tag every order with its calendar month (monthly Period) and store it on the frame
order_months = data['Order Date'].dt.to_period('M')
data['Month'] = order_months

# Sum sales within each month
monthly_totals = data.groupby('Month')['Sales'].sum()
monthly_sales = monthly_totals.reset_index()

# Line chart of the monthly totals; Periods are converted to strings so they can be used as x labels
month_labels = monthly_sales['Month'].astype(str)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(month_labels, monthly_sales['Sales'], marker='o')
ax.set_title('Monthly Sales Trend')
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Pairwise correlation between the four numeric measures
numeric_columns = ['Sales', 'Profit', 'Quantity', 'Discount']
corr_data = data[numeric_columns]
correlation_matrix = corr_data.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Numeric measures that feed the PCA
pca_features = ['Sales', 'Profit', 'Quantity', 'Discount']
numerical_data = data[pca_features]

# Standardize the features before PCA
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_data)

# Project onto the first two principal components (2-D so the result can be plotted)
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Share of total variance captured by each retained component
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Scatter plot of the observations in the reduced space
component_1, component_2 = pca_data[:, 0], pca_data[:, 1]
fig, ax = plt.subplots()
ax.scatter(component_1, component_2, alpha=0.7)
ax.set_title('PCA - Reduced Dimensionality')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data: predict Profit from Sales, Quantity and Discount
feature_cols = ['Sales', 'Quantity', 'Discount']
X = data[feature_cols]  # Features
y = data['Profit']      # Target

# Split into train and test sets BEFORE imputing, so that statistics of the
# held-out rows never influence what the model is trained on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values using means computed on the training split only.
# fillna returns new objects here; calling fillna(inplace=True) on a selection
# of `data` triggers pandas' SettingWithCopy / chained-assignment behaviour
# and may or may not write through to `data`.
feature_means = X_train.mean()
target_mean = y_train.mean()
X_train = X_train.fillna(feature_means)
X_test = X_test.fillna(feature_means)
y_train = y_train.fillna(target_mean)
y_test = y_test.fillna(target_mean)

# Train a Random Forest Regressor (fixed seed for reproducible results)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out split and report error metrics
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


In [None]:
# Recency / Frequency / Monetary summary, one row per 'Customer Segment'.
# The most recent order date in the data serves as the reference "today".
today = data['Order Date'].max()


def days_since_last_order(order_dates):
    """Return the whole days between `today` and the latest date in `order_dates`."""
    return (today - order_dates.max()).days


rfm_spec = {
    'Order Date': days_since_last_order,  # Recency
    'Sales': 'sum',                       # Monetary
    'Customer Segment': 'count',          # Frequency
}
rfm_labels = {
    'Order Date': 'Recency',
    'Sales': 'Monetary',
    'Customer Segment': 'Frequency',
}
rfm = data.groupby('Customer Segment').agg(rfm_spec).rename(columns=rfm_labels)

print(rfm)


In [None]:
# Total sales and total profit per product category, computed from one shared grouping
by_category = data.groupby('Product Category')
product_sales = by_category['Sales'].sum().reset_index()
product_profit = by_category['Profit'].sum().reset_index()

# Join the two summaries side by side on the category name
product_analysis = product_sales.merge(product_profit, on='Product Category')
print(product_analysis)


In [None]:
# Grouped bar chart: Sales and Profit side by side for each product category.
value_columns = ['Sales', 'Profit']
category_totals = data.groupby('Product Category')[value_columns].sum()
category_analysis = category_totals.reset_index()

ax = category_analysis.plot.bar(x='Product Category', figsize=(10, 6))
ax.set_title('Sales and Profit by Product Category')
ax.set_ylabel('Amount (INR)')
plt.show()


In [None]:

# Top 10 cities ranked by total sales, drawn as horizontal bars.
city_totals = data.groupby('City')['Sales'].sum()
top_cities = city_totals.nlargest(10).reset_index()

ax = sns.barplot(data=top_cities, x='Sales', y='City', palette='viridis')
ax.set_title('Top 10 Cities by Sales')
plt.show()


In [None]:
# Share of total sales contributed by each region, as a pie chart.
# Note: `region_sales` is a Series indexed by region here (no reset_index).
region_sales = data.groupby('Region')['Sales'].sum()

ax = region_sales.plot.pie(autopct='%1.1f%%', figsize=(8, 8), startangle=90)
ax.set_title('Sales Contribution by Region')
ax.set_ylabel('')  # Remove default ylabel
plt.show()
