In [9]:
#Import Libraries
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [10]:
#Load Dataset
df = pd.read_csv("SampleSuperstore.csv")
df.head()


Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.96,2,0.0,41.9136
1,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Chairs,731.94,3,0.0,219.582
2,Second Class,Corporate,United States,Los Angeles,California,90036,West,Office Supplies,Labels,14.62,2,0.0,6.8714
3,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Furniture,Tables,957.5775,5,0.45,-383.031
4,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Office Supplies,Storage,22.368,2,0.2,2.5164


In [11]:
#Basic Descriptive Analysis (EDA)
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])

# Column names
df.columns

Rows: 9994
Columns: 13


Index(['Ship Mode', 'Segment', 'Country', 'City', 'State', 'Postal Code',
       'Region', 'Category', 'Sub-Category', 'Sales', 'Quantity', 'Discount',
       'Profit'],
      dtype='object')

In [12]:
df.describe()

Unnamed: 0,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994.0,9994.0,9994.0,9994.0
mean,55190.379428,229.858001,3.789574,0.156203,28.656896
std,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1040.0,0.444,1.0,0.0,-6599.978
25%,23223.0,17.28,2.0,0.0,1.72875
50%,56430.5,54.49,3.0,0.2,8.6665
75%,90008.0,209.94,5.0,0.2,29.364
max,99301.0,22638.48,14.0,0.8,8399.976


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Ship Mode     9994 non-null   object 
 1   Segment       9994 non-null   object 
 2   Country       9994 non-null   object 
 3   City          9994 non-null   object 
 4   State         9994 non-null   object 
 5   Postal Code   9994 non-null   int64  
 6   Region        9994 non-null   object 
 7   Category      9994 non-null   object 
 8   Sub-Category  9994 non-null   object 
 9   Sales         9994 non-null   float64
 10  Quantity      9994 non-null   int64  
 11  Discount      9994 non-null   float64
 12  Profit        9994 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1015.1+ KB


In [14]:
# missing values
df.isnull().sum()


Ship Mode       0
Segment         0
Country         0
City            0
State           0
Postal Code     0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64

In [16]:
#Category-wise Sales
cat_sales = df.groupby("Category")["Sales"].sum().reset_index()
cat_sales

Unnamed: 0,Category,Sales
0,Furniture,741999.7953
1,Office Supplies,719047.032
2,Technology,836154.033


In [17]:
#Plotly Chart 1: Category-wise Sales
fig = px.bar(cat_sales, 
             x="Category", 
             y="Sales", 
             title="Category-wise Sales",
             text_auto=".2s")
fig.show()


In [None]:
#Region-wise Profit Heatmap
region_profit = df.groupby("Region")["Profit"].sum().reset_index()

fig = px.density_heatmap(
    region_profit,
    x="Region",
    y="Profit",
    title="Region vs Profit Heatmap"
)
fig.show()


In [19]:
#Discount vs Profit Scatter Plot
fig = px.scatter(
    df, 
    x="Discount", 
    y="Profit",
    color="Category",
    title="Discount vs Profit (Colored by Category)"
)
fig.show()


In [20]:
#Predictive Analysis (Linear Regression)
# Predict Sales based on:
# Quantity and Discount
X = df[["Quantity", "Discount"]]
y = df["Sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [22]:
#Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
# Coefficients
print("Coefficients:", model.coef_)


Mean Absolute Error (MAE): 274.98422630547105
Mean Squared Error (MSE): 569282.568307617
RMSE: 754.5081631815635
Coefficients: [  52.74024386 -128.73447597]


In [24]:
#Show Prediction for Example Input
sample_input = pd.DataFrame({
    "Quantity": [5],
    "Discount": [0.1]
})

prediction = model.predict(sample_input)[0]
print("Predicted Sales:", prediction)



Predicted Sales: 297.7757920223821
