In [None]:
from google.colab import files
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.colors as colors

# Make "plotly_white" the default template for plotly figures.
pio.templates.default = "plotly_white"

#Upload and Read CSV

In [None]:
# Upload the CSV file via google.colab's files.upload(); the result is kept in
# `uploaded`. The recorded cell output shows the file being saved as
# "Sample - Superstore.csv", which is the name the following cell reads.
uploaded = files.upload()

Saving Sample - Superstore.csv to Sample - Superstore.csv


In [None]:
# Load the uploaded Superstore CSV; the file is decoded as latin-1.
data = pd.read_csv("Sample - Superstore.csv", encoding='latin-1')

# Preview the first five rows. Ending the cell with the expression (instead of
# print) uses the notebook's rich table display, so the wide frame is not
# wrapped and truncated into plain text.
data.head()

   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   

     Customer Name    Segment        Country             City  ...  \
0      Claire Gute   Consumer  United States        Henderson  ...   
1      Claire Gute   Consumer  United States        Henderson  ...   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   

  Postal Code  Region       Product ID         Category Sub-Category  \
0       42420   Sout

#Check Columns

In [None]:
# List the column labels of `data`.
print(data.columns)

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [None]:

# Row ID and Postal Code are identifiers, not quantities: store them as strings
# so they are left out of numeric summaries such as describe().
data['Row ID'] = data['Row ID'].astype(str)

# Postal Code was parsed as an integer column, which drops leading zeros
# (e.g. 05408 is read as 5408); pad back to the 5-digit US ZIP width.
# zfill is a no-op on values that already have 5 digits, so re-running is safe.
data['Postal Code'] = data['Postal Code'].astype(str).str.zfill(5)

# Verify the changes
print(data.dtypes)


Row ID            object
Order ID          object
Order Date        object
Ship Date         object
Ship Mode         object
Customer ID       object
Customer Name     object
Segment           object
Country           object
City              object
State             object
Postal Code       object
Region            object
Product ID        object
Category          object
Sub-Category      object
Product Name      object
Sales            float64
Quantity           int64
Discount         float64
Profit           float64
dtype: object


#Descriptive Statistics

In [None]:
# Summary statistics of the numeric columns. The bare expression (instead of
# print) renders the result with the notebook's rich table display.
data.describe()

              Sales     Quantity     Discount       Profit
count   9994.000000  9994.000000  9994.000000  9994.000000
mean     229.858001     3.789574     0.156203    28.656896
std      623.245101     2.225110     0.206452   234.260108
min        0.444000     1.000000     0.000000 -6599.978000
25%       17.280000     2.000000     0.000000     1.728750
50%       54.490000     3.000000     0.200000     8.666500
75%      209.940000     5.000000     0.200000    29.364000
max    22638.480000    14.000000     0.800000  8399.976000


#Check Missing Values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
print(data.isna().sum())

Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64


#Distribution of Quantity

In [None]:
# Histogram of the 'Quantity' column. plotly.express is already imported as
# `px` in the notebook's import cell, so it is not re-imported here.
fig = px.histogram(data, x='Quantity', nbins=10, title='Distribution of Quantity')

# Add data labels: show each bar's y value outside the bar
fig.update_traces(texttemplate='%{y}', textposition='outside')

# Set labels for axes
fig.update_layout(xaxis_title='Quantity', yaxis_title='Count')

# Show the plot
fig.show()


#Distribution of Sales

In [None]:
# Histogram of the 'Sales' column of `data`, requested with 10 bins
fig = px.histogram(
    data_frame=data,
    x='Sales',
    nbins=10,
    title='Distribution of Sales',
)

# Axis titles
fig.update_layout(xaxis_title_text='Sales', yaxis_title_text='Count')

# Label every bar with its y value, drawn outside the bar
fig.update_traces(textposition='outside', texttemplate='%{y}')

# Render the figure
fig.show()

#Distribution of Discount

In [None]:
# Histogram of the 'Discount' column of `data` (stored as fractions, e.g. 0.2)
fig = px.histogram(data, x='Discount', nbins=10, title='Distribution of Discount')

# Add data labels: show each bar's y value outside the bar
fig.update_traces(texttemplate='%{y}', textposition='outside')

# Axis titles, plus percentage ticks on the x-axis. The format carries an
# explicit precision ('.0%' -> "20%"); a bare '%' leaves the number of decimals
# to the formatter's default and can render ticks such as "20.000000%".
fig.update_layout(
    xaxis_title='Discount',
    yaxis_title='Count',
    xaxis_tickformat='.0%',
)

# Show the plot
fig.show()

#Relationship Sales and Profit

In [None]:
# Scatter of Profit (y) against Sales (x) with an OLS trendline overlaid
fig = px.scatter(
    data_frame=data,
    x="Sales",
    y="Profit",
    title="Sales vs. Profit",
    trendline="ols",
)
fig.show()

#Relationship Sales and Quantity


In [None]:
# Scatter of Quantity (y) against Sales (x) with an OLS trendline overlaid
fig = px.scatter(
    data_frame=data,
    x="Sales",
    y="Quantity",
    title="Sales vs. Quantity",
    trendline="ols",
)
fig.show()

#Relationship Profit and Discount

In [None]:
# Scatter of Discount (y) against Profit (x) with an OLS trendline overlaid
fig = px.scatter(
    data_frame=data,
    x="Profit",
    y="Discount",
    title="Profit vs. Discount",
    trendline="ols",
)
fig.show()

#Relationship Sales and Discount

In [None]:
# Scatter of Discount (y) against Sales (x) with an OLS trendline overlaid
fig = px.scatter(
    data_frame=data,
    x="Sales",
    y="Discount",
    title="Sales vs. Discount",
    trendline="ols",
)
fig.show()

#Correlation Matrix

In [None]:
# Pairwise correlations (DataFrame.corr's default method) between the numeric
# columns only. `data` also holds string columns (IDs, names, dates,
# categories); calling data.corr() on the whole frame raises on those columns
# in pandas >= 2.0, where numeric_only defaults to False.
corr_matrix = data.corr(numeric_only=True)

# Heatmap of the matrix. zmin/zmax pin the colour range to [-1, 1] so the
# diverging RdBu scale is centred on zero correlation.
fig = go.Figure(data=go.Heatmap(z=corr_matrix.values,
                               x=corr_matrix.columns,
                               y=corr_matrix.index,
                               colorscale='RdBu',
                               zmin=-1,
                               zmax=1))

fig.update_layout(title='Correlation Matrix',
                  xaxis_title='Features',
                  yaxis_title='Features')

fig.show()



