In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv("sales_data.csv")

# Then preview the first 5 rows
print("First 5 rows:")
display(df.head())

# check basic info about the dataset
print("\nDataset Info:")
df.info()

# summarize key statistics
print("\nSummary Statistics:")
display(df.describe())

First 5 rows:


Unnamed: 0,OrderID,Date,Product,Category,Quantity,UnitPrice,TotalSales
0,1000,2023-01-01,Headphones,Accessories,1,150,150
1,1001,2023-01-01,Mouse,Accessories,3,50,150
2,1002,2023-01-01,Monitor,Electronics,1,300,300
3,1003,2023-01-01,Mouse,Accessories,2,50,100
4,1004,2023-01-01,Tablet,Electronics,3,500,1500



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3481 entries, 0 to 3480
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   OrderID     3481 non-null   int64 
 1   Date        3481 non-null   object
 2   Product     3481 non-null   object
 3   Category    3481 non-null   object
 4   Quantity    3481 non-null   int64 
 5   UnitPrice   3481 non-null   int64 
 6   TotalSales  3481 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 190.5+ KB

Summary Statistics:


Unnamed: 0,OrderID,Quantity,UnitPrice,TotalSales
count,3481.0,3481.0,3481.0,3481.0
mean,2740.0,2.485205,445.101982,1106.004022
std,1005.022471,1.133792,395.928992,1199.33285
min,1000.0,1.0,50.0,50.0
25%,1870.0,1.0,100.0,200.0
50%,2740.0,2.0,300.0,600.0
75%,3610.0,4.0,800.0,1500.0
max,4480.0,4.0,1200.0,4800.0


## Now we clean the data; check for missing/duplicate values, drop any columns, make any changes necessary

In [None]:
# check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# drop any duplicate rows, if any
df = df.drop_duplicates()

# correct data types
df['Date'] = pd.to_datetime(df['Date'])

# double check the data types
print("\nData types after cleaning:")
print(df.dtypes)

# check table
print(df.sample(20))
display(df.sample(20))

Missing values per column:
OrderID       0
Date          0
Product       0
Category      0
Quantity      0
UnitPrice     0
TotalSales    0
dtype: int64

Data types after cleaning:
OrderID                int64
Date          datetime64[ns]
Product               object
Category              object
Quantity               int64
UnitPrice              int64
TotalSales             int64
dtype: object
      OrderID       Date     Product     Category  Quantity  UnitPrice  \
1936     2936 2023-07-22      Laptop  Electronics         2       1200   
835      1835 2023-03-31      Tablet  Electronics         1        500   
2742     3742 2023-10-16     Monitor  Electronics         1        300   
1830     2830 2023-07-10       Phone  Electronics         2        800   
1078     2078 2023-04-26      Tablet  Electronics         4        500   
648      1648 2023-03-13      Tablet  Electronics         2        500   
2690     3690 2023-10-11     Monitor  Electronics         4        300   
3116     41

Unnamed: 0,OrderID,Date,Product,Category,Quantity,UnitPrice,TotalSales
801,1801,2023-03-28,Laptop,Electronics,2,1200,2400
2003,3003,2023-07-29,Headphones,Accessories,1,150,150
992,1992,2023-04-16,Keyboard,Accessories,1,100,100
765,1765,2023-03-24,Mouse,Accessories,3,50,150
1720,2720,2023-06-30,Monitor,Electronics,3,300,900
255,1255,2023-01-26,Monitor,Electronics,4,300,1200
1226,2226,2023-05-09,Mouse,Accessories,1,50,50
3186,4186,2023-11-29,Mouse,Accessories,4,50,200
1942,2942,2023-07-23,Tablet,Electronics,2,500,1000
857,1857,2023-04-03,Laptop,Electronics,1,1200,1200


In [11]:
# quick check for calculation errors
df['CheckTotal'] = df['Quantity'] * df['UnitPrice']

# compare with TotalSales
mismatches = df[df['CheckTotal'] != df['TotalSales']]
print("Number of mismatches:", len(mismatches))

# if any mismatches, show the first few
if len(mismatches)> 0:
    display(mismatches.head())

Number of mismatches: 0


## Data set is clean. Now we will move on to analyzing the data/answering some questions that might be useful

In [12]:
# total sales overall
total_sales = df['TotalSales'].sum()
print("Total Sales (all time): $", total_sales)

Total Sales (all time): $ 3850000
