In [1]:
import pandas as pd
import csv

# Load the dataset
df = pd.read_csv('online_retail.csv')

# Display basic information about the dataset
df.info()

# Show summary statistics for numerical and categorical columns
df.describe(include='all')

# Check for missing values
missing_rows = df[df.isnull().any(axis=1)]
# missing_rows

# Remove missing customers
df = df.dropna(subset=['Customer ID'])

# Convert 'Customer Id' to integer
df['Customer ID'] = df['Customer ID'].astype(int)

# Identify canceled orders (Invoice start with 'c')
cancelled_invoice = df[df['Invoice'].str.startswith('C')]
# cancelled_invoice

# Remove canceled orders
df = df[~df['Invoice'].str.startswith('C')]

# Identify rows where item price was '0'
zero_price_items = df[df['Price'] <= 0]
# zero_price_items

# Remove rows where item price was '0'
df = df[df['Price'] > 0]

# View all unique stock codes sorted
unique_stock_codes = df['StockCode'].unique()
print("Unique StockCodes:")
print(unique_stock_codes)

# Remove non-product stock codes
# List of non-product codes
non_product_codes = ['POST', 'DOT', 'ADJUST', 'C2', 'M', 'BANK CHARGES']

# Keep only rows where StockCode is NOT in the list
df_clean = df[~df['StockCode'].isin(non_product_codes)]

# Check if any remain
remaining_non_products = df_clean[df_clean['StockCode'].isin(non_product_codes)]
# remaining_non_products

# Create 'TotalPrice' column
df['TotalPrice'] = df['Quantity'] * df['Price']

# Check the new column for first few rows
df[['Quantity', 'Price', 'TotalPrice']].head()

# Convert 'InvoiceDate' to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Extract Year, Month, DayOfWeek, Hour:
df['Year'] = df['InvoiceDate'].dt.year
df['Month'] = df['InvoiceDate'].dt.month
df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek
df['Hour'] = df['InvoiceDate'].dt.hour

# Check the extracted values
df[['InvoiceDate','Year', 'Month', 'DayOfWeek', 'Hour']].head()

# Finally check the dataset 
df.info() 
df.isnull().sum()

# Save the cleaned dataset to a new CSV file
df.to_csv('online_retail_cleaned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB
Unique StockCodes:
['85048' '79323P' '79323W' ... '23562' '23561' '23843']
<class 'pandas.core.frame.DataFrame'>
Index: 805549 entries, 0 to 1067370
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      805549 non-null  object        
 1   StockCode    805549 non-null  object        
 2 