In [4]:
import pandas as pd
import csv

# Clean the raw online-retail export and save the result.
#
# Steps: load -> drop unusable rows -> add derived columns -> sanity checks
# -> write 'online_retail_cleaned.csv'. The cleaned frame is left in `df`.

# Read the CSV file.
# Invoice and StockCode are identifiers, not numbers: force them to text so
# the string operations below behave the same no matter how pandas would
# otherwise infer the column types.
df = pd.read_csv(
    'online_retail.csv',
    dtype={'Invoice': str, 'StockCode': str},
)

# List of non-product stock codes to exclude
non_product_codes = ['POST', 'DOT', 'ADJUST', 'C2', 'M', 'BANK CHARGES']

# Row filters, each evaluated against the raw frame
has_customer = df['Customer ID'].notna()
# Canceled orders have an invoice number starting with 'C'.
# na=False: a missing invoice number is not treated as a cancellation
# (and keeps the mask strictly boolean so it can be negated safely).
is_canceled = df['Invoice'].str.startswith('C', na=False)
# Strictly positive prices only (drops zero AND negative prices)
has_positive_price = df['Price'] > 0
is_non_product = df['StockCode'].isin(non_product_codes)

# Apply all filters at once. The explicit .copy() makes `df` an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
df = df.loc[
    has_customer & ~is_canceled & has_positive_price & ~is_non_product
].copy()

# Convert 'Customer ID' to integer (safe now that missing IDs are gone)
df['Customer ID'] = df['Customer ID'].astype(int)

# Create 'TotalPrice' column
df['TotalPrice'] = df['Quantity'] * df['Price']

# Check the new column for the first few rows.
# Only the LAST expression of a cell is echoed automatically, so every
# intermediate check is printed explicitly.
print(df[['Quantity', 'Price', 'TotalPrice']].head())

# Convert 'InvoiceDate' to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Extract Year, Month, DayOfWeek (Monday=0 ... Sunday=6) and Hour
invoice_dt = df['InvoiceDate'].dt
df['Year'] = invoice_dt.year
df['Month'] = invoice_dt.month
df['DayOfWeek'] = invoice_dt.dayofweek
df['Hour'] = invoice_dt.hour

# Check the extracted values
print(df[['InvoiceDate', 'Year', 'Month', 'DayOfWeek', 'Hour']].head())

# Finally check the dataset: schema summary plus per-column null counts
df.info()
print(df.isnull().sum())

# Save the cleaned dataset to a new CSV file
df.to_csv('online_retail_cleaned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 802669 entries, 0 to 1067369
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      802669 non-null  object        
 1   StockCode    802669 non-null  object        
 2   Description  802669 non-null  object        
 3   Quantity     802669 non-null  int64         
 4   InvoiceDate  802669 non-null  datetime64[ns]
 5   Price        802669 non-null  float64       
 6   Customer ID  802669 non-null  int64         
 7   Country      802669 non-null  object        
 8   TotalPrice   802669 non-null  float64       
 9   Year         802669 non-null  int32         
 10  Month        802669 non-null  int32         
 11  DayOfWeek    802669 non-null  int32         
 12  Hour         802669 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(4), int64(2), object(4)
memory usage: 73.5+ MB
