In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# The CSV is not UTF-8 encoded; reading it as ISO-8859-1 prevents a
# UnicodeDecodeError while parsing.
df = pd.read_csv(
    '/content/OnlineRetail.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

In [None]:
# Keep an untouched snapshot of the raw data; a later cell reads the original
# 'Country' column back from it after one-hot encoding.
data = df.copy()

# Only the last expression of a notebook cell is rendered, so the intermediate
# summaries are printed explicitly instead of being silently discarded.
print("Shape (rows, columns):", df.shape)
print("Columns:", list(df.columns))

# Data types and non-null counts (info() writes directly to stdout)
df.info()

# Summary statistics for the numerical columns (rendered as the cell output)
df.describe()

In [None]:
# Cardinality check: how many distinct values each column holds
distinct_per_column = df.nunique()
print(distinct_per_column)

In [None]:
# Country overview: number of distinct countries, then row count per country
country_series = df['Country']
print("Number of countries:", country_series.nunique())
print(country_series.value_counts())

In [None]:
# The five CustomerIDs that appear on the most rows
rows_per_customer = df['CustomerID'].value_counts()
print(rows_per_customer.head())

In [None]:
# Count negative quantities and prices (possible returns, entry errors or outliers)
print("Negative Quantity values:", df['Quantity'].lt(0).sum())
print("Negative UnitPrice values:", df['UnitPrice'].lt(0).sum())

In [None]:
# Count rows that are exact duplicates of an earlier row
duplicate_mask = df.duplicated()
print("Number of duplicate rows:", duplicate_mask.sum())

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# CustomerID is the key identifier and cannot be imputed reliably,
# so rows without it are discarded.
df = df.dropna(axis='index', subset=['CustomerID'])

In [None]:
# Lookup table StockCode -> Description, built from rows that have a
# description (first occurrence per StockCode wins).
stockcode_to_desc = (
    df.dropna(subset=['Description'])
      .drop_duplicates('StockCode')
      .set_index('StockCode')['Description']
)

# Fill missing descriptions with the description known for the same StockCode
fallback_description = df['StockCode'].map(stockcode_to_desc)
df['Description'] = df['Description'].fillna(fallback_description)

In [None]:
# Discard rows that have no Quantity
df = df.dropna(axis='index', subset=['Quantity'])

In [None]:
# Discard rows that have no InvoiceDate
df = df.dropna(axis='index', subset=['InvoiceDate'])

In [None]:
# Discard rows that have no Country
df = df.dropna(axis='index', subset=['Country'])

In [None]:
# Re-check the per-column missing counts after the clean-up above
df.isna().sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [None]:
# Verify that no duplicate rows remain
remaining_duplicates = df.duplicated().sum()
print("Number of duplicate rows:", remaining_duplicates)

In [None]:
# Parse InvoiceDate into datetimes; values that cannot be parsed become NaT
# because of errors='coerce'.
parsed_invoice_dates = pd.to_datetime(df['InvoiceDate'], errors='coerce')
df['InvoiceDate'] = parsed_invoice_dates

In [None]:
# Confirm the column now has a datetime dtype
print(df['InvoiceDate'].dtype)

In [None]:
# Derive calendar features from 'InvoiceDate' through its datetime accessor
invoice_dt = df['InvoiceDate'].dt
df['Year'] = invoice_dt.year
df['Month'] = invoice_dt.month
df['Day'] = invoice_dt.day
df['Hour'] = invoice_dt.hour
# Weekday name (e.g. 'Monday') of the invoice date
df['Weekday'] = invoice_dt.day_name()

In [None]:
# Inspect the frame with the new calendar columns
df.head(n=5)

In [None]:
# Normalise 'Description': trim surrounding whitespace, then lowercase
trimmed_description = df['Description'].str.strip()
df['Description'] = trimmed_description.str.lower()

In [None]:
# Encode the weekday as an ordinal with Monday = 0, ..., Sunday = 6.
# LabelEncoder numbers labels in alphabetical order (Friday = 0, Monday = 1,
# ...), which does not match that calendar ordering, so the code is taken
# straight from the timestamp instead.
df['Weekday_encoded'] = df['InvoiceDate'].dt.dayofweek

In [None]:
# One-hot encode 'Country'. drop_first=True omits the first category's column
# to avoid multicollinearity; the original 'Country' column is replaced by the
# generated dummy columns.
df = pd.get_dummies(data=df, columns=['Country'], drop_first=True)

In [None]:
# Inspect the frame after encoding
df.head(n=5)

In [None]:
# Keep only rows with a strictly positive quantity AND a strictly positive
# unit price (drops returns, zero-quantity and zero/negative-price rows)
has_positive_quantity = df['Quantity'] > 0
has_positive_price = df['UnitPrice'] > 0
df = df[has_positive_quantity & has_positive_price]

In [None]:
# Safety net: make sure no row without a CustomerID is left
df = df.dropna(axis='index', subset=['CustomerID'])

In [None]:
# Drop rows whose description is at most one character long (e.g. "?")
description_length = df['Description'].str.len()
df = df[description_length > 1]

In [None]:
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiple of the IQR used as the fence distance.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows within the fences, so
        callers can add columns to it without a SettingWithCopyWarning.
        Rows where ``column`` is missing are not kept, because the range
        comparison is False for NaN.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    fence_distance = factor * (q3 - q1)
    within_fences = df[column].between(q1 - fence_distance, q3 + fence_distance)
    return df[within_fences].copy()

# Apply the IQR filter to 'Quantity' first, then to 'UnitPrice'
for outlier_column in ('Quantity', 'UnitPrice'):
    df = remove_outliers_iqr(df, outlier_column)

In [None]:
# Line total: quantity multiplied by unit price
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

In [None]:
# Binary flag: 1 when the invoice number starts with 'C', else 0.
# astype(str) keeps the flag well-defined for missing or non-string invoice
# numbers; without it `.str.startswith` yields NaN for those rows and the
# final astype(int) raises.
# NOTE(review): rows with Quantity <= 0 were already removed earlier in the
# notebook. If canceled invoices carry negative quantities, this flag is 0
# for every remaining row — confirm against the raw data.
df['IsCanceled'] = df['InvoiceNo'].astype(str).str.startswith('C').astype(int)

In [None]:
# Share of rows flagged as canceled (mean of the 0/1 flag)
cancel_rate = df.loc[:, 'IsCanceled'].mean()
cancel_rate

In [None]:
# Frequency: number of distinct invoices per customer ...
customer_invoice_counts = (
    df.groupby('CustomerID')['InvoiceNo']
      .nunique()
)
# ... broadcast back onto every row of that customer
df['Frequency'] = df['CustomerID'].map(customer_invoice_counts)

In [None]:
# Total spend per customer: sum of TotalPrice over all of the customer's rows,
# broadcast back onto each row
customer_total_spend = (
    df.groupby('CustomerID')['TotalPrice']
      .sum()
)
df['CustomerSpend'] = df['CustomerID'].map(customer_total_spend)

In [None]:
# Total spend per invoice
invoice_spend = df.groupby('InvoiceNo')['TotalPrice'].sum()

# One row per invoice together with its customer (first occurrence wins).
# The explicit copy makes this an independent frame, so adding a column to it
# below does not raise pandas' SettingWithCopyWarning.
invoice_customer = df.drop_duplicates('InvoiceNo')[['InvoiceNo', 'CustomerID']].copy()

# Attach each invoice's total spend
invoice_customer['InvoiceTotal'] = invoice_customer['InvoiceNo'].map(invoice_spend)

# Average basket value: mean invoice total per customer
average_basket = invoice_customer.groupby('CustomerID')['InvoiceTotal'].mean()

# Broadcast the average basket value back onto every row of the customer
df['AverageBasketValue'] = df['CustomerID'].map(average_basket)

In [None]:
# Inspect the frame with the engineered customer features
df.head(n=5)

In [None]:
# Make sure 'InvoiceDate' is datetime (a no-op when it already is)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Total TotalPrice per calendar month
sales_per_month = df.groupby(df['InvoiceDate'].dt.to_period('M'))['TotalPrice'].sum()

# Title and axis labels let the chart stand on its own; plt.show() keeps the
# Axes repr out of the cell output.
ax = sales_per_month.plot(title='Total Sales per Month')
ax.set_xlabel('Invoice month')
ax.set_ylabel('Total sales (sum of TotalPrice)')
plt.show()

In [None]:
# Rows invoiced in March (month number 3, irrespective of the year)
in_march = df['InvoiceDate'].dt.month.eq(3)
march_data = df[in_march]

# Distinct invoices and distinct customers seen in March
march_invoice_count = march_data['InvoiceNo'].nunique()
march_customers = march_data['CustomerID'].nunique()

print("Number of invoices in March:", march_invoice_count)
print("#################################")
print("Unique customers in March:", march_customers)

In [None]:
# Units sold per product in March, computed once and ranked in both directions
march_quantity_by_product = march_data.groupby('Description')['Quantity'].sum()
march_top_products = march_quantity_by_product.sort_values(ascending=False).head(10)
march_low_products = march_quantity_by_product.sort_values().head(10)

print(march_top_products)
print("#################################")
print(march_low_products)

In [None]:
# Returns are rows with a negative quantity. Such rows were filtered out of
# `df` (and therefore `march_data`) during cleaning, so counting them there
# always gives 0. They are counted on the raw snapshot `data` instead.
# NOTE(review): `data` is the unfiltered load, so it may include duplicate
# rows and rows without a CustomerID — confirm whether those should count.
raw_invoice_month = pd.to_datetime(data['InvoiceDate'], errors='coerce').dt.month
march_returns = data[(raw_invoice_month == 3) & (data['Quantity'] < 0)]
print("Number of returns in March:", len(march_returns))

In [None]:
# Customer-level RFM table.
# The most recent invoice date is computed once up front rather than inside
# the per-customer lambda, where it would be re-evaluated for every group.
latest_invoice_date = df['InvoiceDate'].max()

rfm_df = df.groupby('CustomerID').agg(
    # Recency: whole days between the customer's last invoice and the latest invoice overall
    Recency=('InvoiceDate', lambda dates: (latest_invoice_date - dates.max()).days),
    # Frequency: number of distinct invoices
    Frequency=('InvoiceNo', 'nunique'),
    # Monetary: total of TotalPrice
    Monetary=('TotalPrice', 'sum'),
)

In [None]:
# Make sure 'InvoiceDate' is datetime (a no-op when it already is)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Reference date: the last invoice date in the data plus one day
reference_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Recency: whole days between the reference date and the customer's last purchase
recency_df = df.groupby('CustomerID')['InvoiceDate'].max().reset_index()
recency_df['Recency'] = (reference_date - recency_df['InvoiceDate']).dt.days

# Broadcast Recency onto the rows with map() instead of merge(): this keeps
# df's existing index (merge returns a freshly numbered one) and lets the cell
# be re-run without producing duplicated 'Recency_x' / 'Recency_y' columns.
df['Recency'] = df['CustomerID'].map(recency_df.set_index('CustomerID')['Recency'])

# Monetary: total amount spent by the customer
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']  # line total per row
monetary_df = (
    df.groupby('CustomerID')['TotalPrice']
      .sum()
      .reset_index()
      .rename(columns={'TotalPrice': 'Monetary'})
)

# Broadcast Monetary onto the rows the same way
df['Monetary'] = df['CustomerID'].map(monetary_df.set_index('CustomerID')['Monetary'])

In [None]:
# Ten products with the highest total quantity sold
quantity_by_product = df.groupby('Description')['Quantity'].sum()
top_products = quantity_by_product.sort_values(ascending=False).head(10)
top_products

In [None]:
# Month number (1-12) of each invoice
invoice_month = df['InvoiceDate'].dt.month
df['Month'] = invoice_month

# Total TotalPrice per month number, for a seasonal view
seasonal_sales = df.groupby('Month')['TotalPrice'].sum()
seasonal_sales

In [None]:
# Restore the original 'Country' values that one-hot encoding replaced.
# A plain `df['Country'] = data['Country']` aligns on the index, which is only
# correct while df still carries the raw snapshot's row labels; a merge, for
# example, returns a freshly numbered index and would attach the countries of
# unrelated rows. Looking the country up by invoice number avoids that.
# NOTE(review): assumes every invoice belongs to exactly one country — confirm
# against the raw data.
invoice_to_country = data.drop_duplicates('InvoiceNo').set_index('InvoiceNo')['Country']
df['Country'] = df['InvoiceNo'].map(invoice_to_country)

In [None]:
# Total sales per country, ordered from the best-performing country down
sales_by_country = df.groupby('Country')['TotalPrice'].sum()
country_sales = sales_by_country.sort_values(ascending=False)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
# Pick the three RFM columns from the transaction-level frame
rfm_feature_names = ['Recency', 'Frequency', 'Monetary']
rfm = df[rfm_feature_names]

In [None]:
# Standardise the RFM features: learn the scaling, then apply it
scaler = StandardScaler()
scaler.fit(rfm)
rfm_scaled = scaler.transform(rfm)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Replace missing RFM values with the column mean, keeping labels and index
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(rfm)
rfm_imputed = pd.DataFrame(imputed_values, columns=rfm.columns, index=rfm.index)

# Standardise the imputed features, again as a labelled DataFrame
scaler = StandardScaler()
scaled_values = scaler.fit_transform(rfm_imputed)
rfm_scaled = pd.DataFrame(scaled_values, columns=rfm.columns, index=rfm.index)

In [None]:
# Apply KMeans on one row per customer.
# Recency, Frequency and Monetary are customer-level values repeated on every
# transaction row; clustering the rows directly would weight each customer by
# the number of rows they have. The per-customer values are identical across
# a customer's rows, so taking the first row per customer loses nothing.
# NOTE(review): the scaling applied to rfm_scaled was still fitted on
# transaction rows — consider fitting it on customer-level data as well.
customer_rfm = rfm_scaled.groupby(df['CustomerID']).first()

kmeans = KMeans(n_clusters=3, random_state=42)
customer_clusters = pd.Series(kmeans.fit_predict(customer_rfm), index=customer_rfm.index)

# Broadcast each customer's cluster id back onto their transaction rows
df['CustomerSegment'] = df['CustomerID'].map(customer_clusters)

In [None]:
# Human-readable names for the cluster ids 0, 1 and 2.
# Adjust the assignment after profiling the clusters.
segment_map = dict(enumerate(['Important', 'Normal', 'Low-Value']))

In [None]:
# KMeans cluster ids are arbitrary, so id 0 is not necessarily the most
# valuable group. The clusters are therefore ranked by their customers' mean
# Monetary value and given the names from segment_map in that order (its
# values are listed from most to least valuable).
customer_level = df.drop_duplicates('CustomerID')
clusters_by_value = (
    customer_level.groupby('CustomerSegment')['Monetary']
                  .mean()
                  .sort_values(ascending=False)
                  .index
)
segment_labels = dict(zip(clusters_by_value, segment_map.values()))
df['CustomerSegment'] = df['CustomerSegment'].map(segment_labels)

# View sample
print(df[['CustomerID', 'CustomerSegment']].drop_duplicates().head())

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
# Seed NumPy's global RNG so the generated discounts are reproducible
np.random.seed(42)

# NOTE(review): 'Discount' is drawn at random (uniform on [0, 0.5), rounded to
# two decimals); it is not taken from the data set.
raw_discounts = np.random.uniform(low=0, high=0.5, size=len(df))
df['Discount'] = raw_discounts.round(2)


In [None]:
# The regression needs both columns, so drop rows missing either of them
df = df.dropna(axis='index', subset=['Discount', 'TotalPrice'])

In [None]:
# Regression inputs: 'Discount' as the single feature (kept two-dimensional)
# and 'TotalPrice' as the target
feature_columns = ['Discount']
X = df[feature_columns]
y = df['TotalPrice']

In [None]:
# Create and fit the linear regression in one step (fit returns the estimator)
model = LinearRegression().fit(X, y)
model

In [None]:
# Print the fitted line in the form "TotalSpend = slope * Discount + intercept",
# with the slope (first coefficient) and the intercept shown to two decimals
print(f"Regression Equation: TotalSpend = {model.coef_[0]:.2f} * Discount + {model.intercept_:.2f}")

In [None]:

# 100 evenly spaced discount values from 0 up to the largest observed discount
discounts = np.linspace(0, df['Discount'].max(), 100).reshape(-1, 1)

# The model was fitted on a DataFrame with a 'Discount' column; predicting on
# a bare array makes scikit-learn warn about missing feature names, so the
# grid is wrapped in a frame with the same column name for prediction.
predicted_spends = model.predict(pd.DataFrame(discounts, columns=['Discount']))

In [None]:
# Scatter of the observed points with the fitted regression line on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Discount'], df['TotalPrice'], alpha=0.5, label='Actual Data')
ax.plot(discounts, predicted_spends, color='red', linewidth=2, label='Regression Line')
ax.set_xlabel('Discount')
ax.set_ylabel('TotalPrice')
ax.set_title('Effect of Discount on Total Spend')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Flag customers whose first and last purchase lie more than 30 days apart,
# and repeat that 0/1 flag on each of the customer's rows.
# The built-in 'max'/'min' transforms replace a Python lambda that was called
# once per customer.
purchase_dates = df.groupby('CustomerID')['InvoiceDate']
active_span_days = (purchase_dates.transform('max') - purchase_dates.transform('min')).dt.days
df['WillPurchaseAgain'] = (active_span_days > 30).astype(int)

In [None]:
import datetime

# Fixed "today" used as the cutoff for the recency-based label
cutoff_date = datetime.datetime(year=2011, month=12, day=10)

# Whole days between the cutoff and each customer's last purchase
latest_purchase = df.groupby('CustomerID')['InvoiceDate'].max()
days_since_last_purchase = (cutoff_date - latest_purchase).dt.days
will_purchase = days_since_last_purchase < 90

# Target: 1 when the last purchase was less than 90 days before the cutoff
customer_df = will_purchase.astype(int).to_frame(name='WillPurchaseAgain')

In [None]:
# Customer-level RFM table measured against cutoff_date
rfm = df.groupby('CustomerID').agg(
    # Recency: whole days between the cutoff and the customer's last invoice
    Recency=('InvoiceDate', lambda dates: (cutoff_date - dates.max()).days),
    # Frequency: number of distinct invoices
    Frequency=('InvoiceNo', 'nunique'),
    # Monetary: total of TotalPrice
    Monetary=('TotalPrice', 'sum'),
)

In [None]:
# Attach the target to the RFM table (aligned on the CustomerID index)
rfm = rfm.assign(WillPurchaseAgain=customer_df['WillPurchaseAgain'])

In [None]:
from sklearn.model_selection import train_test_split

# The target is defined as "days since last purchase < 90", and Recency is
# that same day count, so the label is a deterministic function of Recency.
# Keeping Recency as a feature would leak the label and make the evaluation
# trivially perfect; only Frequency and Monetary are used as features.
X = rfm[['Frequency', 'Monetary']]
y = rfm['WillPurchaseAgain']

# Hold out 20% of the customers for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fix the seed so the forest, and therefore the report below, is reproducible
# (the same seed value the notebook uses for KMeans and the train/test split).
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out customers
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Store the model's prediction for every customer next to the RFM values
rfm = rfm.assign(PredictedRepurchase=model.predict(X))

In [None]:
# Final look at the transaction-level frame
df.head(n=5)