# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data
# Read the five input tables from CSV files in the current working directory,
# in the same order as the names on the left-hand side.
(
    online_sales,
    customers_data,
    discount_coupon,
    marketing_spend,
    tax_amount,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Online_Sales.csv',
        'Customers_Data.csv',
        'Discount_Coupon.csv',
        'Marketing_Spend.csv',
        'Tax_Amount.csv',
    )
)

# Preprocess data
# Invoice value per row = quantity * price, reduced by the discount, increased
# by GST, plus the delivery charge.
# NOTE(review): the formula uses (1 - Discount_pct) and (1 + GST), i.e. it
# treats both columns as fractions rather than percentages — confirm against
# the data.
gross_amount = online_sales['Quantity'] * online_sales['Avg_Price']
discounted_amount = gross_amount * (1 - online_sales['Discount_pct'])
taxed_amount = discounted_amount * (1 + online_sales['GST'])
online_sales['Invoice_Value'] = taxed_amount + online_sales['Delivery_Charges']

# EDA
# Line plot of the invoice values in row order.
# NOTE(review): only the Series is passed to plot(), so the x-axis is the
# DataFrame index rather than a date column, even though the labels say
# "Over Time" / "Date" — confirm the rows are ordered by date.
invoice_ax = plt.figure(figsize=(10, 6)).gca()
invoice_ax.plot(online_sales['Invoice_Value'])
invoice_ax.set_title('Invoice Value Over Time')
invoice_ax.set_xlabel('Date')
invoice_ax.set_ylabel('Invoice Value')
plt.show()

# Customer Segmentation
# Cluster customers into four segments on the 'Value' and 'RFM' columns and
# store the cluster label of each row in 'Segment'.
# random_state pins the centroid initialisation so the segment labels are
# reproducible between runs, in line with the seeded train/test split used
# for the predictive model.
# NOTE(review): the two features are clustered unscaled; if their ranges
# differ widely the larger one will dominate the distance — confirm.
kmeans = KMeans(n_clusters=4, random_state=42)
customers_data['Segment'] = kmeans.fit_predict(customers_data[['Value', 'RFM']])

# Predictive Modeling
# Fit a random-forest classifier that predicts 'Customer_Lifetime_Value' from
# the transaction-level columns, then report accuracy, the classification
# report and the confusion matrix on a held-out 20% split.
#
# CustomerID is deliberately left out of the feature matrix: it is an
# identifier (the cohort analysis in this file parses it as a '-'-separated
# string), so it carries no generalisable signal and a string column cannot be
# fitted by the forest.
# NOTE(review): 'Customer_Lifetime_Value' sounds like a continuous quantity;
# if it is not already binned into classes, a regressor and regression metrics
# are the appropriate choice — confirm.
feature_columns = ['Invoice_Value', 'Quantity', 'Avg_Price', 'Discount_pct', 'GST']
X = online_sales[feature_columns]
y = online_sales['Customer_Lifetime_Value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the forest as well as the split so the reported metrics are repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# Cohort Analysis
# Total invoice value per customer, then the share of customers with a
# positive total, grouped by the month token of their CustomerID.
# NOTE(review): assumes CustomerID is a string shaped like
# '<cohort>-<month>...' — confirm against the data.
cohort = online_sales.groupby('CustomerID')['Invoice_Value'].sum().reset_index()

# Split the ID once and take both tokens from the same split. The month must
# come from the ID itself: 'Cohort' holds only the first token, which contains
# no '-', so splitting it again and indexing [1] always raised IndexError.
id_parts = cohort['CustomerID'].str.split('-')
cohort['Cohort'] = id_parts.str[0]
cohort['Month'] = id_parts.str[1]  # NaN when an ID has no second token

# A customer counts as retained when their summed invoice value is positive.
cohort['Retention'] = cohort['Invoice_Value'] > 0

# groupby drops NaN keys by default, so IDs without a month token are skipped.
cohort = cohort.groupby('Month')['Retention'].mean().reset_index()

plt.figure(figsize=(10,6))
plt.plot(cohort['Month'], cohort['Retention'])
plt.title('Retention by Cohort')
plt.xlabel('Month')
plt.ylabel('Retention')
plt.show()