In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the data
online_sales = pd.read_csv('Online_Sales.csv')
# read_csv leaves dates as plain strings. The time-based groupers used later
# (pd.Grouper(key='Transaction_Date', freq=...)) only accept datetime-like
# values, so the column is converted once, here; unparseable values raise
# immediately instead of failing further down.
# NOTE(review): the date format is inferred -- pass format=/dayfirst= if the
# file uses an ambiguous layout.
online_sales['Transaction_Date'] = pd.to_datetime(online_sales['Transaction_Date'])
customers_data = pd.read_csv('Customers_Data.csv')
discount_coupon = pd.read_csv('Discount_Coupon.csv')
marketing_spend = pd.read_csv('Marketing_Spend.csv')
tax_amount = pd.read_csv('Tax_Amount.csv')

# Step 1: Invoice amount (sale amount / revenue) for every row of online_sales.
# Quantity times unit price, reduced by the discount share, grossed up by GST,
# with the delivery charge added on top.
# NOTE(review): 'Discount_pct' and 'GST' are read from online_sales, while
# discount_coupon and tax_amount are loaded separately and never joined in
# this section -- confirm these columns really exist on online_sales.
online_sales['Invoice Value'] = (
    online_sales['Quantity']
    .mul(online_sales['Avg_price'])
    .mul(1 - online_sales['Discount_pct'])
    .mul(1 + online_sales['GST'])
    .add(online_sales['Delivery_Charges'])
)

# Step 2: Perform Detailed Exploratory Analysis
# Each row is compared against the month of its customer's first-ever
# purchase, which is what separates new customers from returning ones.
# (Transaction_Date has to be datetime-like for the .dt accessor and for the
# month grouper below.)
first_purchase_date = online_sales.groupby('CustomerID')['Transaction_Date'].transform('min')
is_first_purchase_month = (
    online_sales['Transaction_Date'].dt.to_period('M')
    == first_purchase_date.dt.to_period('M')
)

# Distinct customers active per month. Also supplies the complete month index
# that the two series below are aligned to.
monthly_active_customers = online_sales.groupby(
    pd.Grouper(key='Transaction_Date', freq='M')
)['CustomerID'].nunique()

# Customer Acquisition
# Number of customers whose first purchase falls in the month. Months without
# any new customer are reported as 0 rather than being absent.
customer_acquisition = (
    online_sales[is_first_purchase_month]
    .groupby(pd.Grouper(key='Transaction_Date', freq='M'))['CustomerID']
    .nunique()
    .reindex(monthly_active_customers.index, fill_value=0)
)

# Customer Retention
# Number of active customers who had already bought in an earlier month.
# Every active customer of a month is either new or returning, so the
# returning count is the active count minus the newly acquired count.
customer_retention = monthly_active_customers - customer_acquisition

def _transaction_period(freq):
    """Return a new Transaction_Date grouper for the pandas frequency alias *freq*."""
    return pd.Grouper(key='Transaction_Date', freq=freq)


# Revenue Analysis
# Monthly invoice value split by product category and location.
revenue_analysis = (
    online_sales
    .groupby([_transaction_period('M'), 'Product_Category', 'Location'])['Invoice Value']
    .sum()
)

# Discount Analysis
# Monthly invoice value split by product category and coupon status.
discount_analysis = (
    online_sales
    .groupby([_transaction_period('M'), 'Product_Category', 'Coupon_Status'])['Invoice Value']
    .sum()
)

# KPI Analysis
# Monthly invoice value per product category.
kpi_analysis = (
    online_sales
    .groupby([_transaction_period('M'), 'Product_Category'])['Invoice Value']
    .sum()
)

# Trend and Seasonality Analysis
# Total invoice value per month.
trend_seasonality_analysis = (
    online_sales
    .groupby(_transaction_period('M'))['Invoice Value']
    .sum()
)

# Day-wise Analysis
# Total invoice value per calendar day.
day_wise_analysis = (
    online_sales
    .groupby(_transaction_period('D'))['Invoice Value']
    .sum()
)

# Step 3: Performing Customer Segmentation
# Heuristic (Value-based, RFM) Segmentation
# The per-customer aggregates are indexed by CustomerID, whereas
# customers_data (as returned by read_csv) is indexed by row position.
# Assigning the aggregates directly would align them on the wrong labels, so
# each metric is looked up through the customer key instead.
# NOTE(review): assumes customers_data has a 'CustomerID' column that matches
# online_sales['CustomerID'] -- confirm against Customers_Data.csv.
purchase_dates = pd.to_datetime(online_sales['Transaction_Date'])

revenue_by_customer = online_sales.groupby('CustomerID')['Invoice Value'].sum()
# Distinct transactions, not rows: one transaction may span several item rows.
frequency_by_customer = online_sales.groupby('CustomerID')['Transaction_ID'].nunique()
# Recency as whole days between the customer's last purchase and the latest
# purchase in the data; a raw timestamp cannot be used as a numeric feature.
last_purchase_by_customer = purchase_dates.groupby(online_sales['CustomerID']).max()
recency_by_customer = (purchase_dates.max() - last_purchase_by_customer).dt.days

customers_data['Revenue'] = customers_data['CustomerID'].map(revenue_by_customer)
customers_data['Frequency'] = customers_data['CustomerID'].map(frequency_by_customer)
customers_data['Recency'] = customers_data['CustomerID'].map(recency_by_customer)

# Scientific (K-Means) Segmentation
# K-Means rejects NaN, so only customers with all three metrics are
# clustered; customers without any purchase keep a missing cluster label.
# NOTE(review): the features are unscaled, so Revenue is likely to dominate
# the distance computation -- consider standardising before clustering.
rfm_complete = customers_data[['Revenue', 'Frequency', 'Recency']].dropna()
# random_state makes the segmentation reproducible; n_init is set explicitly
# because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_labels = pd.Series(kmeans.fit_predict(rfm_complete), index=rfm_complete.index)
# Column assignment aligns on the row index; rows that were not clustered
# become <NA> in the nullable integer column.
customers_data['Cluster'] = cluster_labels.astype('Int64')

# Step 4: Predicting Customer Lifetime Value
# Define dependent variable with categories low value, medium value, high value using customer revenue
# The top bin is open-ended and the lowest edge is inclusive, so customers
# with revenue above 1000 or exactly 0 still receive a label instead of NaN.
customers_data['Lifetime Value'] = pd.cut(
    customers_data['Revenue'],
    bins=[0, 100, 500, np.inf],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

# Build classification model
# scikit-learn rejects NaN in both features and labels, so only customers
# with a complete feature row and a label take part in training/evaluation.
clv_rows = customers_data.dropna(subset=['Frequency', 'Recency', 'Lifetime Value'])
X = clv_rows[['Frequency', 'Recency']]
y = clv_rows['Lifetime Value'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# The features are unscaled, so the solver gets more iterations than the
# default to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Step 5: Cross-Selling
# Total invoice value per product SKU.
# NOTE(review): this only totals revenue per SKU; no co-purchase / market
# basket computation (which products appear together in one transaction) is
# performed here.
cross_selling_analysis = (
    online_sales['Invoice Value']
    .groupby(online_sales['Product_SKU'])
    .sum()
)

# Step 6: Predicting Next Purchase Day
# Create dependent variable at the customer level (average days per one transaction for only repeat customers)
# Rows are first collapsed to one entry per customer and calendar day, so
# several rows on the same day do not contribute zero-day gaps.
purchase_days = pd.DataFrame({
    'CustomerID': online_sales['CustomerID'],
    'Purchase_Day': pd.to_datetime(online_sales['Transaction_Date']).dt.normalize(),
}).drop_duplicates()
day_span = purchase_days.groupby('CustomerID')['Purchase_Day'].agg(['min', 'max', 'count'])
# Only repeat customers (at least two distinct purchase days) have a gap.
repeat_span = day_span[day_span['count'] > 1]
# For sorted distinct days the mean of consecutive gaps equals
# (last - first) / (number of days - 1), so no per-customer sort/diff is needed.
average_gap_days = (
    (repeat_span['max'] - repeat_span['min']).dt.days / (repeat_span['count'] - 1)
)
# The gaps are indexed by CustomerID while customers_data is indexed by row
# position, hence the lookup through the customer key.
# NOTE(review): assumes customers_data has a 'CustomerID' column that matches
# online_sales['CustomerID'] -- confirm against Customers_Data.csv.
customers_data['Average Days'] = customers_data['CustomerID'].map(average_gap_days)

# Divide into groups 0-30 days, 30-60 days, 60-90 days, and 90+ days
customers_data['Next Purchase Day'] = pd.cut(
    customers_data['Average Days'],
    bins=[0, 30, 60, 90, np.inf],
    labels=['0-30 days', '30-60 days', '60-90 days', '90+ days'],
)

# Build classification model
# One-time buyers have no label, and scikit-learn rejects NaN in features or
# labels, so only fully populated rows are used.
repeat_rows = customers_data.dropna(subset=['Frequency', 'Recency', 'Next Purchase Day'])
X = repeat_rows[['Frequency', 'Recency']]
y = repeat_rows['Next Purchase Day'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# The features are unscaled, so the solver gets more iterations than the
# default to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Step 7: Cohort Analysis
# Define cohorts based on the month of first purchase and analyze their behavior over time
cohort_dates = pd.to_datetime(online_sales['Transaction_Date'])
order_month = cohort_dates.dt.to_period('M')
# Cohort = calendar month of the customer's first-ever purchase.
cohort_month = (
    cohort_dates.groupby(online_sales['CustomerID']).transform('min').dt.to_period('M')
)
# Whole months elapsed between the cohort month and the purchase month.
months_since_first = (
    (order_month.dt.year - cohort_month.dt.year) * 12
    + (order_month.dt.month - cohort_month.dt.month)
)

# Distinct active customers per cohort (rows) and months since the first
# purchase (columns); column 0 is therefore the cohort size.
cohort_counts = (
    pd.DataFrame({
        'CustomerID': online_sales['CustomerID'],
        'Cohort': cohort_month,
        'Months_Since_First': months_since_first,
    })
    .groupby(['Cohort', 'Months_Since_First'])['CustomerID']
    .nunique()
    .unstack(fill_value=0)
)

# A cohort can only be observed up to the last month present in the data.
# Later cells are masked out (NaN) so they are not mistaken for 0% retention.
last_month = order_month.max()
observable_months = (
    (last_month.year - cohort_counts.index.year) * 12
    + (last_month.month - cohort_counts.index.month)
).to_numpy()
is_observable = cohort_counts.columns.to_numpy()[None, :] <= observable_months[:, None]
cohort_retention = cohort_counts.div(cohort_counts[0], axis=0).where(is_observable)

# Mean share of each cohort that is active again in the months after its
# first purchase. Month 0 is excluded because it is 100% by definition.
cohort_analysis = cohort_retention.drop(columns=0).mean(axis=1)

# Identify which month cohort has the maximum retention
# Cohorts without any observable follow-up month are NaN and are skipped.
max_retention_cohort = cohort_analysis.idxmax()
print('Max Retention Cohort:', max_retention_cohort)