<a href="https://colab.research.google.com/github/nikshitagchiliveri/DATA-ANALYSIS-IN-E-COMMERCE/blob/main/CUSTOMER_CHURN_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Open Colab's interactive upload widget and keep its return value in `uploaded`.
# NOTE(review): presumably this is where the Kaggle API credentials file is uploaded for the
# `kaggle` CLI call in the next cell; nothing visible here moves it into the CLI's config
# directory — confirm the download works on a fresh runtime.
uploaded = files.upload()

In [None]:
# Download the e-commerce churn dataset archive with the Kaggle CLI.
# NOTE(review): the CLI needs Kaggle API credentials to be configured first — TODO confirm setup.
!kaggle datasets download -d ankitverma2010/ecommerce-customer-churn-analysis-and-prediction

In [None]:
import zipfile

# Extract the downloaded Kaggle archive into /content.
# The context manager closes the archive even if extraction raises, which the
# previous open / extractall / close sequence did not guarantee.
with zipfile.ZipFile('/content/ecommerce-customer-churn-analysis-and-prediction.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
import pandas as pd

# Read the 'E Comm' sheet of the extracted workbook into the working frame, then preview it.
df = pd.read_excel(
    io='/content/E Commerce Dataset.xlsx',
    sheet_name='E Comm',
)
df.head()

In [None]:
# Per-column count of missing values, checked before any imputation cell runs.
df.isna().sum()

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Missing Tenure values are filled with 0.
df['Tenure'] = df['Tenure'].fillna(value=0)

In [None]:
# Replace missing WarehouseToHome values with the column mean.
warehouse_to_home_mean = df['WarehouseToHome'].mean()
df['WarehouseToHome'] = df['WarehouseToHome'].fillna(warehouse_to_home_mean)

In [None]:
# Replace missing HourSpendOnApp values with the column mean.
hour_spend_mean = df['HourSpendOnApp'].mean()
df['HourSpendOnApp'] = df['HourSpendOnApp'].fillna(hour_spend_mean)

In [None]:
# Replace missing OrderAmountHikeFromlastYear values with the column mean.
order_hike_mean = df['OrderAmountHikeFromlastYear'].mean()
df['OrderAmountHikeFromlastYear'] = df['OrderAmountHikeFromlastYear'].fillna(order_hike_mean)

In [None]:
# Missing CouponUsed values are filled with 0.
df['CouponUsed'] = df['CouponUsed'].fillna(value=0)

In [None]:
# Fill missing DaySinceLastOrder values with the column's most frequent value.
# The result is assigned back rather than using fillna(inplace=True) on the selected column:
# the in-place form operates on an intermediate object (chained assignment), which pandas
# warns about and which does not update `df` under copy-on-write. This also matches the
# assignment style used by the other imputation cells.
df['DaySinceLastOrder'] = df['DaySinceLastOrder'].fillna(df['DaySinceLastOrder'].mode()[0])

In [None]:
from sklearn.impute import KNNImputer

# Impute missing OrderCount values with a 5-neighbour KNN imputer fitted on that column alone.
# NOTE(review): with a single input column there are no other features to measure
# distances on, so this likely reduces to mean imputation — confirm against the
# KNNImputer documentation.
imputer = KNNImputer(n_neighbors=5)
order_count_imputed = imputer.fit_transform(df[['OrderCount']])
df['OrderCount'] = order_count_imputed

In [None]:
# Re-check the per-column missing-value counts after the imputation cells.
df.isna().sum()

In [None]:
# Merge the 'Phone' login-device label into 'Mobile Phone'.
df['PreferredLoginDevice'] = df['PreferredLoginDevice'].replace({'Phone': 'Mobile Phone'})

In [None]:
# Print the distinct labels of the login-device and payment-mode columns.
for column_name in ('PreferredLoginDevice', 'PreferredPaymentMode'):
    print(df[column_name].unique())

In [None]:
# Expand the abbreviated payment-mode labels to their full names.
payment_mode_labels = {'CC': 'Credit Card', 'COD': 'Cash on Delivery'}
df['PreferredPaymentMode'] = df['PreferredPaymentMode'].replace(payment_mode_labels)

In [None]:
# Print the distinct labels of the gender and preferred-order-category columns.
for column_name in ('Gender', 'PreferedOrderCat'):
    print(df[column_name].unique())

In [None]:
# Merge the 'Phone' order-category label into 'Mobile Phone'.
df['PreferedOrderCat'] = df['PreferedOrderCat'].replace({'Phone': 'Mobile Phone'})

In [None]:
# Print the order-category labels again to verify the replacement above.
order_category_labels = df['PreferedOrderCat'].unique()
print(order_category_labels)

In [None]:
import matplotlib.pyplot as plt

# Draw DataFrame.hist for the frame with 30 bins per plot on a 15x10 figure.
df.hist(figsize=(15, 10), bins=30)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlations are computed on the numeric columns only.
numeric_df = df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlations, rounded to two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.show()

In [None]:
# Scatter of satisfaction score against hours spent on the app, coloured by churn.
sns.scatterplot(
    x='SatisfactionScore',
    y='HourSpendOnApp',
    hue='Churn',
    data=df,
)
plt.show()

In [None]:
# Cashback amount distribution per city tier, split by churn.
sns.boxplot(
    x='CityTier',
    y='CashbackAmount',
    hue='Churn',
    data=df,
)
plt.show()

In [None]:
# Customer counts per preferred login device, split by churn.
sns.countplot(
    x='PreferredLoginDevice',
    hue='Churn',
    data=df,
)
plt.show()

In [None]:
# Customer counts per marital status, split by churn.
sns.countplot(
    x='MaritalStatus',
    hue='Churn',
    data=df,
)
plt.show()

In [None]:
# Bar plot of Churn (y) for each preferred payment mode (x).
sns.barplot(
    x='PreferredPaymentMode',
    y='Churn',
    data=df,
)
plt.show()

In [None]:
# Persist the cleaned frame without the index column, then confirm to the reader.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)
print("Cleaned data has been saved as 'cleaned_data.csv'.")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Reload the cleaned dataset from the same relative path it was written to earlier
# ('cleaned_data.csv'), so the read no longer depends on the working directory
# being /content.
df = pd.read_csv('cleaned_data.csv')

In [None]:
# Separate the predictors from the target.
# CustomerID is a row identifier rather than a customer attribute; keeping it as a
# feature lets the model fit noise on an arbitrary ID, so it is dropped with the target.
X = df.drop(columns=['Churn', 'CustomerID'])
y = df['Churn']

In [None]:
# Group the predictor columns by dtype so each group gets its own preprocessing step.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include='object').columns

In [None]:
# Column-wise preprocessing for the pipeline:
#   - numeric columns are standardised,
#   - categorical columns are one-hot encoded, ignoring categories unseen during fit.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed for reproducibility.
# Stratifying on the target keeps the churn / no-churn ratio the same in the train and
# test sets, so the reported metrics are not distorted by an unbalanced split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = LogisticRegression()

In [None]:
# Chain preprocessing and the classifier so both are fitted on the training split only.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ]
)
pipeline.fit(X_train, y_train)

In [None]:
# Class predictions of the fitted pipeline for the held-out test rows.
y_pred = pipeline.predict(X_test)

In [None]:
# Threshold-based metrics are computed from the hard class predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC-AUC measures how well the model ranks samples, so it needs a continuous score.
# Passing the 0/1 predictions collapses the ROC curve to a single operating point and
# misreports the AUC; use the predicted probability of the positive class (column 1).
y_score = pipeline.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_score)

print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")

In [None]:
import pandas as pd

# One hypothetical customer, expressed as a single-row mapping of column name -> [value].
sample_data = {
    'CustomerID': [50010],
    'Tenure': [12],
    'PreferredLoginDevice': ['Mobile Phone'],
    'CityTier': [1],
    'WarehouseToHome': [7.0],
    'PreferredPaymentMode': ['Credit Card'],
    'Gender': ['Female'],
    'HourSpendOnApp': [3.5],
    'NumberOfDeviceRegistered': [2],
    'PreferedOrderCat': ['Mobile Phone'],
    'SatisfactionScore': [3],
    'MaritalStatus': ['Married'],
    'NumberOfAddress': [2],
    'Complain': [0],
    'OrderAmountHikeFromlastYear': [12.0],
    'CouponUsed': [5],
    'OrderCount': [10],
    'DaySinceLastOrder': [15],
    'CashbackAmount': [50.0]
}
sample_df = pd.DataFrame(data=sample_data)

# Score the row with the fitted pipeline and keep column 1 of predict_proba,
# which is reported below as the churn probability.
class_probabilities = pipeline.predict_proba(sample_df)
churn_probability = class_probabilities[:, 1]

print(f"Churn Probability for the sample customer: {churn_probability[0]:.2f}")

In [None]:
# A deliberately extreme hypothetical customer, as a single-row mapping of column name -> [value].
# NOTE(review): values such as SatisfactionScore 0 and Complain 5 may fall outside the
# ranges seen in training — confirm against the dataset before interpreting the result.
sample_data = {
    'CustomerID': [50030],
    'Tenure': [0],
    'PreferredLoginDevice': ['Mobile Phone'],
    'CityTier': [3],
    'WarehouseToHome': [50.0],
    'PreferredPaymentMode': ['Debit Card'],
    'Gender': ['Male'],
    'HourSpendOnApp': [0.5],
    'NumberOfDeviceRegistered': [1],
    'PreferedOrderCat': ['Laptop & Accessory'],
    'SatisfactionScore': [0],
    'MaritalStatus': ['Single'],
    'NumberOfAddress': [10],
    'Complain': [5],
    'OrderAmountHikeFromlastYear': [50.0],
    'CouponUsed': [0],
    'OrderCount': [0],
    'DaySinceLastOrder': [100],
    'CashbackAmount': [0.0]
}
sample_df = pd.DataFrame(data=sample_data)

# Score the row with the fitted pipeline and keep column 1 of predict_proba,
# which is reported below as the churn probability.
class_probabilities = pipeline.predict_proba(sample_df)
churn_probability = class_probabilities[:, 1]

print(f"Churn Probability for the extreme high-risk sample customer: {churn_probability[0]:.2f}")