In [8]:
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Load the raw campaign data. The path is relative, so the workbook must sit in
# the kernel's working directory (otherwise pandas raises FileNotFoundError).
df= pd.read_excel("marketing_campaign.xlsx")

FileNotFoundError: [Errno 2] No such file or directory: 'marketing_campaign.xlsx'

In [None]:
df

## Basic Data Inspection

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df=df.dropna()

In [None]:
# Only the last expression of a cell is rendered, so the null counts have to be
# printed explicitly; as a bare expression they were computed and discarded.
print(df.isnull().sum())
df.shape

In [None]:
df.duplicated().sum()

## EDA

In [None]:
df.columns

In [None]:
# Show every column's distinct values: name, a spacer line, the values, then a blank line.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name}\n   \n{distinct_values}", end="\n\n")

In [None]:
df=df.drop(columns=['ID'],axis=1)

In [None]:
df.head(3)

## Feature Engineering




#### Creating a new Age column and dropping Year_Birth

In [None]:
df['Dt_Customer'].max()

In [None]:
# The data is only valid up to 2014-2015, so age is measured relative to 2015
# rather than to the current year.
df['Age']=2015-df['Year_Birth']

In [None]:
df.head(3)

In [None]:
df= df.drop(columns=['Year_Birth'],axis=1)

#### Modifying the Education column

In [None]:
df['Education'].value_counts().plot.pie(autopct='%1.1f%%')

In [None]:
# We can make only three categories as Undergrad, Grad and Postgrad

In [None]:
# Collapse the five education levels into three tiers; unlisted values are left untouched.
df["Education"] = df["Education"].replace(
    {
        "Basic": "Undergraduate",
        "2n Cycle": "Undergraduate",
        "Graduation": "Graduate",
        "Master": "Postgraduate",
        "PhD": "Postgraduate",
    }
)

In [None]:
df['Education'].value_counts()

#### Modifying Marital_Status column

In [None]:
df['Marital_Status'].value_counts().plot(kind='bar')

In [None]:
# Lets keep two categories only

In [None]:
# Reduce marital status to two groups: "Together" joins "Married", every other
# listed value joins "Single". Values not listed here are left untouched.
df["Marital_Status"] = df["Marital_Status"].replace(
    {
        "Together": "Married",
        "Divorced": "Single",
        "Widow": "Single",
        "Alone": "Single",
        "Absurd": "Single",
        "YOLO": "Single",
    }
)

In [None]:
df['Marital_Status'].unique()

Converting Education and Marital_Status to integer codes

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
LE=LabelEncoder()

In [None]:
# Replace the string categories with integer codes.
# The same encoder instance is refitted for each column, so after this cell
# LE only remembers the Marital_Status classes (Education codes can no longer
# be inverse-transformed through it).
df['Education']=LE.fit_transform(df['Education'])
df['Marital_Status']=LE.fit_transform(df['Marital_Status'])

In [None]:
df[['Education','Marital_Status']]

In [None]:
df.head()

#### removing Z_CostContact and Z_Revenue as the values throughout the columns are same

In [None]:
df[['Z_CostContact','Z_Revenue']].value_counts()

In [None]:
df= df.drop(columns=['Z_CostContact','Z_Revenue'],axis=1)

In [None]:
df.head()

#### merging 'Kidhome' and 'Teenhome' into total_children

In [None]:
# Household child count = young kids + teenagers; the two source columns are
# dropped once the combined column exists.
df = df.assign(total_children=df['Kidhome'] + df['Teenhome']).drop(
    columns=['Kidhome', 'Teenhome']
)

In [None]:
df.head(2)

#### Dropping the date column as it is no longer needed

In [None]:
df= df.drop(columns=['Dt_Customer'],axis=1)

In [None]:
df.head(2)

#### separating categorical and numerical(continuous) columns

In [None]:
cat_columns=df[['Education','Marital_Status','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5','AcceptedCmp1','AcceptedCmp2','Complain','Response']]
cat_columns.head(4)

In [None]:
# Everything that is not in the categorical subset is treated as numerical.
num_columns = df.drop(columns=cat_columns.columns)

In [None]:
num_columns.head(4)

## Univariate Analysis

In [None]:
# Histograms
df.hist(bins=30, figsize=(15, 10))
plt.show()

In [None]:
# Boxplots
plt.figure(figsize=(15, 10))
for i, col in enumerate(df.select_dtypes(include=[np.number]).columns):
    plt.subplot(5, 6, i+1)
    sns.boxplot(df[col])
    plt.title(col)
plt.tight_layout()
plt.show()

## Bivariate Analysis

In [None]:
# Correlation matrix
plt.figure(figsize=(24, 20))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Count plots
# Education and Marital_Status were label-encoded above, so selecting
# object-dtype columns matches nothing and the figure came out empty.
# Plot the categorical columns collected in `cat_columns` instead, on a grid
# sized from the number of columns rather than a fixed 2x2 layout.
categorical_names = list(cat_columns.columns)
n_plot_cols = 3
n_plot_rows = -(-len(categorical_names) // n_plot_cols)  # ceiling division
plt.figure(figsize=(15, 10))
for i, col in enumerate(categorical_names):
    plt.subplot(n_plot_rows, n_plot_cols, i+1)
    sns.countplot(y=col, data=df)
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(10,6))
sns.swarmplot(x='NumWebPurchases', y='Income', data=df)
plt.title('Income Distribution by Number of Web Purchases')
plt.xlabel('Number of Web Purchases')
plt.ylabel('Income')
plt.show()

In [None]:
# Total spend per customer: the six product-category amounts added left to right.
# NOTE(review): the column is stored on df, so it is still present in every
# later cell that works on df.
df['Total_Spent'] = sum(
    (
        df[name]
        for name in (
            'MntFruits',
            'MntMeatProducts',
            'MntFishProducts',
            'MntSweetProducts',
            'MntGoldProds',
        )
    ),
    df['MntWines'],
)
plt.figure(figsize=(10,6))
sns.scatterplot(data=df, x='Income', y='Total_Spent')
plt.title('Income vs. Total Amount Spent')
plt.xlabel('Income')
plt.ylabel('Total Amount Spent')
plt.show()

In [None]:
# Recency against customer age. The x-axis is the derived Age column
# (Year_Birth was dropped earlier), so the title and label must say Age.
plt.figure(figsize=(10,6))
sns.lineplot(x='Age', y='Recency', data=df)
plt.title('Customer Recency by Age')
plt.xlabel('Age')
plt.ylabel('Recency')
plt.show()

## Multivariate Analysis

In [None]:
# Boxplots for numerical variables grouped by categorical variables
plt.figure(figsize=(15, 10))
for i, col in enumerate(df.select_dtypes(include=[np.number]).columns):
    plt.subplot(5, 6, i+1)
    sns.boxplot(x='Education', y=col, data=df)
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
# Violin plots
plt.figure(figsize=(15, 10))
for i, col in enumerate(df.select_dtypes(include=[np.number]).columns):
    plt.subplot(5, 6, i+1)
    sns.violinplot(x='Education', y=col, data=df)
    plt.title(col)
plt.tight_layout()
plt.show()

### Checking Skewness and Outliers

In [None]:
for col in num_columns:
    sns.histplot(df[col], kde=True)  # kde=True adds a KDE line to the histogram
    plt.title(f'Distribution of {col}')
    plt.show()
    print()

##### Many of the columns show skewness and possible outliers

In [None]:
df['Income'].astype(int)

In [None]:
num_columns.skew()

In [None]:
for col in num_columns:
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
    plt.show()

In [None]:
for col in num_columns:
    sns.scatterplot(df[col])
    plt.title(f'scatterplot of {col}')
    plt.show()

In [None]:
def detect_outliers(d):
    """Print the IQR-rule outliers of every column in ``d``.

    For each column the 25th and 75th percentiles are computed and every value
    outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` is reported.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose columns are inspected. The values are read from ``d``
        itself; the previous version iterated over ``d`` but looked the
        values up in the global ``df``.

    Returns
    -------
    None
        The outliers are printed, one block per column.
    """
    for i in d:
        column = d[i]
        Q1, Q3 = np.percentile(column, [25, 75])
        IQR = Q3 - Q1

        upper_bound = Q3 + 1.5 * IQR
        lower_bound = Q1 - 1.5 * IQR

        outliers = column[(column > upper_bound) | (column < lower_bound)]
        print(f'*** {i} outlier points***', '\n', outliers, '\n')

In [None]:
detect_outliers(num_columns)

#### Looking at the data, we can see that Income and Age have genuine outliers, so we remove them

In [None]:
# Keep only rows with Age below 100 and Income below 150000 (both conditions must hold).
df = df.loc[df['Age'].lt(100) & df['Income'].lt(150000)]

In [None]:
df[num_columns.columns].skew()

In [None]:
num_columns.columns

#### performing log transformation to reduce skewness

In [None]:
# Columns whose distributions are compressed with log(1 + x).
skewed_features = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
]
# Overwrite each listed column with its log1p-transformed values.
for col in skewed_features:
    df[col] = df[col].apply(np.log1p)


In [None]:
df[num_columns.columns].skew()

In [None]:
for col in num_columns:
    sns.histplot(df[col], kde=True)  # kde=True adds a KDE line to the histogram
    plt.title(f'Distribution of {col}')
    plt.show()
    print()

## Multicollinearity Check

In [None]:
df.head(3)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Convert categorical variables to dummy variables
df = pd.get_dummies(df, drop_first=True)

# Add the constant term on a separate design matrix. Writing it back to `df`
# would carry a constant column into scaling, PCA, clustering and the
# classifiers further down. The float cast keeps `.values` numeric even when
# get_dummies produced boolean columns.
vif_input = add_constant(df.astype(float))

# Calculate VIF for each feature (the constant's own row is still reported)
vif_df = pd.DataFrame({
    "feature": vif_input.columns,
    "VIF": [
        variance_inflation_factor(vif_input.values, i)
        for i in range(vif_input.shape[1])
    ],
})

print(vif_df)


### all values less than 10.

### Scaling of data

In [None]:
# 'PM2.5' and 'PM10' are never created in this notebook, so a strict drop
# raises KeyError. errors='ignore' makes the cell safe to run, and 'const'
# (the VIF helper column, if it was added to df) is removed as well because a
# constant carries no information for scaling or clustering.
df = df.drop(columns=['PM2.5', 'PM10', 'const'], errors='ignore')

In [None]:
data_cleaned=df.copy()
data_cleaned

In [None]:
# Label encoding for categorical column

In [None]:
df.head()

In [None]:
df.shape

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
df_scaled=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)

In [None]:
df_scaled

## PCA

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95, random_state=42)
pca_data = pd.DataFrame(pca.fit_transform(df_scaled))
pca_data.columns = pca_data.columns.astype(str)
print(f"Number of components to explain 95% variance: {pca.n_components_}")

In [None]:
pca_data.head(2)

### Explained Variance

In [None]:
# Calculate cumulative explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# Plotting explained variance
plt.figure(figsize=(10, 6))
plt.plot(np.arange(1, len(explained_variance_ratio) + 1), cumulative_variance_ratio, marker='o', linestyle='--')
plt.title('Cumulative Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.grid(True)
plt.show()

In [None]:
# Print explained variance ratio for each component
print("Explained Variance Ratio by Components:")
for i, ev in enumerate(explained_variance_ratio, start=1):
    print(f"Component {i}: {ev:.3f}")

# Clustering

#### Elbow Method to find No of Clusters

In [None]:
from sklearn.cluster import KMeans

In [None]:
WCSS = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(pca_data)
    WCSS.append(kmeans.inertia_)

# Plotting the Elbow Graph
plt.plot(range(1, 11), WCSS, marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method for Optimal k')
plt.show()

#### Silhouette Score method to find out the number of clusters

In [None]:
from sklearn.metrics import silhouette_score

silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    labels = kmeans.fit_predict(pca_data)
    silhouette_scores.append(silhouette_score(pca_data, labels))

# Plotting the Silhouette Scores
plt.plot(range(2, 11), silhouette_scores, marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method for Optimal k')
plt.show()

In [None]:
# high Silhouette Score indicates the best no of clusters

In [None]:
# Although score of 2 clusters  is higher than 3 clusters ,
# to make an effective strategy for marketing we will create 3 clusters.

## Number of Clusters = 3

### K-Means Clustering

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

In [None]:
# K-Means Clustering
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans_labels = kmeans.fit_predict(pca_data)

# Evaluation Metrics
kmeans_silhouette = silhouette_score(pca_data, kmeans_labels)
kmeans_db_index = davies_bouldin_score(pca_data, kmeans_labels)
kmeans_ch_index = calinski_harabasz_score(pca_data, kmeans_labels)

print(f"K-Means Clustering Metrics:")
print(f"Silhouette Score: {kmeans_silhouette:.4f}")
print(f"Davies-Bouldin Index: {kmeans_db_index:.4f}")
print(f"Calinski-Harabasz Index: {kmeans_ch_index:.4f}")


In [None]:
# Plotting K-Means Clustering Results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_data.iloc[:, 0], y=pca_data.iloc[:, 1], hue=kmeans_labels, palette='viridis')
plt.title('K-Means Clustering Results')
plt.show()

### Agglomerative (Hierarchical) Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc


# Ward linkage matrix over the PCA components, used only for the dendrogram.
linked = shc.linkage(pca_data, method='ward')

plt.figure(figsize=(10, 7))
shc.dendrogram(linked)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Fit Agglomerative Clustering
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# where passing it raises TypeError. Ward linkage only supports Euclidean
# distance, which is already the default, so the argument is simply dropped.
agg_cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = agg_cluster.fit_predict(pca_data)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Hierarchical Clustering with optimal clusters
hierarchical = AgglomerativeClustering(n_clusters=3)
hierarchical_labels = hierarchical.fit_predict(pca_data)

# Evaluation Metrics
hierarchical_silhouette = silhouette_score(pca_data, hierarchical_labels)
hierarchical_db_index = davies_bouldin_score(pca_data, hierarchical_labels)
hierarchical_ch_index = calinski_harabasz_score(pca_data, hierarchical_labels)

print(f"Hierarchical Clustering Metrics:")
print(f"Silhouette Score: {hierarchical_silhouette:.4f}")
print(f"Davies-Bouldin Index: {hierarchical_db_index:.4f}")
print(f"Calinski-Harabasz Index: {hierarchical_ch_index:.4f}")


In [None]:
# Plotting Hierarchical Clustering Results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_data.iloc[:, 0], y=pca_data.iloc[:, 1], hue=hierarchical_labels, palette='viridis')
plt.title('Hierarchical Clustering Results')
plt.show()

### Gaussian Mixture Method

In [None]:
from sklearn.mixture import GaussianMixture

# Gaussian Mixture with 3 components, the same cluster count used for the
# other models in this notebook.
gmm = GaussianMixture(n_components=3, random_state=0)
gmm_labels = gmm.fit_predict(pca_data)

# Evaluation Metrics (computed on the PCA components the model was fitted on)
gaussian_silhouette = silhouette_score(pca_data, gmm_labels)
gaussian_db_index = davies_bouldin_score(pca_data, gmm_labels)
gaussian_ch_index = calinski_harabasz_score(pca_data, gmm_labels)

print(f"Gaussian Mixture Metrics:")
print(f"Silhouette Score: {gaussian_silhouette}")
print(f"Davies-Bouldin Index: {gaussian_db_index}")
print(f"Calinski-Harabasz Index: {gaussian_ch_index}")

In [None]:
# Plotting Gaussian Mixture Clustering Results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_data.iloc[:, 0], y=pca_data.iloc[:, 1], hue=gmm_labels, palette='viridis')
plt.title('Gaussian Mixture Clustering Results')
plt.show()

### DBSCAN Clustering

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Choose eps for DBSCAN from a k-distance graph (no grid search is run here):
# the sorted distance to each point's 10th neighbour is plotted and eps is read
# off where the curve bends.
from sklearn.neighbors import NearestNeighbors

# 11 neighbours are requested because the query points are the fitted points
# themselves, so each point's nearest neighbour is normally itself; column
# index 10 is then the 10th other point.
nearest_neighbors = NearestNeighbors(n_neighbors=11)
nearest_neighbors.fit(pca_data)
distances, indices = nearest_neighbors.kneighbors(pca_data)
distances = np.sort(distances[:, 10], axis=0)

plt.figure(figsize=(10, 6))
plt.plot(distances)
plt.title('K-distance Graph for DBSCAN')
plt.xlabel('Points sorted by distance')
plt.ylabel('10th nearest neighbor distance')
plt.show()

In [None]:
# Based on the graph, let's choose an eps value where the slope changes sharply
best_eps = 1.5
best_min_samples = 10

dbscan = DBSCAN(eps=best_eps, min_samples=best_min_samples)
dbscan_labels = dbscan.fit_predict(pca_data)

# Filter out noise points (-1 labels) for evaluation
dbscan_valid_data = pca_data[dbscan_labels != -1]
dbscan_valid_labels = dbscan_labels[dbscan_labels != -1]

# Evaluation Metrics for DBSCAN
# The internal validity indices are only defined for at least two clusters and
# fewer clusters than samples; scikit-learn raises ValueError otherwise. DBSCAN
# can easily end up with a single cluster (or only noise), so guard the call and
# record NaN instead of crashing the notebook.
n_dbscan_clusters = len(set(dbscan_valid_labels))
if 2 <= n_dbscan_clusters < len(dbscan_valid_labels):
    dbscan_silhouette = silhouette_score(dbscan_valid_data, dbscan_valid_labels)
    dbscan_db_index = davies_bouldin_score(dbscan_valid_data, dbscan_valid_labels)
    dbscan_ch_index = calinski_harabasz_score(dbscan_valid_data, dbscan_valid_labels)
else:
    dbscan_silhouette = dbscan_db_index = dbscan_ch_index = float('nan')
    print(
        f"DBSCAN produced {n_dbscan_clusters} cluster(s) after removing noise; "
        "the metrics are undefined and reported as NaN."
    )


print(f"DBSCAN Clustering Metrics:")
print(f"Silhouette Score: {dbscan_silhouette}")
print(f"Davies-Bouldin Index: {dbscan_db_index}")
print(f"Calinski-Harabasz Index: {dbscan_ch_index}")


In [None]:
# Plotting DBSCAN Clustering Results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=dbscan_valid_data.iloc[:, 0], y=dbscan_valid_data.iloc[:, 1], hue=dbscan_valid_labels, palette='viridis')
plt.title('DBSCAN Clustering Results')
plt.show()

### Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering

# Spectral Clustering
n_clusters = 3  # You can specify the number of clusters you want
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', random_state=0)
spectral_labels = spectral.fit_predict(pca_data)

# Evaluation Metrics for Spectral Clustering
spectral_silhouette = silhouette_score(pca_data, spectral_labels)
spectral_db_index = davies_bouldin_score(pca_data, spectral_labels)
spectral_ch_index = calinski_harabasz_score(pca_data, spectral_labels)

print(f"Spectral Clustering Metrics:")
print(f"Silhouette Score: {spectral_silhouette}")
print(f"Davies-Bouldin Index: {spectral_db_index}")
print(f"Calinski-Harabasz Index: {spectral_ch_index}")

In [None]:
# Plotting Spectral Clustering Results
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_data.iloc[:, 0], y=pca_data.iloc[:, 1], hue=spectral_labels, palette='viridis')
plt.title('Spectral Clustering Results')
plt.show()

### Comparing the results

In [None]:
import pandas as pd
from tabulate import tabulate

# Metrics gathered from the clustering cells above, one entry per algorithm.
clustering_algorithms = ["K-Means", "Hierarchical", "DBSCAN", "Gaussian Mixture", "Spectral Clustering"]
silhouette_scores = [kmeans_silhouette, hierarchical_silhouette, dbscan_silhouette, gaussian_silhouette, spectral_silhouette]
db_indices = [kmeans_db_index, hierarchical_db_index, dbscan_db_index, gaussian_db_index, spectral_db_index]
ch_indices = [kmeans_ch_index, hierarchical_ch_index, dbscan_ch_index, gaussian_ch_index, spectral_ch_index]

# Create a dictionary for DataFrame
data = {
    "Clustering Algorithm": clustering_algorithms,
    "Silhouette Score": silhouette_scores,
    "Davies-Bouldin Index": db_indices,
    "Calinski-Harabasz Index": ch_indices
}

# Create a DataFrame
results_df = pd.DataFrame(data)


def _min_max(values):
    """Rescale a metric column to [0, 1]; a constant column maps to zeros."""
    span = values.max() - values.min()
    if not span > 0:
        return values * 0.0
    return (values - values.min()) / span


# Calculate composite score (higher is better).
# The three metrics live on very different scales (silhouette is bounded by
# [-1, 1] while Calinski-Harabasz is unbounded), so adding the raw values lets
# Calinski-Harabasz decide the ranking on its own. Each metric is min-max
# normalised first; Davies-Bouldin is inverted because lower is better.
results_df['Composite Score'] = (
    _min_max(results_df['Silhouette Score'])
    + (1 - _min_max(results_df['Davies-Bouldin Index']))
    + _min_max(results_df['Calinski-Harabasz Index'])
)

# Print the table using tabulate
table = tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False)
print(table)

# Determine the best clustering algorithm based on Composite Score
# (idxmax skips rows whose score is NaN)
best_clustering_algorithm = results_df.loc[results_df['Composite Score'].idxmax(), 'Clustering Algorithm']
print("\nBest Clustering Technique based on Composite Score:")
print(best_clustering_algorithm)

## K-Means clustering is the best suited for our data

# Profiling and Analysis Of Clusters using K-means

In [None]:
# This section profiles the K-Means solution, so attach the K-Means labels.
# `labels` was last assigned by the agglomerative model, which silently put
# hierarchical clusters under the K-Means heading.
data_cleaned['Clusters']=kmeans_labels

In [None]:
# Mapping dictionary
cluster_map = {0: 'group 1', 1: 'group 2', 2: 'group 3'}

# Applying map function
data_cleaned['Cluster_group'] = data_cleaned['Clusters'].map(cluster_map)

In [None]:
data_cleaned.head(2)

In [None]:
from sklearn.cluster import KMeans

# Perform K-Means clustering
# Fit on the PCA components, the same input used when the number of clusters
# was chosen and the model was scored. Fitting on df_scaled here would profile
# a different partition from the one that was evaluated.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans_labels = kmeans.fit_predict(pca_data)

# Add cluster labels to the original DataFrame
df['KMeans_Cluster'] = kmeans_labels


In [None]:
# Calculate descriptive statistics for the K-Means clusters
# (the grouping column is KMeans_Cluster, so the heading must not say "Hierarchical")
kmeans_stats = df.groupby('KMeans_Cluster').agg(['mean', 'median', 'std'])
print("K-Means Clusters Descriptive Statistics:")
print(kmeans_stats)

In [None]:
# Function to plot feature distribution within K-Means clusters
def plot_kmeans_cluster_distribution(df, cluster_column, title):
    """Draw one boxplot per feature, grouped by cluster label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the cluster label column.
    cluster_column : str
        Name of the column with the cluster labels; used as the x-axis and
        excluded from the plotted features.
    title : str
        Prefix for each figure title.

    Returns
    -------
    None
        Every figure is shown as it is drawn.
    """
    # Exclude the cluster column by name. The old positional slice `[:-3]`
    # assumed three trailing cluster columns and therefore also skipped two
    # genuine features, because only one cluster column is added to df.
    feature_columns = [name for name in df.columns if name != cluster_column]
    for feature in feature_columns:
        plt.figure(figsize=(12, 6))
        sns.boxplot(x=cluster_column, y=feature, data=df)
        plt.title(f"{title} - {feature}")
        plt.show()

# Plot feature distribution within K-Means clusters
plot_kmeans_cluster_distribution(df, 'KMeans_Cluster', 'K-Means Clustering')

In [None]:
data_cleaned['Cluster_group'].value_counts()

In [None]:
sns.countplot(x=data_cleaned['Cluster_group'])

In [None]:
sns.countplot(x=data_cleaned['Cluster_group'],hue=data_cleaned['Education'])

In [None]:
sns.countplot(x=data_cleaned['Cluster_group'],hue=data_cleaned['Marital_Status'])

In [None]:
sns.kdeplot(data = data_cleaned, x=data_cleaned["Income"],hue=data_cleaned["Cluster_group"])

In [None]:
cluster_income_summary = data_cleaned.groupby('Cluster_group')['Income'].agg(['mean', 'median', 'std', 'min', 'max'])
print(cluster_income_summary)

In [None]:
sns.kdeplot(data = data_cleaned, x=data_cleaned["Age"],hue=data_cleaned["Cluster_group"])

In [None]:
cluster_age_summary = data_cleaned.groupby('Cluster_group')['Age'].agg(['mean', 'median', 'std', 'min', 'max'])
print(cluster_age_summary)

In [None]:
sns.countplot(x=data_cleaned['Cluster_group'],hue=data_cleaned['total_children'])

In [None]:
#Creating a feature to get a sum of spendings

# The Mnt* columns were log1p-transformed before `data_cleaned` was copied from
# df, so adding them directly sums logarithms, not amounts. Invert the transform
# first; round() removes the floating-point residue of the log/exp round trip.
spend_columns = ["MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts", "MntSweetProducts", "MntGoldProds"]
data_cleaned["Total_spend"] = np.expm1(data_cleaned[spend_columns]).sum(axis=1).round()

# (The stray empty plt.figure() and its unrelated "campaign accepted" comment were removed.)
data_cleaned.head(2)

In [None]:
sns.scatterplot(data = data_cleaned ,x=data_cleaned["Total_spend"], y=data_cleaned["Income"],hue=data_cleaned["Clusters"])

In [None]:
#Creating a feature to get total number of purchases made

# Deals, web and catalog purchase counts were log1p-transformed earlier, while
# NumStorePurchases was not. Undo the transform on those three before adding,
# so the total is a real purchase count rather than a mix of logs and counts.
logged_purchase_columns = ["NumDealsPurchases", "NumWebPurchases", "NumCatalogPurchases"]
data_cleaned["Total_purchase"] = (
    np.expm1(data_cleaned[logged_purchase_columns]).sum(axis=1).round()
    + data_cleaned["NumStorePurchases"]
)

# (The stray empty plt.figure() and its unrelated "campaign accepted" comment were removed.)
data_cleaned.head(2)

In [None]:
sns.scatterplot(data = data_cleaned ,x=data_cleaned["Total_purchase"], y=data_cleaned["Income"],hue=data_cleaned["Clusters"])

# Classification for future Clustering

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
data_cleaned

In [None]:
data_cleaned['Complain'].sum()

In [None]:
data_cleaned['Response'].sum()

In [None]:
classif_data=data_cleaned.drop(columns=['Cluster_group','Total_spend','Total_purchase','Complain','Response','Recency'])

In [None]:
classif_data

In [None]:
classif_data['Education']=LE.fit_transform(classif_data['Education'])
classif_data['Marital_Status']=LE.fit_transform(classif_data['Marital_Status'])

In [None]:
classif_scaled=pd.DataFrame(scaler.fit_transform(classif_data),columns=classif_data.columns)

In [None]:
classif_scaled

In [None]:
# Standardise the predictors only; the target must stay as raw cluster ids.
# The models below were previously trained on the unscaled frame, although
# distance- and margin-based learners (KNN, SVM, logistic regression) are
# dominated by large-valued columns such as Income when features are not scaled.
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# wrap scaler + model in a Pipeline if a strictly leak-free evaluation is needed.
feature_frame = classif_data.drop('Clusters', axis=1)
X = pd.DataFrame(
    scaler.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,
)
y=classif_data['Clusters']

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

## Model Building

In [None]:
#importing libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.metrics import  accuracy_score,confusion_matrix,classification_report

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Training the model
# The `multi_class` argument of LogisticRegression is deprecated in recent
# scikit-learn releases; the documented replacement for a one-vs-rest fit is to
# wrap the estimator in OneVsRestClassifier, which still exposes `classes_`,
# `predict` and `predict_proba`.
logreg = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000))
logreg.fit(X_train, y_train)

# Making predictions
y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)

# Evaluating the model (macro averages weight every cluster equally)
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_precision = precision_score(y_test, y_pred, average='macro')
logreg_recall = recall_score(y_test, y_pred, average='macro')
logreg_f1 = f1_score(y_test, y_pred, average='macro')
logreg_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
logreg_logloss = log_loss(y_test, y_pred_proba)
logreg_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {logreg_accuracy}')
print(f'Precision: {logreg_precision}')
print(f'Recall: {logreg_recall}')
print(f'F1 Score: {logreg_f1}')
print(f'ROC-AUC Score: {logreg_roc_auc}')
print(f'Log Loss: {logreg_logloss}')
print(f'Confusion Matrix:\n {logreg_conf_matrix}')

In [None]:
# Plotting the ROC Curve for multiclass
fpr = {}
tpr = {}
roc_auc = {}
n_classes = len(logreg.classes_)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_proba[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting all ROC curves
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2, label=f'Class {i} (area = {roc_auc[i]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


# Training the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Making predictions
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)

# Evaluating the model
knn_accuracy = accuracy_score(y_test, y_pred)
knn_precision = precision_score(y_test, y_pred, average='macro')
knn_recall = recall_score(y_test, y_pred, average='macro')
knn_f1 = f1_score(y_test, y_pred, average='macro')
knn_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
knn_logloss = log_loss(y_test, y_pred_proba)
knn_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {knn_accuracy}')
print(f'Precision: {knn_precision}')
print(f'Recall: {knn_recall}')
print(f'F1 Score: {knn_f1}')
print(f'ROC-AUC Score: {knn_roc_auc}')
print(f'Log Loss: {knn_logloss}')
print(f'Confusion Matrix:\n {knn_conf_matrix}')

In [None]:
# Plotting the ROC Curve for multiclass
fpr = {}
tpr = {}
roc_auc = {}
n_classes = len(knn.classes_)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_proba[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting all ROC curves
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2, label=f'Class {i} (area = {roc_auc[i]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Training the Decision Tree model
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

# Making predictions
y_pred = dtc.predict(X_test)
y_pred_proba = dtc.predict_proba(X_test)

# Evaluating the model
dtc_accuracy = accuracy_score(y_test, y_pred)
dtc_precision = precision_score(y_test, y_pred, average='macro')
dtc_recall = recall_score(y_test, y_pred, average='macro')
dtc_f1 = f1_score(y_test, y_pred, average='macro')
dtc_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
dtc_logloss = log_loss(y_test, y_pred_proba)
dtc_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {dtc_accuracy}')
print(f'Precision: {dtc_precision}')
print(f'Recall: {dtc_recall}')
print(f'F1 Score: {dtc_f1}')
print(f'ROC-AUC Score: {dtc_roc_auc}')
print(f'Log Loss: {dtc_logloss}')
print(f'Confusion Matrix:\n {dtc_conf_matrix}')

In [None]:
# Plotting the ROC Curve for multiclass
fpr = {}
tpr = {}
roc_auc = {}
n_classes = len(dtc.classes_)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_proba[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting all ROC curves
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2, label=f'Class {i} (area = {roc_auc[i]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Training the Random Forest model
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# Making predictions
y_pred = rfc.predict(X_test)
y_pred_proba = rfc.predict_proba(X_test)

# Evaluating the model
rfc_accuracy = accuracy_score(y_test, y_pred)
rfc_precision = precision_score(y_test, y_pred, average='macro')
rfc_recall = recall_score(y_test, y_pred, average='macro')
rfc_f1 = f1_score(y_test, y_pred, average='macro')
rfc_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
rfc_logloss = log_loss(y_test, y_pred_proba)
rfc_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {rfc_accuracy}')
print(f'Precision: {rfc_precision}')
print(f'Recall: {rfc_recall}')
print(f'F1 Score: {rfc_f1}')
print(f'ROC-AUC Score: {rfc_roc_auc}')
print(f'Log Loss: {rfc_logloss}')
print(f'Confusion Matrix:\n {rfc_conf_matrix}')

In [None]:

# Plotting the ROC Curve for multiclass
fpr = {}
tpr = {}
roc_auc = {}
n_classes = len(rfc.classes_)

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_proba[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting all ROC curves
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2, label=f'Class {i} (area = {roc_auc[i]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

# Binarize the output (the one-hot matrix is kept for the per-class ROC curves)
y_bin = label_binarize(y, classes=[0, 1, 2])  # Adjust classes as necessary
n_classes = y_bin.shape[1]

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.3, random_state=42)

# Recover the single class id per row. Fitting on the one-hot matrix made
# OneVsRestClassifier treat the task as multilabel: a row could be predicted as
# no class or as several classes, accuracy became exact-match accuracy, and
# argmax mapped all-zero predictions to class 0 in the confusion matrix.
# Training on class ids matches how the Gradient Boosting cell handles it.
y_train_labels = y_train.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

# Training the SVM model with One-vs-Rest strategy
svm = OneVsRestClassifier(SVC(probability=True, random_state=42))
svm.fit(X_train, y_train_labels)

# Making predictions (y_pred holds class ids, y_pred_proba one column per class)
y_pred = svm.predict(X_test)
y_pred_proba = svm.predict_proba(X_test)

# Evaluating the model
svm_accuracy = accuracy_score(y_test_labels, y_pred)
svm_precision = precision_score(y_test_labels, y_pred, average='macro')
svm_recall = recall_score(y_test_labels, y_pred, average='macro')
svm_f1 = f1_score(y_test_labels, y_pred, average='macro')
svm_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
svm_logloss = log_loss(y_test_labels, y_pred_proba, labels=svm.classes_)
svm_conf_matrix = confusion_matrix(y_test_labels, y_pred)

print(f'Accuracy: {svm_accuracy}')
print(f'Precision: {svm_precision}')
print(f'Recall: {svm_recall}')
print(f'F1 Score: {svm_f1}')
print(f'ROC-AUC Score: {svm_roc_auc}')
print(f'Log Loss: {svm_logloss}')
print(f'Confusion Matrix:\n {svm_conf_matrix}')

In [None]:
# Plotting the ROC Curve for multiclass
fpr = {}
tpr = {}
roc_auc = {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting all ROC curves
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2, label=f'Class {i} (area = {roc_auc[i]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

### Gradient Boosting Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Indicator-encode the cluster labels; the per-class columns feed the
# ROC/AUC and log-loss computations below.
y_bin = label_binarize(y, classes=[0, 1, 2])  # Adjust classes as necessary
n_classes = y_bin.shape[1]

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y_bin, test_size=0.3, random_state=42
)

# The classifier is fitted on integer labels, recovered from the indicator rows
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train.argmax(axis=1))

# Hard predictions and class probabilities for the held-out rows
y_pred, y_pred_proba = gb.predict(X_test), gb.predict_proba(X_test)

# Label-based metrics compare integer labels; probability-based ones use y_test as-is
true_labels = y_test.argmax(axis=1)
gb_accuracy = accuracy_score(true_labels, y_pred)
gb_precision, gb_recall, gb_f1 = (
    metric(true_labels, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
gb_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
gb_logloss = log_loss(y_test, y_pred_proba)
gb_conf_matrix = confusion_matrix(true_labels, y_pred)

for metric_name, metric_value in (
    ('Accuracy', gb_accuracy),
    ('Precision', gb_precision),
    ('Recall', gb_recall),
    ('F1 Score', gb_f1),
    ('ROC-AUC Score', gb_roc_auc),
    ('Log Loss', gb_logloss),
):
    print(f'{metric_name}: {metric_value}')
print(f'Confusion Matrix:\n {gb_conf_matrix}')

In [None]:
# Per-class ROC curves for the model evaluated in the previous cell
fpr = {}
tpr = {}
roc_auc = {}

class_ids = range(n_classes)
for cls in class_ids:
    fpr[cls], tpr[cls], _ = roc_curve(y_test[:, cls], y_pred_proba[:, cls])
for cls in class_ids:
    # Area under each curve, shown in the legend
    roc_auc[cls] = auc(fpr[cls], tpr[cls])

# One figure, one line per class, plus the dashed chance line
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for cls in class_ids:
    plt.plot(
        fpr[cls],
        tpr[cls],
        color=colors[cls],
        lw=2,
        label=f'Class {cls} (area = {roc_auc[cls]:0.2f})',
    )
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Indicator matrix of the cluster labels (one column per class)
y_bin = label_binarize(y, classes=[0, 1, 2])  # Adjust classes as necessary
n_classes = y_bin.shape[1]

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_bin, test_size=0.3, random_state=42
)

# Gaussian Naive Bayes is fitted on integer labels taken from the indicator rows
nb = GaussianNB()
nb.fit(X_train, y_train.argmax(axis=1))

# Predicted labels and class probabilities on the test rows
y_pred = nb.predict(X_test)
y_pred_proba = nb.predict_proba(X_test)

# Metrics: integer labels for the label-based scores, indicator matrix for the rest
expected = y_test.argmax(axis=1)
macro = {'average': 'macro'}
nb_accuracy = accuracy_score(expected, y_pred)
nb_precision = precision_score(expected, y_pred, **macro)
nb_recall = recall_score(expected, y_pred, **macro)
nb_f1 = f1_score(expected, y_pred, **macro)
nb_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
nb_logloss = log_loss(y_test, y_pred_proba)
nb_conf_matrix = confusion_matrix(expected, y_pred)

report_lines = [
    f'Accuracy: {nb_accuracy}',
    f'Precision: {nb_precision}',
    f'Recall: {nb_recall}',
    f'F1 Score: {nb_f1}',
    f'ROC-AUC Score: {nb_roc_auc}',
    f'Log Loss: {nb_logloss}',
    f'Confusion Matrix:\n {nb_conf_matrix}',
]
for line in report_lines:
    print(line)

# One-vs-rest ROC curve and AUC for every class
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred_proba[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# All curves on one figure; the gray dashed line marks chance level
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for i in range(n_classes):
    legend_text = f'Class {i} (area = {roc_auc[i]:0.2f})'
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2, label=legend_text)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, log_loss
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize the output for multiclass ROC AUC (one indicator column per cluster)
y_bin = label_binarize(y, classes=[0, 1, 2])  # Adjust classes as necessary
n_classes = y_bin.shape[1]

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y_bin, test_size=0.3, random_state=42)

# Integer class labels recovered from the indicator rows; the classifier and the
# label-based metrics work on these.
y_train_labels = y_train.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

# Training the XGBoost Classifier.
# The target has three classes, so the evaluation metric must be the multiclass
# log loss ('mlogloss'); 'logloss' is the binary-classification metric.
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train_labels)  # Training on the original multiclass labels

# Making predictions
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)

# Evaluating the model
xgb_accuracy = accuracy_score(y_test_labels, y_pred)
xgb_precision = precision_score(y_test_labels, y_pred, average='macro')
xgb_recall = recall_score(y_test_labels, y_pred, average='macro')
xgb_f1 = f1_score(y_test_labels, y_pred, average='macro')
xgb_roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
xgb_logloss = log_loss(y_test, y_pred_proba)
xgb_conf_matrix = confusion_matrix(y_test_labels, y_pred)

print(f'Accuracy: {xgb_accuracy}')
print(f'Precision: {xgb_precision}')
print(f'Recall: {xgb_recall}')
print(f'F1 Score: {xgb_f1}')
print(f'ROC-AUC Score: {xgb_roc_auc}')
print(f'Log Loss: {xgb_logloss}')
print(f'Confusion Matrix:\n {xgb_conf_matrix}')

In [None]:
# ROC curve and AUC per class, computed one-vs-rest from the probability columns
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    truth_column = y_test[:, class_idx]
    score_column = y_pred_proba[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(truth_column, score_column)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Overlay the class curves and the chance diagonal
plt.figure(figsize=(12, 8))
colors = ['blue', 'green', 'red']
for class_idx in range(n_classes):
    area = roc_auc[class_idx]
    plt.plot(fpr[class_idx], tpr[class_idx], color=colors[class_idx], lw=2,
             label=f'Class {class_idx} (area = {area:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

## Comparison Table

In [None]:
import pandas as pd
from tabulate import tabulate

# Collect the metrics computed in the model cells above into one table.
# Every list follows the order of `models`.
#
# The metric lists deliberately do NOT reuse the names f1_score, log_loss,
# confusion_matrix or roc_auc: those names are sklearn functions (and the ROC
# dict) used by the model cells, and rebinding them to lists would break any
# cell that is re-run afterwards without re-importing.
models = ["Logistic Regression", "Decision Tree", "Random Forest", "Gradient Boosting", "XGBoost",
          "KNN", "SVM", "Naive Bayes"]
accuracy = [logreg_accuracy, dtc_accuracy, rfc_accuracy, gb_accuracy, xgb_accuracy,
            knn_accuracy, svm_accuracy, nb_accuracy]
precision = [logreg_precision, dtc_precision, rfc_precision, gb_precision, xgb_precision,
             knn_precision, svm_precision, nb_precision]
recall = [logreg_recall, dtc_recall, rfc_recall, gb_recall, xgb_recall,
          knn_recall, svm_recall, nb_recall]
f1_values = [logreg_f1, dtc_f1, rfc_f1, gb_f1, xgb_f1,
             knn_f1, svm_f1, nb_f1]
roc_auc_values = [logreg_roc_auc, dtc_roc_auc, rfc_roc_auc, gb_roc_auc, xgb_roc_auc,
                  knn_roc_auc, svm_roc_auc, nb_roc_auc]
log_loss_values = [logreg_logloss, dtc_logloss, rfc_logloss, gb_logloss, xgb_logloss,
                   knn_logloss, svm_logloss, nb_logloss]
conf_matrices = [logreg_conf_matrix, dtc_conf_matrix, rfc_conf_matrix, gb_conf_matrix, xgb_conf_matrix,
                 knn_conf_matrix, svm_conf_matrix, nb_conf_matrix]

# Create the dictionary for DataFrame (column label -> per-model values)
data = {
    "Model": models,
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1_values,
    "ROC AUC": roc_auc_values,
    "Log Loss": log_loss_values,
    "Confusion Matrix": conf_matrices
}

# Create the DataFrame; later cells pick the best model from results_df
results_df = pd.DataFrame(data)

# Render the table as text using tabulate
table = tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False)

# Print the formatted table
print(table)

In [None]:
# Keep every row whose accuracy equals the top accuracy (ties are all retained)
top_accuracy = results_df['Accuracy'].max()
is_top = results_df['Accuracy'] == top_accuracy
best_models_accuracy = results_df[is_top]

print("\nModels with the highest Accuracy:", best_models_accuracy, sep="\n")

### Since two models tie for the best accuracy, we need to compare their other metrics too.

In [None]:
# Print the remaining metrics for each of the top-accuracy models
detail_columns = ('Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Log Loss', 'Confusion Matrix')
for _, row in best_models_accuracy.iterrows():
    print(f"\nPerformance metrics for {row['Model']}:")
    for column in detail_columns:
        print(f"{column}: {row[column]}")

# Among the accuracy leaders, record the index label of the row with the
# highest F1 score and of the row with the lowest log loss
best_model_f1 = best_models_accuracy['F1 Score'].idxmax()
best_model_log_loss = best_models_accuracy['Log Loss'].idxmin()

In [None]:
# Show the full results row of the accuracy leader with the highest F1 score
print("\nBest model based on F1 Score:", best_models_accuracy.loc[best_model_f1], sep="\n")

In [None]:
# Show the full results row of the accuracy leader with the lowest log loss
print("\nBest model based on Log Loss:", best_models_accuracy.loc[best_model_log_loss], sep="\n")

## Therefore, Random Forest performs best when compared to XGBoost

# Model Deployment

In [None]:
!pip install streamlit -q

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import scipy.cluster.hierarchy as shc

# Setting display options
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

# Reading the data
df = pd.read_excel("marketing_campaign.xlsx")

# Basic data inspection
print(df.describe())
# info() writes its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
df.info()
print(df.isnull().sum())

# Dropping rows with missing values, then confirming none remain
df = df.dropna()
print(df.isnull().sum())
print(df.shape)
print(df.duplicated().sum())

# Feature engineering as a single pipeline:
#   * drop the ID, the constant Z_* columns and the enrolment date
#   * derive Age from Year_Birth, using 2015 as the reference year
#   * collapse Education into three levels and Marital_Status into two
#   * merge Kidhome and Teenhome into total_children
education_groups = {"Basic": "Undergraduate", "2n Cycle": "Undergraduate",
                    "Graduation": "Graduate", "Master": "Postgraduate",
                    "PhD": "Postgraduate"}
marital_groups = {"Together": "Married", "Divorced": "Single",
                  "Widow": "Single", "Alone": "Single",
                  "Absurd": "Single", "YOLO": "Single"}

df = (
    df.drop(columns=['ID'])
      .assign(Age=lambda frame: 2015 - frame['Year_Birth'])
      .drop(columns=['Year_Birth'])
      .assign(
          Education=lambda frame: frame['Education'].replace(education_groups),
          Marital_Status=lambda frame: frame['Marital_Status'].replace(marital_groups),
      )
      .drop(columns=['Z_CostContact', 'Z_Revenue'])
      .assign(total_children=lambda frame: frame['Kidhome'] + frame['Teenhome'])
      .drop(columns=['Kidhome', 'Teenhome', 'Dt_Customer'])
)

# Snapshot the categorical and the numerical columns as separate frames
categorical_names = ['Education', 'Marital_Status', 'AcceptedCmp3', 'AcceptedCmp4',
                     'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response']
cat_columns = df[categorical_names]
num_columns = df.drop(columns=cat_columns.columns)

# Income is stored as a whole number from here on
df['Income'] = df['Income'].astype(int)

# Reduce right skew in the spend and purchase-count features with log(1 + x)
skewed_features = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                   'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
                   'NumWebPurchases', 'NumCatalogPurchases']
df[skewed_features] = df[skewed_features].apply(np.log1p)

# Turn the two text columns into integer codes (the encoder is refitted per column)
LE = LabelEncoder()
for text_column in ('Education', 'Marital_Status'):
    df[text_column] = LE.fit_transform(df[text_column])

# Standardise every column before PCA
scaler = StandardScaler()
standardised = scaler.fit_transform(df)
df_scaled = pd.DataFrame(standardised, columns=df.columns)

# PCA keeping enough components to explain 95% of the variance
pca = PCA(0.95)
components = pca.fit_transform(df_scaled)
pca_data = pd.DataFrame(components)

# Elbow method: inertia (within-cluster sum of squares) for k = 1..10
WCSS = [
    KMeans(n_clusters=k, random_state=0).fit(pca_data).inertia_
    for k in range(1, 11)
]
# (Elbow plot disabled in this script: plot range(1, 11) against WCSS to view it.)

# Silhouette method: mean silhouette score for k = 2..10
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    labels = kmeans.fit_predict(pca_data)
    score = silhouette_score(pca_data, labels)
    silhouette_scores.append(score)
# (Silhouette plot disabled in this script: plot range(2, 11) against silhouette_scores.)

# Ward linkage matrix; only the dendrogram uses it, and that plot is disabled here
# (shc.dendrogram(linked) draws it).
linked = shc.linkage(pca_data, method='ward')

# Three Ward-linkage clusters on the PCA components. The labels are computed
# before the Clusters column is attached, so the column never feeds the fit.
agg_cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels = agg_cluster.fit_predict(pca_data)
pca_data = pca_data.assign(Clusters=labels)

# (Cluster scatter plot disabled in this script: component 0 vs component 1,
# coloured by pca_data["Clusters"].)

# Profiling copy: the engineered features with the cluster label of each row
data_cleaned = df.assign(Clusters=labels)

# Various visualizations and analysis
# (Code for visualizations and analysis can be added here)

# Classification for future clustering: learn to predict the cluster label
# from the customer features so new customers can be assigned to a cluster.
feature_columns = ['Education', 'Marital_Status', 'Income', 'MntWines', 'MntFruits',
                   'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                   'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
                   'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
                   'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
                   'AcceptedCmp2', 'Age', 'total_children']

# Explicit copy so this frame is independent of data_cleaned. Education and
# Marital_Status already hold integer codes here, so they are not re-encoded.
classif_data = data_cleaned[feature_columns + ['Clusters']].copy()

X = classif_data.drop('Clusters', axis=1)
y = classif_data['Clusters']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardise the features with statistics learned from the training split only
# (the target is never scaled). The Streamlit app passes its input through the
# pickled scaler before calling model.predict, so the models must be trained on
# standardised features as well; otherwise training and serving disagree.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Classification models
from sklearn.metrics import  accuracy_score

log_reg = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=5)
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
rf = RandomForestClassifier(n_estimators=100, max_depth=11, random_state=42)
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=42)

# Fit each model on the scaled training rows and report held-out accuracy
for display_name, classifier in (
    ("Logistic Regression", log_reg),
    ("KNN", knn),
    ("Decision Tree", dt),
    ("Random Forest", rf),
    ("GBM", gbm),
):
    classifier.fit(X_train_scaled, y_train)
    y_pred = classifier.predict(X_test_scaled)
    print(f"{display_name} Accuracy:", accuracy_score(y_test, y_pred))

# Additional code for analysis and visualizations can be added based on your requirements









import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

# Persist the trained gradient-boosting model for the Streamlit app
with open('model.pkl', 'wb') as model_file:
    pickle.dump(gbm, model_file)

# Scaler fitted on the training features; the app applies it to user input
scaler = StandardScaler().fit(X_train)  # fit with your training data

# X['Education'] and X['Marital_Status'] already contain integer codes at this
# point, so encoders fitted on them could not translate category names.
# Fit on the category names instead. LabelEncoder sorts its classes, so these
# encoders reproduce the codes assigned when the columns were encoded above
# (Graduate=0, Postgraduate=1, Undergraduate=2; Married=0, Single=1).
le_education = LabelEncoder().fit(['Graduate', 'Postgraduate', 'Undergraduate'])
le_marital_status = LabelEncoder().fit(['Married', 'Single'])

# Save the scalers and encoders
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

with open('le_education.pkl', 'wb') as le_education_file:
    pickle.dump(le_education, le_education_file)

with open('le_marital_status.pkl', 'wb') as le_marital_status_file:
    pickle.dump(le_marital_status, le_marital_status_file)

In [None]:
import streamlit as st
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Page styling: CSS that requests a fixed, full-cover background image
css = """
<style>
body {
    background-image: url('download.jfif');
    background-size: cover;
    background-attachment: fixed;
}
</style>
"""

# Raw HTML is only rendered when unsafe_allow_html is enabled
st.markdown(css, unsafe_allow_html=True)


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by this project's own training step.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Artifacts written by the training step: classifier, scaler and label encoders
model = _load_pickle('model.pkl')
scaler = _load_pickle('scaler.pkl')
le_education = _load_pickle('le_education.pkl')
le_marital_status = _load_pickle('le_marital_status.pkl')

# Define mapping for dropdown options (integer code -> category name).
# The codes must match what LabelEncoder assigned during training. LabelEncoder
# numbers the classes in sorted order, so the codes follow the alphabetical
# order of the category names -- not the "natural" order of the categories.
education_mapping = {0: 'Graduate', 1: 'Postgraduate', 2: 'Undergraduate'}
marital_status_mapping = {0: 'Married', 1: 'Single'}

# Reverse mapping for label encoding (category name -> integer code fed to the model)
reverse_education_mapping = {v: k for k, v in education_mapping.items()}
reverse_marital_status_mapping = {v: k for k, v in marital_status_mapping.items()}

# Cluster information (replace with actual descriptions and recommendations).
# Keyed by the predicted cluster id; each entry carries a description and a
# recommendation that the app displays after a prediction.
cluster_info = {
    0: {
        "description": "Cluster 0: Customers have higher income and education levels, are generally married with moderate children, and prefer premium products like wines and gold.",
        "recommendation": " Focus on premium offerings, implement exclusive loyalty programs, and enhance customer service to retain their high spending and engagement levels."
    },
    1: {
        "description": "Cluster 1: Customer group comprises married customers with moderate income and education, having more children and preferring affordable products like fruits and sweets.",
        "recommendation": "Utilize value-oriented pricing, tailor family-oriented marketing, and leverage promotional strategies to appeal to their budget-conscious nature."
    },
    2: {
        "description": "Cluster 2: Comprises of younger customers with lower income and education levels, fewer children, and a preference for basic necessities and low-cost products.",
        "recommendation": "Emphasize affordability, improve digital engagement with exclusive online promotions, and streamline purchasing processes for convenience."
    }
}

# Streamlit UI
st.title("Customer Clustering Prediction")


def _count_input(label):
    """Numeric input constrained to values >= 0."""
    return st.number_input(label, min_value=0)


def _flag_input(label):
    """Numeric input constrained to the range 0..1 (campaign accepted or not)."""
    return st.number_input(label, min_value=0, max_value=1)


# All inputs live in one form, so nothing is processed until "Predict" is pressed.
# The variable names below are read by the prediction step.
with st.form("prediction_form"):
    # Categorical inputs, chosen by name
    Education = st.selectbox("Education:", options=list(education_mapping.values()))
    Marital_Status = st.selectbox("Marital Status:", options=list(marital_status_mapping.values()))

    # Income and amounts spent per product category
    Income = _count_input("Income:")
    MntWines = _count_input("MntWines:")
    MntFruits = _count_input("MntFruits:")
    MntMeatProducts = _count_input("MntMeatProducts:")
    MntFishProducts = _count_input("MntFishProducts:")
    MntSweetProducts = _count_input("MntSweetProducts:")
    MntGoldProds = _count_input("MntGoldProds:")

    # Purchase counts per channel and monthly web visits
    NumDealsPurchases = _count_input("NumDealsPurchases:")
    NumWebPurchases = _count_input("NumWebPurchases:")
    NumCatalogPurchases = _count_input("NumCatalogPurchases:")
    NumStorePurchases = _count_input("NumStorePurchases:")
    NumWebVisitsMonth = _count_input("NumWebVisitsMonth:")

    # Campaign acceptance flags
    AcceptedCmp3 = _flag_input("AcceptedCmp3:")
    AcceptedCmp4 = _flag_input("AcceptedCmp4:")
    AcceptedCmp5 = _flag_input("AcceptedCmp5:")
    AcceptedCmp1 = _flag_input("AcceptedCmp1:")
    AcceptedCmp2 = _flag_input("AcceptedCmp2:")

    # Demographics
    Age = _count_input("Age:")
    TotalChildren = _count_input("TotalChildren:")

    submitted = st.form_submit_button("Predict")

# Prediction logic

SPENDING_LABELS = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                   'MntSweetProducts', 'MntGoldProds']


def _show_figure(fig):
    """Render a Matplotlib figure in the app, then close it so figures do not pile up across reruns."""
    st.pyplot(fig)
    plt.close(fig)


def _plot_spending_pie(spending, cluster):
    """Pie chart of the entered amounts per product category for the given cluster."""
    if sum(spending) <= 0:
        # With every amount at 0 there are no proportions to draw, so skip the
        # chart instead of handing Matplotlib a zero total.
        st.info("No spending was entered, so the spending distribution chart is skipped.")
        return
    fig, ax = plt.subplots()
    ax.pie(spending, labels=SPENDING_LABELS, autopct='%1.1f%%')
    ax.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle.
    ax.set_title(f"Spending Distribution for Cluster {cluster}")
    _show_figure(fig)


def _plot_bars(categories, heights, title, ylabel=None):
    """Bar chart with one bar per category; ``ylabel`` is optional."""
    fig, ax = plt.subplots()
    sns.barplot(x=categories, y=heights, ax=ax)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    ax.set_title(title)
    _show_figure(fig)


def _plot_age(age, cluster):
    """Histogram of the entered age for the given cluster."""
    fig, ax = plt.subplots()
    sns.histplot([age], bins=10, kde=True, ax=ax)
    ax.set_title(f"Age Distribution for Cluster {cluster}")
    _show_figure(fig)


if submitted:
    # Prepare the input data in the column order the model was trained on.
    # The six spend amounts and the deals/web/catalog purchase counts were
    # log1p-transformed before training, so the same transform is applied here.
    log_scaled_inputs = np.log1p([
        MntWines, MntFruits, MntMeatProducts, MntFishProducts,
        MntSweetProducts, MntGoldProds, NumDealsPurchases, NumWebPurchases,
        NumCatalogPurchases,
    ])
    input_data = [
        reverse_education_mapping[Education],
        reverse_marital_status_mapping[Marital_Status],
        Income,
        *log_scaled_inputs,
        NumStorePurchases, NumWebVisitsMonth,
        AcceptedCmp3, AcceptedCmp4, AcceptedCmp5, AcceptedCmp1, AcceptedCmp2,
        Age, TotalChildren
    ]

    # Scaling the numerical features
    scaled_data = scaler.transform([input_data])

    # Making prediction (plain int so it works as a cluster_info key and prints cleanly)
    prediction = model.predict(scaled_data)
    predicted_cluster = int(prediction[0])

    # Display the result
    st.write(f"The predicted customer cluster is: **{predicted_cluster}**")
    st.write(f"Description: {cluster_info[predicted_cluster]['description']}")
    st.write(f"Strategic Recommendation: {cluster_info[predicted_cluster]['recommendation']}")

    # Visualize input data for the predicted cluster
    st.write("## Visualizations for the Predicted Cluster")

    # The charts show the raw values the user typed, not the transformed ones
    spending = [MntWines, MntFruits, MntMeatProducts, MntFishProducts, MntSweetProducts, MntGoldProds]

    # Visualization 1 (all clusters): Spending distribution
    _plot_spending_pie(spending, predicted_cluster)

    # Visualization 2 depends on the cluster
    if predicted_cluster == 0:
        # Income next to total spending, so the chart matches its title
        _plot_bars(
            ['Income', 'Total Spending'],
            [Income, sum(spending)],
            "Income vs Total Spending for Cluster 0",
        )
    elif predicted_cluster == 1:
        # Purchase behavior per channel
        _plot_bars(
            ['Deals Purchases', 'Web Purchases', 'Catalog Purchases', 'Store Purchases'],
            [NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases],
            "Purchase Behavior for Cluster 1",
            ylabel='Number of Purchases',
        )
    elif predicted_cluster == 2:
        # Web visits
        st.write("Number of Web Visits per Month:", NumWebVisitsMonth)
        _plot_bars(
            ['Web Visits'],
            [NumWebVisitsMonth],
            "Web Visits for Cluster 2",
            ylabel='Number of Web Visits',
        )

    # Visualization 3 (all clusters): Age Distribution
    if predicted_cluster in (0, 1, 2):
        _plot_age(Age, predicted_cluster)


In [None]:
# Quietly fetch ipv4.icanhazip.com and write the response to stdout.
# Presumably this is the host's public IPv4 address, which the localtunnel page asks for -- TODO confirm.
! wget -q -O - ipv4.icanhazip.com

In [None]:
# Start the Streamlit app from customer_segmentation.py in the background (&),
# then open a localtunnel on port 8501.
# NOTE(review): assumes customer_segmentation.py exists and Streamlit serves on 8501 -- confirm.
!streamlit run customer_segmentation.py & npx localtunnel --port 8501