In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
import seaborn as sns

In [None]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("MySparkApp").getOrCreate()

# Read the marketing CSV from HDFS, using the first row as the header and
# letting Spark infer the column types. The reader chain is wrapped in
# parentheses so it can span several lines: a statement that continues on a
# new line starting with ".load(...)" without them is a SyntaxError.
spark_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs://localhost:9000/bigdata/marketing_strat.csv")
)
spark_df.show()

#Convert to pandas df
df = spark_df.toPandas()

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# First rows of the frame.
df.head()

In [None]:
# (rows, columns)
df.shape

In [None]:
# Column labels.
df.columns

In [None]:
# Total element count (rows * columns).
df.size

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
df=df.dropna()

In [None]:
df.rename(columns={"Response": "AcceptedCmp6"}, inplace=True)

In [None]:
df.info()

# Exploratory Analysis

In [None]:
df['Age'] = 2024-df['Year_Birth']

In [None]:
x = df.drop(['Z_CostContact','Z_Revenue'],axis = 1)

In [None]:
df["Spent"] = df["MntWines"]+ df["MntFruits"]+ df["MntMeatProducts"]+ df["MntFishProducts"]+ df["MntSweetProducts"]+ df["MntGoldProds"]

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations of the numeric columns of `x`, drawn as an annotated
# heatmap on a 16x9 figure.
corr_matrix = x.corr(numeric_only=True)
plt.figure(figsize=(16, 9))
ax = sns.heatmap(corr_matrix, annot=True, cmap='viridis')
plt.show()

# Data Visualization

In [None]:
# Pairwise relationships between the numeric features, coloured by marital
# status. pairplot only draws numeric columns, so a categorical column that is
# merely selected (and not passed as `hue`) is silently ignored.
sns.pairplot(df[['Income','Age','Recency','Spent','Marital_Status']], hue='Marital_Status')
plt.show()

In [None]:
sns.countplot(x = 'Education' , data = df)
plt.show()

In [None]:
sns.countplot( x = 'Marital_Status',data = df)
plt.show()

In [None]:
obj = ['Education','Marital_Status']

In [None]:
for i in obj:
    data=df.copy()
    data.groupby(i)['Age'].mean().plot.bar()
    plt.xlabel(i)
    plt.ylabel('Age')
    plt.title(i)
    plt.show()

In [None]:
df_copy = df.copy()

In [None]:
for i in range(len(obj)):
    x='Marital_Status'
    for j in range(1):
        if obj[i] != x:
            sns.barplot(x= x,y='Age',hue=obj[i],data=df_copy)
            sns.set(rc={'figure.figsize':(11,12)})
            plt.show()

In [None]:
for i in range(len(obj)):
    x='Marital_Status'
    for j in range(1):
        if obj[i] != x:
            sns.barplot(x= x,y='Income',hue=obj[i],data=df_copy)
            sns.set(rc={'figure.figsize':(11,12)})
            plt.show()

In [None]:
# One horizontal box plot per column that remains after dropping Education,
# Marital_Status and Dt_Customer.
y = df.drop(columns=['Education', 'Marital_Status', 'Dt_Customer'])
for column_name in y.columns:
    sns.boxplot(data=y, x=column_name, color='yellowgreen')
    plt.xlabel(column_name)
    plt.show()

In [None]:
df['Accepted'] = df['AcceptedCmp1'] + df['AcceptedCmp2'] + df['AcceptedCmp3'] + df['AcceptedCmp4'] + df['AcceptedCmp5'] + df['AcceptedCmp6']

In [None]:
df['Education'].value_counts()

In [None]:
df["Education"]=df["Education"].replace({"2n Cycle":"Undergraduate", "Graduation":"Graduate", "Master":"Postgraduate", "PhD":"Postgraduate"})

In [None]:
df['Marital_Status'].value_counts()

In [None]:
df['Num_Children'] = df['Kidhome'] + df['Teenhome']

In [None]:
df["Marital_Status"]=df["Marital_Status"].replace({"Married":"Partner", "Together":"Partner", "Absurd":"Single", "Widow":"Single", "YOLO":"Single", "Divorced":"Single", "Single":"Single","Alone":"Single"})

In [None]:
def categorize_age(age, senior_threshold=59):
    """Label an age as 'Adults' or 'Seniors'.

    Parameters
    ----------
    age : number
        Age to classify.
    senior_threshold : number, optional
        Highest age still labelled 'Adults'. Defaults to 59, the cut-off that
        used to be hard-coded, so existing single-argument calls are unchanged.

    Returns
    -------
    str
        'Adults' when ``age <= senior_threshold``, otherwise 'Seniors'.
        A NaN age compares false and therefore yields 'Seniors'.
    """
    return 'Adults' if age <= senior_threshold else 'Seniors'
# Label every age and store the result in the new 'Age_Group' column.
df['Age_Group'] = df['Age'].map(categorize_age)

In [None]:
ndf = df

In [None]:
mStratCols = ['Dt_Customer','AcceptedCmp1','AcceptedCmp2','AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Complain', 'AcceptedCmp6','ID','Kidhome','Teenhome','Age','Year_Birth']

In [None]:
ndf = df.drop(mStratCols,axis =1)

In [None]:
# Schema of the reduced frame (dtype and non-null count per column).
# NOTE(review): this same call is repeated in several consecutive cells with
# no change to `ndf` in between.
ndf.info()

In [None]:
ndf.info()

In [None]:
ndf.info()

In [None]:
# Row count per age group.
ndf['Age_Group'].value_counts()

In [None]:
ndf.info()

In [None]:
#Total spendings on various items
ndf["Spent"] = ndf["MntWines"]+ ndf["MntFruits"]+ ndf["MntMeatProducts"]+ ndf["MntFishProducts"]+ ndf["MntSweetProducts"]+ ndf["MntGoldProds"]

In [None]:
ndf = ndf.dropna()

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

# Encoding

In [None]:
##AGE
# Integer code for each age-group label.
age_mapping = dict(Adults=1, Seniors=2)

# Replace every 'Age_Group' label with its integer code.
ndf = ndf.assign(Age_Group=ndf['Age_Group'].map(age_mapping))

In [None]:
##Education
# Create a mapping from category to integer
edu_mapping = {'Basic': 1, 'Graduate': 2,'Undergraduate': 3, 'Postgraduate': 4 }

# Apply integer encoding to the 'Category' column
ndf['Education'] = ndf['Education'].map(edu_mapping)

In [None]:
##Marital status
# Integer code for each marital-status label.
marital_mapping = dict(Partner=1, Single=2)

# Replace every 'Marital_Status' label with its integer code.
ndf = ndf.assign(Marital_Status=ndf['Marital_Status'].map(marital_mapping))

# Feature Scaling

In [None]:
# Fit a StandardScaler on ndf, then rebuild a DataFrame from the transformed
# array with the same column names (the new frame gets a default index).
scaler = StandardScaler().fit(ndf)
scaled_values = scaler.transform(ndf)
scaled_ndf = pd.DataFrame(scaled_values, columns=ndf.columns)

# Dimensionality Reduction

In [None]:
# Fit a 3-component PCA on the scaled features and project them onto it.
pca = PCA(n_components=3).fit(scaled_ndf)
component_names = ["col1", "col2", "col3"]
PCA_ndf = pd.DataFrame(pca.transform(scaled_ndf), columns=component_names)
# Summary statistics, one row per component.
PCA_ndf.describe().T

In [None]:
# Pull the three PCA columns out as separate series for plotting.
x, y, z = (PCA_ndf[name] for name in ("col1", "col2", "col3"))

In [None]:
# Elbow method on the PCA-reduced data. KMeans gets a fixed random_state so
# the elbow curve is reproducible across kernel restarts; unseeded, the
# centroid initialisation differs on every run.
print('Optimal number of cluster with Elbow Method:')
Elbow_M = KElbowVisualizer(KMeans(random_state=10), k=10)
Elbow_M.fit(PCA_ndf)
Elbow_M.show()

# Gaussian Mixture Model

In [None]:
import random

In [None]:
# NOTE(review): this seeds Python's `random` module only; the mixture model
# below receives its own random_state.
random.seed(1234)
from sklearn.mixture import GaussianMixture

# Gaussian mixture model with four components.
GMM = GaussianMixture(n_components=4, random_state=10)

# Fit the model and predict a cluster label for every row of the PCA frame.
GP_df = GMM.fit_predict(PCA_ndf)

# Store the label array on both frames (assigned by position).
for frame in (ndf, df):
    frame["Clusters"] = GP_df

In [None]:
#Plotting the clusters
# 3-D scatter of the three PCA components, coloured by cluster label.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d', label="bla")
ax.scatter(x, y, z, c=df["Clusters"], s=40, marker='o', cmap='viridis')
ax.set_title("The Plot Of The Clusters")
plt.show()

In [None]:
pal = ["#feda75","#fa7e1e", "#d62976","#962fbf"]
pl = sns.countplot(x=df["Clusters"], palette= pal)
pl.set_title("Distribution Of The Clusters")
plt.show()

In [None]:
pl = sns.scatterplot(data = ndf,x=ndf["Spent"], y=ndf["Income"],hue=ndf["Clusters"],palette = pal)
pl.set_title("Cluster's Profile Based On Income And Spending")
plt.legend()
plt.show()

In [None]:
plt.figure()
pl=sns.swarmplot(x=df["Clusters"], y=df["Spent"], color= '#CBEDDD', alpha=0.5 )
pl=sns.boxenplot(x=df["Clusters"], y=df["Spent"])
plt.show()

In [None]:
plt.figure(figsize=(10,5))
plt.title(f'Income')
axes = sns.boxplot(data=df, x='Clusters', y='Income', palette=sns.color_palette("Set1"), showfliers=False)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Requires `df` (with a "Clusters" column) and `pal` from the earlier cells.

# List of campaign columns
campaign_columns = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp6']

# Fixed hue order so every panel maps the same cluster to the same colour,
# even when a cluster has no acceptances for a given campaign.
cluster_order = sorted(df["Clusters"].unique())

# Create a 3 x 2 grid of subplots, one panel per campaign
fig, axes = plt.subplots(3, 2, figsize=(12, 16))
axes = axes.flatten()

# For each campaign, count per cluster the rows that accepted it (flag == 1).
# `hue` is given as a column name so it is read from the same filtered frame
# as `x`, instead of mixing the subset with a Series from the full frame.
for i, campaign_col in enumerate(campaign_columns):
    subset_df = df[df[campaign_col] == 1]
    ax = sns.countplot(
        x=campaign_col,
        hue="Clusters",
        hue_order=cluster_order,
        palette=pal,
        data=subset_df,
        ax=axes[i],
    )
    ax.set_title(f"Count Of Promotion Accepted - Campaign {i+1}")
    ax.set_xlabel("Number Of Total Accepted Promotions")

# Adjust layout for better spacing
plt.tight_layout()

# Show the combined plot
plt.show()

In [None]:
#Plotting the number of deals purchased
plt.figure()
pl=sns.boxenplot(y=df["NumDealsPurchases"],x=df["Clusters"], palette= pal)
pl.set_title("Number of Deals Purchased")
plt.show()

In [None]:
plt.figure(figsize=(16,5))
plt.title(f'Education Obtained')
sns.countplot(data=df, x='Education', hue='Clusters')
plt.show()

In [None]:
purchases = df['NumWebPurchases']+df['NumCatalogPurchases']+df['NumStorePurchases']

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# Channel purchase counters: every "Num...Purchases" column except the
# Deals / Total ones. The negative lookahead states that exclusion explicitly;
# the former "[^Deals|Total]" was a character class that rejected only one
# character and excluded NumDealsPurchases by coincidence.
channel_pattern = r'^Num(?!Deals|Total).+Purchases$'
labels = []

# One pie per cluster id 0..3, in row-major panel order. The id comes from the
# panel position, so an empty cluster leaves its own panel blank instead of
# stalling the counter and repeating the same query for the remaining panels.
for k, panel_ax in enumerate(axes.flat):
    cluster_data = df.query(f'Clusters == {k}')
    if cluster_data.empty:
        continue

    # Total purchases per channel inside this cluster.
    num = cluster_data.filter(regex=channel_pattern).sum(axis=0)
    labels = list(num.index)
    panel_ax.set_title(f"Clusters {k}")
    # Pass an array rather than a dict view: NumPy does not treat dict views
    # as sequences when converting them.
    panel_ax.pie(num.to_numpy(), labels=labels, autopct="%.1f%%", pctdistance=0.85)

fig.suptitle('Shopping Source Proportions')
# Only add the legend when at least one pie was drawn.
if labels:
    fig.legend(title="Shopping Source", labels=labels, bbox_to_anchor=(1, 1))
plt.show()

In [None]:
# Row count per age group, split by cluster.
plt.figure(figsize=(16, 5))
plt.title('Age Group Obtained')
sns.countplot(x='Age_Group', hue='Clusters', data=df)
plt.show()

In [None]:
Personal = ["Num_Children" , "Education"]

# KDE joint distribution of each personal attribute against total spend,
# coloured by cluster. jointplot is figure-level and creates its own figure,
# so no plt.figure() call precedes it (that only produced an extra, empty
# figure on every iteration).
for col in Personal:
    sns.jointplot(x=ndf[col], y=ndf["Spent"], hue=ndf["Clusters"], kind="kde", palette=pal)
    plt.show()

In [None]:
plt.figure(figsize=(16,5))
plt.title(f'Marital Status Obtained')
sns.countplot(data=df, x='Marital_Status', hue='Clusters')
plt.show()

In [None]:
plt.figure(figsize=(16,5))
plt.title(f'Number of Children Obtained')
sns.countplot(data=df, x='Num_Children', hue='Clusters')
plt.show()