In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date,timedelta
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

Question 1&2

In [None]:
# Read the tab-separated campaign export; `data` keeps the raw rows untouched.
data = pd.read_csv('marketing_campaign.csv', sep='\t')

# Work on an independent frame that only contains fully populated rows.
df = data.dropna().copy()

# Dt_Customer arrives as day-month-year text; parse it into real timestamps.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')

# Pie chart of how often each education level occurs.
Education = df['Education'].value_counts()
Education.plot(kind='pie', title='Education', autopct='%1.1f%%')
plt.show()

# List the columns that are still stored with the generic object dtype.
column_types = df.dtypes
print(column_types[column_types == 'object'])


In [None]:
# Fold the non-standard answers 'YOLO', 'Absurd' and 'Alone' into 'Single'.
single_aliases = ['YOLO', 'Absurd', 'Alone']
is_single_alias = df['Marital_Status'].isin(single_aliases)
df.loc[is_single_alias, 'Marital_Status'] = 'Single'

# Pie chart of the marital-status distribution after the clean-up.
MaritalStatus = df['Marital_Status'].value_counts()
MaritalStatus.plot(kind='pie', title='Marital Status', autopct='%1.1f%%')
plt.show()

# Show which marital-status categories remain.
print(df['Marital_Status'].unique())

Question 3

In [None]:
# --- Customer_For ------------------------------------------------------------
# Start from today's date, step back by Recency days (days since the customer's
# last visit) and subtract the enrolment date, leaving a timedelta per customer.
# NOTE: the result depends on the day the notebook is executed.
today = pd.to_datetime(date.today())
last_visit = today - pd.to_timedelta(df['Recency'], unit='D')
df['Customer_For'] = last_visit - df['Dt_Customer']

# --- Age ---------------------------------------------------------------------
# Current calendar year minus the year of birth.
df['Age'] = date.today().year - df['Year_Birth']

# --- Spent -------------------------------------------------------------------
# Total amount spent across every product category.
# MntSweetProducts was previously left out of this sum, which under-reported
# the total; it is one of the Mnt* columns of the standard
# marketing_campaign.csv schema (verify against your copy of the file).
spend_columns = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
df['Spent'] = df[spend_columns].sum(axis=1)

# --- Children ----------------------------------------------------------------
# Kids plus teenagers living in the household.
df['Children'] = df['Kidhome'] + df['Teenhome']

# --- Household composition ---------------------------------------------------
# 'Married' and 'Together' count as two adults, every other status as one.
has_partner = df['Marital_Status'].isin(['Married', 'Together'])

# Family size = number of adults + number of children.
df['Family_Size'] = has_partner.map({True: 2, False: 1}) + df['Children']

# A customer with at least one child is flagged as a parent (1) else 0.
df['Is_Parent'] = (df['Children'] >= 1).astype('int64')

# Readable label for whether the customer lives with a partner.
df['Living_With'] = has_partner.map({True: 'Partner', False: 'Alone'})

#Create Age groups
def Age_Sort(x):
    """Return the age-band label for the age ``x``.

    The bands are tested in ascending order against inclusive upper bounds,
    so every age of 30 or less is labelled '21-30'. A value that satisfies
    none of the bounds (anything above 80, or a value such as NaN for which
    every comparison is false) is labelled '>80'.
    """
    bands = (
        (30, '21-30'),
        (40, '31-40'),
        (50, '41-50'),
        (60, '51-60'),
        (70, '61-70'),
        (80, '71-80'),
    )
    for upper_bound, label in bands:
        if x <= upper_bound:
            return label
    return '>80'
# Label every customer with the age band their Age falls into.
df['Age_Group'] = df['Age'].map(Age_Sort)

Question 4

In [None]:
#A standard z-score function
def zscore(col):
    """Return the z-score of every element of ``col``.

    Each value is centred on the mean and divided by the population standard
    deviation (``np.std`` with its default ``ddof=0``).

    Parameters
    ----------
    col : array-like of numbers (NumPy array or pandas Series).

    Returns
    -------
    Same container type as ``col - mean`` holding the z-scores. When ``col``
    has zero spread (all values equal) the result is all zeros rather than
    the NaN/inf a division by zero would give, so a downstream
    ``-3 <= z <= 3`` filter keeps those rows instead of discarding them all.
    """
    centered = col - np.mean(col)
    spread = np.std(col)
    if spread == 0:
        # Every centred value is exactly 0 here; multiply by 1.0 only to
        # return floats like the normal path does.
        return centered * 1.0
    return centered / spread
# Drop outliers: for each column in turn keep only the rows whose z-score lies
# within [-3, 3]. The filters are applied sequentially, so each z-score is
# computed on the rows that survived the previous column's filter.
outlier_columns = ['Year_Birth', 'Income', 'Spent', 'NumDealsPurchases']
z_limit = 3

for column in outlier_columns:
    z_scores = zscore(df[column])
    # .copy() makes the filtered frame independent of its parent so the column
    # assignments in later cells do not trigger SettingWithCopyWarning.
    df = df[(z_scores <= z_limit) & (z_scores >= -z_limit)].copy()

Question 5

In [None]:
# Pairwise correlations between the numeric columns, drawn as a red-blue
# heatmap. numeric_only=True is required because df still holds text,
# datetime and timedelta columns at this point; pandas >= 2.0 raises on them
# instead of silently skipping them.
corr_matrix = df.corr(numeric_only=True)
ax = sns.heatmap(corr_matrix, cmap='RdBu')
ax.set_title('Correlation heatmap')
plt.show()

Question 6

In [None]:
def _bar_by_group(frame, by, value, how, color, xlabel, ylabel, title):
    """Plot ``value`` aggregated per ``by`` group as a bar chart and show it.

    Parameters
    ----------
    frame : DataFrame containing both columns.
    by : name of the column to group on.
    value : name of the column to aggregate.
    how : aggregation handed to ``GroupBy.agg`` ('sum' or 'mean' below).
    color, xlabel, ylabel, title : cosmetic settings for the chart.

    Returns
    -------
    The aggregated one-column DataFrame that was plotted.
    """
    grouped = frame[[by, value]].groupby(by).agg(how)
    grouped.plot(kind='bar', color=color)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
    return grouped


#1st graph (6.1 Question): share of each marital status
counts = df['Marital_Status'].value_counts()  #occurrences of each marital status
counts.plot.pie(autopct='%1.1f%%')
plt.show()

#2nd graph (6.2 Question): customers with / without a complaint
counts = df['Complain'].value_counts()  #occurrences of each Complain value
labels_dict = {1 : 'Yes', 0 : 'No'}
new_labels = counts.index.map(labels_dict)

fig, ax = plt.subplots()
ax.bar(new_labels, counts.values, width=0.1)
ax.set_title('Bar Chart of complain instances')
ax.set_xlabel('Existence of complain')
ax.set_ylabel('Occurances')

#write the exact count just above each bar
for position, amount in enumerate(counts.values):
    ax.text(position, amount, str(amount), ha='center', va='bottom')

plt.show()

#3rd graph (6.3 Question): total spending per marital status
newdf = _bar_by_group(df, 'Marital_Status', 'Spent', 'sum', 'black',
                      'Marital Status', 'Spent',
                      'Total spendings per marital status')

#4th graph (6.4 Question): Spent vs Kidhome vs Family_Size in 3D
fig = plt.figure()
ax = plt.axes(projection='3d')  #3D axes on the figure created above

ax.scatter(df['Spent'], df['Kidhome'], df['Family_Size'], color='brown')
ax.set_xlabel('Spent')
ax.set_ylabel('Kidhome')
ax.set_zlabel('Family Size')
plt.show()

#5th graph (6.5 Question): total spending per age group
newdf = _bar_by_group(df, 'Age_Group', 'Spent', 'sum', 'purple',
                      'Age Groups', 'Spent',
                      'Total spendings per age group')

#6th graph (6.6 Question): income against spending, one point per customer
df.plot.scatter(x='Income', y='Spent', s=20)
plt.title('Income vs. Spent')
plt.show()  #shown on its own instead of together with the next chart

#7th graph (6.7 Question): average income per education level
newdf = _bar_by_group(df, 'Education', 'Income', 'mean', 'green',
                      'Education', 'Average Income',
                      'Average income per education level')

#8th graph (6.8 Question): average income per family size
newdf = _bar_by_group(df, 'Family_Size', 'Income', 'mean', 'orange',
                      'Family Size', 'Average Income',
                      'Average income per family size')

#9th graph (6.9 Question): average income per number of kids at home
newdf = _bar_by_group(df, 'Kidhome', 'Income', 'mean', 'black',
                      'Number of Kids at home', 'Average Income',
                      'Average income per Kidhome')

#10th graph (6.15 Question): distribution of income
plt.hist(df['Income'], color='red')
plt.title('Histogram for Income')
plt.xlabel('Amount')
plt.ylabel('Occurances')
plt.show()

Question 7

In [None]:
# Replace the text categories of these columns with integer codes (in place on
# df). After the loop `encoder` is the one fitted on 'Education'.
for column in ('Living_With', 'Marital_Status', 'Education'):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])

# Columns that are not fed to the scaler / PCA.
excluded_columns = [
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Complain',
    'Response',
    'Dt_Customer',
    'Age_Group',
    'Customer_For',
]
# NOTE(review): every other column of df is kept as a feature; if the file
# carries an identifier-like column it is scaled and fed to PCA too — confirm
# that is intended.
features = df.drop(columns=excluded_columns)

# Remember the column names: the scaler returns a bare NumPy array.
df_columns = features.columns

# Standardise the features and restore the DataFrame form.
scaler = StandardScaler()
scaled = pd.DataFrame(scaler.fit_transform(features), columns=df_columns)

# Project the standardised features onto 3 principal components.
# new_df is an (n_rows, 3) NumPy array from here on.
pca = PCA(n_components=3)
new_df = pca.fit_transform(scaled)

# 3D scatter plot of the reduced data, drawn with ONE scatter call instead of
# one call per row. The colours simply cycle r/g/b by row position and carry
# no meaning.
fig = plt.figure()
colors = ['r', 'g', 'b']
ax = fig.add_subplot(111, projection='3d')
point_colors = [colors[i % 3] for i in range(new_df.shape[0])]
ax.scatter(new_df[:, 0], new_df[:, 1], new_df[:, 2], c=point_colors)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')
plt.title('PCA Scatter Plot')
plt.show()


Question 8

ELBOW

In [None]:
# Keep an untouched copy of the PCA coordinates: new_df itself is rebuilt with
# an extra cluster-label column by the clustering cells further down.
df2 = new_df.copy()

# Elbow method: fit KMeans for k = 1..10 and record the inertia (sum of squared
# distances of the samples to their closest centre) for each k.
cluster_range = range(1, 11)
sse = []
for k in cluster_range:
    # random_state makes the curve reproducible between runs; n_init is pinned
    # because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(new_df)
    sse.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests the cluster count.
plt.plot(cluster_range, sse)
plt.title('Elbow')
plt.xlabel('Number of clusters')
plt.ylabel('Squared Distance Sum')
plt.show()


K-Means Clustering

In [None]:
# The elbow plot above points to 4 clusters as the optimum.
n_clusters = 4

# Use only the three principal-component columns as features. Slicing by
# position means the cell can be re-run after new_df has already received
# label columns without those labels leaking into the fit.
pca_coords = np.asarray(new_df)[:, :3]

# random_state makes the assignment reproducible; n_init is pinned because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(pca_coords)

# Assemble the labelled frame once: PC1..PC3 plus the KMeans cluster id.
new_df = pd.DataFrame(pca_coords, columns=['PC1', 'PC2', 'PC3'])
new_df['Cluster'] = cluster_labels

# 3D scatter plot with one colour per cluster.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

colors = ['r', 'g', 'b', 'y']
for cluster_id, color in enumerate(colors[:n_clusters]):
    cluster_data = new_df[new_df['Cluster'] == cluster_id]
    ax.scatter(cluster_data['PC1'], cluster_data['PC2'], cluster_data['PC3'], c=color)

# Axis labels and title
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.title('K-Means')
plt.show()

Agglomerative Clustering

In [None]:
# Cluster the PCA coordinates only. df2 is the untouched copy of the PCA output
# taken in the elbow cell; new_df can no longer be used as the feature matrix
# because the K-Means cell appended its 'Cluster' labels to it, and fitting on
# that frame would feed those labels in as a fourth feature.
n_clusters = 4
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
agg_labels = agg_clustering.fit_predict(df2)

# Rebuild new_df as PC1..PC3 plus the agglomerative cluster id. The K-Means
# labels are still available as kmeans.labels_.
new_df = pd.DataFrame(df2, columns=['PC1', 'PC2', 'PC3'])
new_df['Cluster'] = agg_labels

# 3D scatter plot with one colour per cluster.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

colors = ['r', 'g', 'b', 'y']
for cluster_id, color in enumerate(colors[:n_clusters]):
    members = new_df[new_df['Cluster'] == cluster_id]
    ax.scatter(members['PC1'], members['PC2'], members['PC3'], c=color)

# Title and axis labels
plt.title('Agglomerative Clustering')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.show()