# Constumer Personality Analysis

In [5]:
import numpy as np
import pandas as pd
import datetime
from datetime import date
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
#warnings.filterwarnings('ignore')

In [8]:
data = pd.read_csv('marketing_campaing.csv', header=0, sep=';')

# Data exploration

In [9]:
data.head(2)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0


In [27]:
print(f"Shape of data {data.shape}")
print(f"Info {data.info()}")

Shape of data (2240, 29)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurch

# Data preparation

In [28]:
# Adding age column
data['Age'] = 2021 - data['Year_Birth']

# Spending variable
data['Spending'] = data['MntWines']+data['MntFruits']+data['MntMeatProducts']+data['MntFishProducts']+data['MntSweetProducts']+data['MntGoldProds']

# Seniority variable
last_date = date(2021, 11, 9)
data['Seniority'] = pd.to_datetime(data['Dt_Customer'], dayfirst=True, format='%Y-%m-%d')
data['Seniority'] = pd.to_numeric(data['Seniority'].dt.date.apply(lambda x: (last_date-x)).dt.days, downcast='integer')/30

data=data.rename(columns={'NumWebPurchases':'Web', 'NumCatalogPurchases':'Catalog', 'NumStorePurchases':'Store'})
data['Marital_Status'] = data['Marital_Status'].replace({
    'Divorced':'Alone', 'Single':'Alone', 'Married':'In couple',
    'Together':'In couple', 'Absurd':'Alone', 'Widow':'Alone',
    'YOLO':'Alone'})

data['Education'] = data['Education'].replace({
    'Basic':'Undergraduate', '2n Cycle':'Undergraduate',
    'Graduation':'Postgraduate', 'Master':'Postgraduate',
    'PhD':'Postgraduate'
})

# Total children @home
data['Children'] = data['Kidhome'] + data['Teenhome']

# Household that has child
data['Has_child'] = np.where(data.Children > 0, "Has child", 'No child')

data['Children'].replace({3: "3 children",2:'2 children',1:'1 child',0:"No child"},inplace=True)

data=data.rename(columns={'MntWines': "Wines",'MntFruits':'Fruits','MntMeatProducts':'Meat','MntFishProducts':'Fish','MntSweetProducts':'Sweets','MntGoldProds':'Gold'})

data=data[['Age','Education','Marital_Status','Income','Spending','Seniority','Has_child','Children','Wines','Fruits','Meat','Fish','Sweets','Gold']]

data.head()

Unnamed: 0,Age,Education,Marital_Status,Income,Spending,Seniority,Has_child,Children,Wines,Fruits,Meat,Fish,Sweets,Gold
0,64,Postgraduate,Alone,58138.0,1617,111.766667,No child,No child,635,88,546,172,88,88
1,67,Postgraduate,Alone,46344.0,27,93.433333,Has child,2 children,11,1,6,2,1,6
2,56,Postgraduate,In couple,71613.0,776,100.066667,No child,No child,426,49,127,111,21,42
3,37,Postgraduate,In couple,26646.0,53,94.3,Has child,1 child,11,4,20,10,3,5
4,40,Postgraduate,In couple,58293.0,422,95.033333,Has child,1 child,173,43,118,46,27,15


## Further data cleaning

In [29]:
# Removing outliers and missing values in dataset

data = data.dropna(subset=['Income'])
data = data[data['Income']<600000]

# Clustering

>**Clustering of customers in the dataset as follows:**
> 
> 1. **Stars:** Old customers with high income and high spending nature.
>
> 2. **Need Attention:** New customers with below-average income and high spending nature.
>
> 3. **High Potential:** New customers with high income and high spending nature.
> 4. **Leaky Bucket:** Old customers with below-average income and a low spending nature.

In [30]:
# First normalizing data

scaler = StandardScaler()
dataset_temp = data[["Income", 'Seniority', 'Spending']]
X_std = scaler.fit_transform(dataset_temp)
X = normalize(X_std, norm='l2')

In [34]:
gm_model = GaussianMixture(n_components=4,
                      covariance_type='spherical',
                      max_iter=2000,
                      random_state=5).fit(X)
labels = gm_model.predict(X)
dataset_temp['Cluster'] = labels
dataset_temp = dataset_temp.replace({
    0:'Stars',1:'Need attention',
    2:'High potential',3:'Leaky bucket'
})

# Adding cluster classification to data
data = data.merge(dataset_temp.Cluster,
                 left_index=True, right_index=True)

pd.options.display.float_format = "{:.0f}".format
summary = data[['Income', 'Spending', 'Seniority', 'Cluster']]
summary.set_index('Cluster', inplace=True)
summary = summary.groupby('Cluster').describe().transpose()
summary.head()

Unnamed: 0,Cluster,High potential,Leaky bucket,Need attention,Stars
Income,count,460,585,642,528
Income,mean,73502,34750,37781,69542
Income,std,13748,12065,12440,12006
Income,min,49090,2447,1730,44802
Income,25%,65320,26490,28882,60880


## Visualizing data

In [37]:
PLOT = go.Figure()
for C in list(data.Cluster.unique()):
    

    PLOT.add_trace(go.Scatter3d(x = data[data.Cluster == C]['Income'],
                                y = data[data.Cluster == C]['Seniority'],
                                z = data[data.Cluster == C]['Spending'],                        
                                mode = 'markers',marker_size = 6, marker_line_width = 1,
                                name = str(C)))
PLOT.update_traces(hovertemplate='Income: %{x} <br>Seniority: %{y} <br>Spending: %{z}')

    
PLOT.update_layout(width = 800, height = 800, autosize = True, showlegend = True,
                   scene = dict(xaxis=dict(title = 'Income', titlefont_color = 'black'),
                                yaxis=dict(title = 'Seniority', titlefont_color = 'black'),
                                zaxis=dict(title = 'Spending', titlefont_color = 'black')),
                   font = dict(family = "Gilroy", color  = 'black', size = 12))



# Data prep for Customer Personality Analysis

> Preparing data for the Apriori algorithm
> Defining three segments of the customers accorgin to age, income and seniority

In [38]:
#Create Age segment
cut_labels_Age = ['Young', 'Adult', 'Mature', 'Senior']
cut_bins = [0, 30, 45, 65, 120]
data['Age_group'] = pd.cut(data['Age'], bins=cut_bins,
                           labels=cut_labels_Age)

#Create Income segment
cut_labels_Income = ['Low income', 'Low to medium income',
                     'Medium to high income', 'High income']
data['Income_group'] = pd.qcut(data['Income'], q=4,
                               labels=cut_labels_Income)

#Create Seniority segment
cut_labels_Seniority = ['New customers',
                        'Discovering customers',
                        'Experienced customers', 'Old customers']
data['Seniority_group'] = pd.qcut(data['Seniority'], q=4,
                                  labels=cut_labels_Seniority)
data=data.drop(columns=['Age','Income','Seniority'])

> Defining new segments according to spending of customers on each product which will be based on:
> 1. Non Buyer
> 2. Low Buyer
> 3. Frequent Buyer
> 4. Biggest Buyer

In [39]:
cut_labels = ['Low consumer', 'Frequent consumer', 'Biggest consumer']
data['Wines_segment'] = pd.qcut(data['Wines'][data['Wines']>0],
                                q=[0, .25, .75, 1],
                                labels=cut_labels).astype("object")

data['Fruits_segment'] = pd.qcut(data['Fruits'][data['Fruits']>0],
                                 q=[0, .25, .75, 1], 
                                 labels=cut_labels).astype("object")

data['Meat_segment'] = pd.qcut(data['Meat'][data['Meat']>0],
                               q=[0, .25, .75, 1], 
                               labels=cut_labels).astype("object")

data['Fish_segment'] = pd.qcut(data['Fish'][data['Fish']>0],
                               q=[0, .25, .75, 1], 
                               labels=cut_labels).astype("object")

data['Sweets_segment'] = pd.qcut(data['Sweets'][data['Sweets']>0],
                                 q=[0, .25, .75, 1], 
                                 labels=cut_labels).astype("object")
data['Gold_segment'] = pd.qcut(data['Gold'][data['Gold']>0],
                               q=[0, .25, .75, 1], 
                               labels=cut_labels).astype("object")
data.replace(np.nan, "Non consumer",inplace=True)
data.drop(columns=['Spending','Wines','Fruits','Meat','Fish',
                   'Sweets','Gold'],inplace=True)
data = data.astype(object)

In [41]:
data.head(3)

Unnamed: 0,Education,Marital_Status,Has_child,Children,Cluster_x,Cluster_y,Cluster,Age_group,Income_group,Seniority_group,Wines_segment,Fruits_segment,Meat_segment,Fish_segment,Sweets_segment,Gold_segment
0,Postgraduate,Alone,No child,No child,Stars,Stars,Stars,Mature,Medium to high income,Old customers,Biggest consumer,Biggest consumer,Biggest consumer,Biggest consumer,Biggest consumer,Biggest consumer
1,Postgraduate,Alone,Has child,2 children,Need attention,Need attention,Need attention,Senior,Low to medium income,New customers,Low consumer,Low consumer,Low consumer,Low consumer,Low consumer,Low consumer
2,Postgraduate,In couple,No child,No child,High potential,High potential,High potential,Mature,High income,Discovering customers,Frequent consumer,Biggest consumer,Frequent consumer,Biggest consumer,Frequent consumer,Frequent consumer


# Apriori Algorithm

>The Apriori algorithm is the simplest technique to identify the underlying relationships between different types of elements. The idea behind this algorithm is that all nonempty subsets of a frequent category must also be frequent. Here I will be using the Apriori algorithm for the task of customer personality analysis with Python. Here I will use this algorithm to identify the biggest customer of wines:

In [42]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 999)
pd.options.display.float_format = "{:.3f}".format
association=data.copy() 
df = pd.get_dummies(association)
min_support = 0.08
max_len = 10
frequent_items = apriori(df, use_colnames=True, 
                         min_support=min_support, 
                         max_len=max_len + 1)

rules = association_rules(frequent_items, 
                          metric='lift', min_threshold=1)

product='Wines'
segment='Biggest consumer'
target = '{\'%s_segment_%s\'}' %(product,segment)
results_personnal_care = rules[rules['consequents'].astype(str).str.contains(target, 
                                                                             na=False)].sort_values(by='confidence', ascending=False)
results_personnal_care.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
41009,"(Cluster_Stars, Age_group_Mature, Education_Postgraduate, Cluster_x_Stars)",(Wines_segment_Biggest consumer),0.121,0.249,0.084,0.698,2.805,0.054,2.486
57553,"(Age_group_Mature, Education_Postgraduate, Cluster_y_Stars, Cluster_Stars, Cluster_x_Stars)",(Wines_segment_Biggest consumer),0.121,0.249,0.084,0.698,2.805,0.054,2.486
17520,"(Cluster_Stars, Age_group_Mature, Education_Postgraduate)",(Wines_segment_Biggest consumer),0.121,0.249,0.084,0.698,2.805,0.054,2.486
17343,"(Age_group_Mature, Education_Postgraduate, Cluster_y_Stars)",(Wines_segment_Biggest consumer),0.121,0.249,0.084,0.698,2.805,0.054,2.486
16428,"(Age_group_Mature, Education_Postgraduate, Cluster_x_Stars)",(Wines_segment_Biggest consumer),0.121,0.249,0.084,0.698,2.805,0.054,2.486


# Conclusion

>So according to the output and overall analysis conducted on this data science project on customer personality analysis with Python, we can conclude that the biggest customers of wines are:
> 1. Customers with an average income of around 69,500K
> 2. Customers with an average total spend of approximately 1,252K
> 3. Customers registered with the company for approximately 21 months.
> 4. Customers with a graduate degree.
> 5. And customers who are also heavy consumers of meat products.