In [1]:
!pip install plotly

[0m

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

import datetime


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the input directory,
# then print them one per line (nothing is printed when no file is found).
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
if input_paths:
    print("\n".join(input_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)

# Location of the marketing campaign file inside the Kaggle input directory.
PATH = "/kaggle/input/customer-personality-analysis/marketing_campaign.csv"

# Remove pandas' display limits: no cap on the number of columns or rows shown.
pd.set_option(
    "display.max_columns", None,
    "display.max_rows", None,
)



/kaggle/input/customer-personality-analysis/marketing_campaign.csv


# Introduction

**Performing an effective customer personality analysis entails a crucial step of identifying and defining the problem that requires resolution. This is usually achieved by formulating a set of questions that will serve as the basis of the investigation.**

Key to the customer personality analysis is obtaining accurate and insightful responses to questions, such as:

1. What is the customer's perception of the company's products and services?
2. What are the critical factors that impact the customer's purchasing decisions?
3. How engaged is the customer with the company's products or services?

# Dataset Overview

**The purpose of this data overview is to provide a preliminary understanding of the dataset and the key variables that will be analyzed. The dataset contains information on customer attributes, products, promotions, and places, and will be used to perform a customer personality analysis.**

In [3]:
# The file is tab-delimited; read_table uses "\t" as its default separator.
customers_data = pd.read_table(PATH)
# Preview the first rows of the loaded frame.
customers_data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [4]:
# Print each column's name, non-null count and dtype for the loaded frame.
customers_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

* The dataset consists of 2240 rows and 29 columns. 
* No column has missing values except "Income", which has 24 missing values (2216 of 2240 entries are non-null). 
* The "Education" and "Marital_Status" columns are of object type, suggesting that they contain categorical data. 
* Finally, the "Z_CostContact" and "Z_Revenue" columns appear to have constant values and may not be useful for further analysis.

In [5]:
# Customers Age might be a more useful feature than the Year of birth.
# NOTE(review): the age is measured against the current calendar year, so the
# values shift every year the notebook is re-run — consider pinning a reference year.
customers_data["Age"] = datetime.datetime.now().year - customers_data.Year_Birth

# Customers Age Group: right-closed intervals of AGE_BIN_WIDTH years, e.g. "(40, 50]".
# pd.cut turns every value outside the bin edges into NaN, which astype(str)
# would then label as the string "nan". The previous edges, range(0, 100, 10),
# stopped at 90, so every customer older than 90 lost their group. The last
# edge is therefore derived from the oldest customer, rounded up to the next
# multiple of the bin width.
AGE_BIN_WIDTH = 10
oldest_age = int(customers_data.Age.max())
last_edge = max(AGE_BIN_WIDTH, -(-oldest_age // AGE_BIN_WIDTH) * AGE_BIN_WIDTH)  # ceiling division
bins = range(0, last_edge + 1, AGE_BIN_WIDTH)  # +1 so that last_edge itself is an edge
customers_data["Age_Group"] = pd.cut(customers_data.Age, bins).astype(str)

# Descriptive Statistics

**Understanding the distribution of the data is crucial for selecting the appropriate statistical metrics to explore and describe the data.**

By examining the distribution of the data, we can determine if it follows a normal distribution or if it is skewed. This information can be used to select the appropriate summary statistics and to identify any outliers or unusual data points.

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
customers_data.describe()

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,27.062946,44.021875,2.325,4.084821,2.662054,5.790179,5.316518,0.072768,0.074554,0.072768,0.064286,0.013393,0.009375,3.0,11.0,0.149107,54.194196
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,41.280498,52.167439,1.932238,2.778714,2.923101,3.250958,2.426645,0.259813,0.262728,0.259813,0.245316,0.114976,0.096391,0.0,0.0,0.356274,11.984069
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,27.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,1.0,9.0,1.0,2.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,46.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,8.0,24.0,2.0,4.0,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,53.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,33.0,56.0,3.0,6.0,4.0,8.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,64.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,263.0,362.0,15.0,27.0,28.0,13.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0,130.0


## Age Distribution

In [7]:
# Tally the customers in each age group as a two-column frame: Age_Group, Count.
age_group_counts = (
    customers_data["Age_Group"]
    .value_counts()
    .rename_axis("Age_Group")
    .reset_index(name="Count")
)

# One bar per age group, bar height = number of customers.
fig = px.bar(
    age_group_counts,
    x="Age_Group",
    y="Count",
)
fig.show()

In [8]:
# Distribution of customer ages: histogram with a marginal box plot.
fig = px.histogram(
    customers_data["Age"],
    marginal="box",
)
fig.show()

## Income Distributions

In [9]:
# Distribution of household income: histogram (at most 100 bins) with a marginal box plot.
fig = px.histogram(
    customers_data["Income"],
    nbins=100,
    marginal="box",
)
# there are a few outliers above 120K income, so restrict the x-axis to 0-120K
fig.update_layout(xaxis={"range": [0, 120_000]})
fig.show()

Based on the histogram and skewness measures, it appears that the data is relatively symmetric and not strongly skewed in any particular direction. As a result, we can use the mean as our measure of central tendency without significant bias towards one end of the distribution.

**The central tendency (mean) of annual household income is $52,247.25**

### Income Distribution by Education Category

**This section focuses on analyzing and understanding the distribution of income across the education levels in the dataset.**

In [10]:
# Income histogram with one colour per education level.
fig = px.histogram(
    customers_data,
    x="Income",
    color="Education",
)
# Restrict the x-axis to the 0-120K income range.
fig.update_layout(xaxis={"range": [0, 120_000]})
fig.show()

In [11]:
# Income histogram split into one facet column per education level.
fig = px.histogram(
    customers_data,
    x="Income",
    facet_col="Education",
)
# Fix the axis ranges: income 0-120K on x, counts 0-100 on y.
fig.update_layout(
    xaxis={"range": [0, 120_000]},
    yaxis={"range": [0, 100]},
)
fig.show()

The analysis indicates a notable similarity in income distribution among customers with Graduation, PhD, Master, and 2nd Cycle education levels. However, it is important to acknowledge that these education levels are not equally represented in the dataset. This discrepancy may be attributable to the inherent difficulty of attaining higher degrees and educational accomplishments.