# I. Importing libraries
##### In this phase, essential libraries are imported to the project. Libraries provide pre-built functionalities that help streamline the development process.

In [2]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.preprocessing import RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from yellowbrick.classifier import confusion_matrix
from sklearn.decomposition import PCA
from xgboost import XGBClassifier


# II. Dataset download
##### This step involves acquiring the dataset required for analysis. Whether it's from a web source, database, or local file, downloading the dataset is crucial for subsequent stages.

In [3]:
data = pd.read_csv('marketing-data/ifood_df.csv')
data.head()


Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,marital_Together,marital_Widow,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,MntTotal,MntRegularProds,AcceptedCmpOverall
0,58138.0,0,0,58,635,88,546,172,88,88,...,0,0,0,0,1,0,0,1529,1441,0
1,46344.0,1,1,38,11,1,6,2,1,6,...,0,0,0,0,1,0,0,21,15,0
2,71613.0,0,0,26,426,49,127,111,21,42,...,1,0,0,0,1,0,0,734,692,0
3,26646.0,1,0,26,11,4,20,10,3,5,...,1,0,0,0,1,0,0,48,43,0
4,58293.0,1,0,94,173,43,118,46,27,15,...,0,0,0,0,0,0,1,407,392,0


In [4]:
# Dataset shape
data.shape


(2205, 39)

In [5]:
# Meta data
data.describe()


Unnamed: 0,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,...,marital_Together,marital_Widow,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,MntTotal,MntRegularProds,AcceptedCmpOverall
count,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,...,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0
mean,51622.094785,0.442177,0.506576,49.00907,306.164626,26.403175,165.312018,37.756463,27.128345,44.057143,...,0.257596,0.034467,0.089796,0.02449,0.504762,0.165079,0.215873,562.764626,518.707483,0.29932
std,20713.063826,0.537132,0.54438,28.932111,337.493839,39.784484,217.784507,54.824635,41.130468,51.736211,...,0.43741,0.182467,0.285954,0.154599,0.500091,0.371336,0.41152,575.936911,553.847248,0.68044
min,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,-283.0,0.0
25%,35196.0,0.0,0.0,24.0,24.0,2.0,16.0,3.0,1.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,42.0,0.0
50%,51287.0,0.0,0.0,49.0,178.0,8.0,68.0,12.0,8.0,25.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,343.0,288.0,0.0
75%,68281.0,1.0,1.0,74.0,507.0,33.0,232.0,50.0,34.0,56.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,964.0,884.0,0.0
max,113734.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,262.0,321.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2491.0,2458.0,4.0


In [6]:
# Columns
data.columns


Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Customer_Days', 'marital_Divorced', 'marital_Married',
       'marital_Single', 'marital_Together', 'marital_Widow',
       'education_2n Cycle', 'education_Basic', 'education_Graduation',
       'education_Master', 'education_PhD', 'MntTotal', 'MntRegularProds',
       'AcceptedCmpOverall'],
      dtype='object')

In [8]:
# data types in the dataset
data.dtypes


Income                  float64
Kidhome                   int64
Teenhome                  int64
Recency                   int64
MntWines                  int64
MntFruits                 int64
MntMeatProducts           int64
MntFishProducts           int64
MntSweetProducts          int64
MntGoldProds              int64
NumDealsPurchases         int64
NumWebPurchases           int64
NumCatalogPurchases       int64
NumStorePurchases         int64
NumWebVisitsMonth         int64
AcceptedCmp3              int64
AcceptedCmp4              int64
AcceptedCmp5              int64
AcceptedCmp1              int64
AcceptedCmp2              int64
Complain                  int64
Z_CostContact             int64
Z_Revenue                 int64
Response                  int64
Age                       int64
Customer_Days             int64
marital_Divorced          int64
marital_Married           int64
marital_Single            int64
marital_Together          int64
marital_Widow             int64
educatio

# III. Exploratory Data Analysis (EDA)
##### EDA is a preliminary data analysis phase where key statistical and visual techniques are employed to understand the dataset's characteristics, identify patterns, and gain initial insights.

### III.1) Marketing Campaing Analysis

In [9]:
# Selecting only marketing campaing features
campaing_data = data[['AcceptedCmp1',
                 'AcceptedCmp2',
                 'AcceptedCmp3',
                 'AcceptedCmp4',
                 'AcceptedCmp5',
                 'Response',
                 'Complain',
                 'AcceptedCmpOverall']]

campaing_data


Unnamed: 0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Complain,AcceptedCmpOverall
0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
2200,0,0,0,0,0,0,0,0
2201,1,0,0,0,0,0,0,1
2202,0,0,0,1,0,0,0,1
2203,0,0,0,0,0,0,0,0


In [10]:
# Creating classes with the campaing data

# Filter for customers who accepted more than one campaign
more_than_one = campaing_data[campaing_data['AcceptedCmpOverall'] > 1]

# Filter for customers who complained but did not accept any campaign
complain_without_accept = campaing_data[(campaing_data['AcceptedCmpOverall'] == 0) & (campaing_data['Complain'] > 0)]

# Filter for customers who accepted at least once and complained
accepted_once_and_complained = campaing_data[(campaing_data['AcceptedCmpOverall'] == 1) & (campaing_data['Complain'] > 0)]

# Filter for customers who accepted more than once and complained
accepted_more_and_complained = campaing_data[(campaing_data['AcceptedCmpOverall'] > 1) & (campaing_data['Complain'] > 0)]

# Filter for customers who complained
complained = campaing_data[(campaing_data['Complain'] > 0)]

# Filter for customers who accepted at least once
accepted = campaing_data[campaing_data['AcceptedCmpOverall'] > 0]

# Filter for customers who accepted the first campaign
first_adopters = campaing_data[campaing_data['AcceptedCmp1'] == 1]

# Filter for customers who accepted the second campaign
seccmp_adopters = campaing_data[campaing_data['AcceptedCmp2'] == 1]

# Filter for customers who accepted the third campaign
thirdcmp_adopters = campaing_data[campaing_data['AcceptedCmp3'] == 1]

# Filter for customers who accepted the fourth campaign
fourthcmp_adopters = campaing_data[campaing_data['AcceptedCmp4'] == 1]

# Filter for customers who accepted the fifth campaign
fithcmp_adopters = campaing_data[campaing_data['AcceptedCmp5'] == 1]

# Filter for customers who responded positively to the last campaign
last_campaing = campaing_data[campaing_data['Response'] == 1]

# Calculate conversion rates for each campaign and overall
conversion_last = round((last_campaing.shape[0] / campaing_data.shape[0]) * 100)
conversion_1st_cmp = round((first_adopters.shape[0] / campaing_data.shape[0]) * 100)
conversion_2nd_cmp = round((seccmp_adopters.shape[0] / campaing_data.shape[0]) * 100)
conversion_3rd_cmp = round((thirdcmp_adopters.shape[0] / campaing_data.shape[0]) * 100)
conversion_4th_cmp = round((fourthcmp_adopters.shape[0] / campaing_data.shape[0]) * 100)
conversion_5th_cmp = round((fithcmp_adopters.shape[0] / campaing_data.shape[0]) * 100)

# Display a summary of conversion rates, customer counts, and complaint statistics for marketing campaigns.
print(f'For this company, there was a conversion of {conversion_last}% in the last campaing and the following conversions in the first 5 campaigns {conversion_1st_cmp}%,{conversion_2nd_cmp}%,{conversion_3rd_cmp}%,{conversion_4th_cmp} and {conversion_5th_cmp}%, consecutively.\n'
      f'{first_adopters.shape[0]} converted in the first campaign and {last_campaing.shape[0]} converted in the last.\n'
      f'Having a total of {complained.shape[0]} complaints, {complain_without_accept.shape[0]} of which there was no acceptance of the campaign, '
      f'{accepted_once_and_complained.shape[0]} accepted once and {accepted_more_and_complained.shape[0]} accepted more than once.'
)


For this company, there was a conversion of 15% in the last campaing and the following conversions in the first 5 campaigns 6%,1%,7%,7 and 7%, consecutively.
142 converted in the first campaign and 333 converted in the last.
Having a total of 20 complaints, 18 of which there was no acceptance of the campaign, 1 accepted once and 1 accepted more than once.


### III.2) Education Analysis

In [11]:
# Selecting only education level features
education_data = data[['education_2n Cycle',
                 'education_Basic',
                 'education_Graduation',
                 'education_Master',
                 'education_PhD'
                 ]]

#Identifying whether only one education is possible (maximum sum = 1)
education_data['overall_education'] = data['education_2n Cycle'] + data['education_Basic'] + data['education_Graduation'] + data['education_Master'] + data['education_PhD']
education_data.describe()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  education_data['overall_education'] = data['education_2n Cycle'] + data['education_Basic'] + data['education_Graduation'] + data['education_Master'] + data['education_PhD']


Unnamed: 0,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,overall_education
count,2205.0,2205.0,2205.0,2205.0,2205.0,2205.0
mean,0.089796,0.02449,0.504762,0.165079,0.215873,1.0
std,0.285954,0.154599,0.500091,0.371336,0.41152,0.0
min,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Creating a single categorical column to designate education
education_data['education_level'] = education_data.idxmax(axis=1)
education_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  education_data['education_level'] = education_data.idxmax(axis=1)


Unnamed: 0,education_2n Cycle,education_Basic,education_Graduation,education_Master,education_PhD,overall_education,education_level
0,0,0,1,0,0,1,education_Graduation
1,0,0,1,0,0,1,education_Graduation
2,0,0,1,0,0,1,education_Graduation
3,0,0,1,0,0,1,education_Graduation
4,0,0,0,0,1,1,education_PhD
...,...,...,...,...,...,...,...
2200,0,0,1,0,0,1,education_Graduation
2201,0,0,0,0,1,1,education_PhD
2202,0,0,1,0,0,1,education_Graduation
2203,0,0,0,1,0,1,education_Master


In [None]:
# Plot a bar chart to visualize the frequency of different education levels in the 'education_data' dataset.
plt.figure(figsize=(10, 6))
education_data['education_level'].value_counts().plot(kind='bar', color='skyblue')

# Set plot title and axis labels
plt.title('Levels of education frequency')
plt.xlabel('Education level')
plt.ylabel('Frequency')

# Display the plot
plt.show()


### III.3) Marital Status Analysis

In [None]:
# Selecting only marital status features
marital_data = data[['marital_Divorced',
                 'marital_Married',
                 'marital_Single',
                 'marital_Together',
                 'marital_Widow']]

#Identifying whether only one marital status is possible (maximum sum = 1)
marital_data['overall_marital'] = data[['marital_Divorced',
                                 'marital_Married',
                                 'marital_Single',
                                 'marital_Together',
                                 'marital_Widow']].sum(axis=1)


marital_data.describe()


In [None]:
# Creating a single categorical column to designate marital status
marital_data.drop('overall_marital', axis=1)
marital_data['marital_status'] = marital_data[['marital_Divorced',
                                               'marital_Married',
                                               'marital_Single',
                                               'marital_Together',
                                               'marital_Widow']].idxmax(axis=1)


In [None]:
# Plot a bar chart to visualize the frequency of different marital statuses in the 'marital_data' dataset.
plt.figure(figsize=(10, 6))
marital_data['marital_status'].value_counts().plot(kind='bar', color='skyblue')

# Set plot title and axis labels
plt.title('Marital status frequency')
plt.xlabel('Marital status')
plt.ylabel('Frequency')

# Display the plot
plt.show()


### III.4) Rebuilding Dataset

In [None]:
# Create a new DataFrame 'df' based on a copy of 'data' with simplified marital and education levels

# Extract the marital status with the highest value for each row
df['marital_level'] = data[['marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow']].idxmax(axis=1)

# Extract the education level with the highest value for each row
df['education_level'] = data[['education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD']].idxmax(axis=1)

# Remove prefix from marital and education level columns
df['marital_level'] = df['marital_level'].str.replace('marital_', '')
df['education_level'] = df['education_level'].str.replace('education_', '')

# Drop original columns used for extraction
df.drop(['marital_Divorced', 'marital_Married', 'marital_Single', 'marital_Together', 'marital_Widow',
         'education_2n Cycle', 'education_Basic', 'education_Graduation', 'education_Master', 'education_PhD'], axis=1, inplace=True)

# Display the first few rows of the modified DataFrame
df.head()


In [13]:
# Creating classes of campaign acceptance:
# a) best_buyer: Accepted the campaign more than once
# b) first_buyer: Accepted in the first campaign
# c) regular_buyer: Accepted the offer between the first and last campaign
# d) not_buyer: Didn't accept the campaign

# If we have a best buyer that bought also in the first campaign, this client will be classified as best buyer

df['buyer_status'] = 'regular_buyer'

# Update buyer_status based on campaign acceptance criteria
df.loc[df['AcceptedCmpOverall'] > 1, 'buyer_status'] = 'best_buyer'
df.loc[(df['AcceptedCmp1'] == 1) & (df['AcceptedCmpOverall'] == 1), 'buyer_status'] = 'first_buyer'
df.loc[(df['AcceptedCmpOverall'] == 0) & (df['Response'] == 0), 'buyer_status'] = 'not_buyer'

# Drop columns related to individual campaign acceptances and overall acceptance
df.drop(['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmpOverall'], axis=1, inplace=True)


# Display the first few rows of the DataFrame with the new 'buyer_status' column
df.head()


NameError: name 'df' is not defined

### III.5) Buyer status analysis

In [None]:
# Plot a bar chart to visualize the frequency of different buyer statuses in the 'df' DataFrame.
plt.figure(figsize=(10, 6))
df['buyer_status'].value_counts().plot(kind='bar', color='skyblue')

# Set plot title and axis labels
plt.title('Buyer status frequency')
plt.xlabel('Buyer status')
plt.ylabel('Frequency')

# Display the plot
plt.show()


In [None]:
# Set seaborn style to whitegrid
sns.set(style="whitegrid")

# Get unique buyer statuses
buyer_statuses = df['buyer_status'].unique()

# Create subplots based on the number of unique buyer statuses
fig, axes = plt.subplots(nrows=len(buyer_statuses), figsize=(12, 6 * len(buyer_statuses)))

# Plot histograms of Age for each buyer status
for i, status in enumerate(buyer_statuses):
    subset = df[df['buyer_status'] == status]
    sns.histplot(subset['Age'], kde=True, bins=30, ax=axes[i])
    axes[i].set_title(f'Histogram of Age for {status}')
    axes[i].set_xlabel('Age')
    axes[i].set_ylabel('Count')

# Adjust layout for better visualization
plt.tight_layout()

# Display the plots
plt.show()


In [None]:
# Set seaborn style to whitegrid
sns.set(style="whitegrid")

# Get unique buyer statuses
buyer_statuses = df['buyer_status'].unique()

# Create subplots based on the number of unique buyer statuses
fig, axes = plt.subplots(nrows=len(buyer_statuses), figsize=(12, 6 * len(buyer_statuses)))

# Plot histograms of Income for each buyer status
for i, status in enumerate(buyer_statuses):
    subset = df[df['buyer_status'] == status]
    sns.histplot(subset['Income'], kde=True, bins=30, ax=axes[i])
    axes[i].set_title(f'Histogram of Income for {status}')
    axes[i].set_xlabel('Income')
    axes[i].set_ylabel('Count')

# Adjust layout for better visualization
plt.tight_layout()

# Display the plots
plt.show()


In [None]:
# Set seaborn style to whitegrid
sns.set(style="whitegrid")

# Get unique buyer statuses
buyer_statuses = df['buyer_status'].unique()

# Create subplots based on the number of unique buyer statuses
fig, axes = plt.subplots(nrows=len(buyer_statuses), figsize=(12, 6 * len(buyer_statuses)))

# Plot histograms of Customer_Days for each buyer status
for i, status in enumerate(buyer_statuses):
    subset = df[df['buyer_status'] == status]
    sns.histplot(subset['Customer_Days'], kde=True, bins=30, ax=axes[i])
    axes[i].set_title(f'Histogram of Customer Days for {status}')
    axes[i].set_xlabel('Days')
    axes[i].set_ylabel('Count')

# Adjust layout for better visualization
plt.tight_layout()

# Display the plots
plt.show()


### III.6) Purchasing profile analysis

In [None]:
# Extract columns starting with 'Mnt' (excluding 'MntRegularProds' and 'MntTotal')
mnt_columns = [col for col in df.columns if col.startswith('Mnt')]
mnt_columns.remove('MntRegularProds')
mnt_columns.remove('MntTotal')

# Create new columns for each 'Mnt' column as a percentage of 'MntTotal'
for i in mnt_columns:
    df[i + '_percent'] = df[i] / df['MntTotal']

# Display the first few rows of the DataFrame with the new percentage columns
df.head()


In [None]:
# Drop specified columns from the DataFrame
df.drop(['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'Z_CostContact', 'Z_Revenue'], axis=1, inplace=True)


In [None]:
# Create a new column 'main_product' based on the maximum percentage among product categories
df['main_product'] = df[['MntWines_percent', 'MntFruits_percent', 'MntMeatProducts_percent',
                         'MntFishProducts_percent', 'MntSweetProducts_percent', 'MntGoldProds_percent']].idxmax(axis=1)

# Remove prefixes and suffixes from 'main_product' values
df['main_product'] = df['main_product'].str.replace('Mnt', '').str.replace('_percent', '')

# Display the first few rows of the DataFrame with the new 'main_product' column
df.head()


In [None]:
# Create a new column 'main_channel' based on the maximum number of purchases among different channels
df['main_channel'] = df[['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']].idxmax(axis=1)

# Remove prefixes and suffixes from 'main_channel' values
df['main_channel'] = df['main_channel'].str.replace('Num', '').str.replace('Purchases', '')

# Display the first few rows of the DataFrame with the new 'main_channel' column
df.head()


### III.7) General profile summary

In [None]:
def profile(data):
    '''
    This function aims to create a summary of the consumer profile, identifying its main characteristics,
    as well as its highest value product and channel.
    '''

    print(f'The customer profile of this company has {round(data["Age"].mean())} years old in average, having an average of anual income of R${round(data["Income"].mean(),2)}.\n'
          f'These are clients whose marital status is mostly {data["marital_level"].mode()[0]}, with a {data["education_level"].mode()[0]} level of education and having, on average, {round(data["Kidhome"].mean())} children and {round(data["Teenhome"].mean())} teenagers at home.\n'
          f'There are {round(data["Complain"].mean()*100,2)} % of complaints.'
          f'Most of them purchase {data[data["Complain"]==1]["main_product"].mode()[0]} from {data[data["Complain"]==1]["main_channel"].mode()[0]}.'
          f'They are predominantly {data[data["Complain"]==1]["marital_level"].mode()[0]}, with a {data[data["Complain"]==1]["education_level"].mode()[0]} level of education. On average, they have {round(data[data["Complain"]==1]["Kidhome"].mean())} children and {round(data[data["Complain"]==1]["Teenhome"].mean())} teenager living at home.\n'
          f'The purchasing profile is the following:')

    def plot_channel_distribution(df, main_channel_values):
        '''
        The function aims to create a sales profile graph correlating the proportion of product sales for each sales channel.
        '''

        purchasing_pattern = [col for col in df.columns if col.startswith('Mnt') and col not in ['MntRegularProds', 'MntTotal']]

        for channel in main_channel_values:
            channel_df = df[df['main_channel'] == channel]
            average_distribution = channel_df[purchasing_pattern].mean()

            purchasing_pattern_display = [col.replace('Mnt', '').replace('_percent', '') for col in purchasing_pattern]

            plt.figure(figsize=(14, 6))
            plt.bar(purchasing_pattern_display, average_distribution)
            plt.title(f'Average distribution for {channel}')
            plt.xlabel('Purchasing Pattern')
            plt.grid(visible=False)

            for i, value in enumerate(average_distribution):
                plt.text(i, value + 0.01, f'{value:.2f}', ha='center', va='bottom')

            plt.show()

    plot_channel_distribution(data, data['main_channel'].unique().tolist())

profile(df)


### III.8) Buyer analysis via campaign

In [None]:
# Create a new column 'buyer_class' based on the 'buyer_status'
df['buyer_class'] = np.where(df['buyer_status'] != 'not_buyer', 'Buyer', 'Not Buyer')


In [None]:
# Scatter plot of Income vs. Age with 'buyer_class' as hue
plt.figure(figsize=(10, 5))
sns.scatterplot(x=df['Age'], y=df['Income'], hue=df['buyer_class'])
plt.title('Scatter Plot of Income vs. Age')
plt.xlabel('Age')
plt.ylabel('Income')
plt.grid(False)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Histograms of Income and Age for Buyers and Non-Buyers
plt.figure(figsize=(10, 5))

plt.subplot(2, 2, 1)
sns.histplot(df, x=buy['Income'], color='b', kde=True)
plt.title('Buyer Income')
plt.xlabel('Income')
plt.grid(False)

plt.subplot(2, 2, 2)
sns.histplot(df, x=not_buy['Income'], color='r', kde=True)
plt.title('Not Buyer Income')
plt.xlabel('Income')
plt.grid(False)

plt.subplot(2, 2, 3)
sns.histplot(df, x=buy['Age'], color='b', kde=True)
plt.title('Buyer Age')
plt.xlabel('Age')
plt.grid(False)

plt.subplot(2, 2, 4)
sns.histplot(df, x=not_buy['Age'], color='r', kde=True)
plt.title('Not Buyer Age')
plt.xlabel('Age')
plt.grid(False)

plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()


In [None]:
# Scatter plots showing the relationship between Age and the number of purchases in different channels,
# with 'buyer_class' as hue for better visualization.

# Subplot 1: Deals purchase vs. Age
plt.subplot(2, 2, 1)
sns.scatterplot(x=df['Age'], y=df['NumDealsPurchases'], hue=df['buyer_class'], legend=False)
plt.title('Deals purchase vs. Age')
plt.xlabel('Age')
plt.ylabel('Number of purchases')
plt.grid(False)

# Subplot 2: Web purchase vs. Age
plt.subplot(2, 2, 2)
sns.scatterplot(x=df['Age'], y=df['NumWebPurchases'], hue=df['buyer_class'], legend=False)
plt.title('Web purchase vs. Age')
plt.xlabel('Age')
plt.ylabel('Number of purchases')
plt.grid(False)

# Subplot 3: Catalog purchase vs. Age
plt.subplot(2, 2, 3)
sns.scatterplot(x=df['Age'], y=df['NumCatalogPurchases'], hue=df['buyer_class'], legend=False)
plt.title('Catalog purchase vs. Age')
plt.xlabel('Age')
plt.ylabel('Number of purchases')
plt.grid(False)

# Subplot 4: Store purchase vs. Age
plt.subplot(2, 2, 4)
sns.scatterplot(x=df['Age'], y=df['NumStorePurchases'], hue=df['buyer_class'])
plt.title('Store purchase vs. Age')
plt.xlabel('Age')
plt.ylabel('Number of purchases')
plt.grid(False)

# Display legend outside the plot for better visibility
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout for better visualization
plt.tight_


In [None]:
# Scatter plot showing the relationship between Customer Days and Age, with 'buyer_class' as hue.

plt.figure(figsize=(10, 5))

sns.scatterplot(x=df['Customer_Days'], y=df['Age'], hue=df['buyer_class'])
plt.title('Customer Days vs. Age (historic)')
plt.xlabel('Customer Days')
plt.ylabel('Age')

# Display legend outside the plot for better visibility
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(False)


In [None]:
# Scatter plot showing the relationship between Customer Days and Age, with 'Response' as hue.

plt.figure(figsize=(10, 5))

sns.scatterplot(x=df['Customer_Days'], y=df['Age'], hue=df['Response'])
plt.title('Customer Days vs. Age (color = Response)')
plt.xlabel('Customer Days')
plt.ylabel('Age')

# Display legend outside the plot for better visibility
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(False)


### III.9) Complain analysis

In [None]:
# Generating a customer profile summary specifically for clients who have complaints (Complain == 1).
profile(df[df['Complain'] == 1])

# applying the aforementioned function to describe the general profile of the customer who complains
profile(df_complain)


In [None]:
# Scatter plot showing the relationship between Customer Days and Age, with 'Complain' as hue.

plt.figure(figsize=(10, 5))

sns.scatterplot(x=df['Customer_Days'], y=df['Age'], hue=df['Complain'])
plt.title('Customer Days vs. Age (historic)')
plt.xlabel('Customer Days')
plt.ylabel('Age')

# Display legend outside the plot for better visibility
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(False)


# IV. Data Cleaning and Preparation
##### This phase focuses on cleaning and transforming the data to ensure it is in a suitable format for analysis. Tasks may include handling missing values, addressing outliers, and organizing the data structure.

### IV.1) Duplicates treatment

In [None]:
# Counting the number of duplicate rows in the DataFrame.
df.duplicated().sum()


In [None]:
# Removing duplicate rows from the DataFrame.
df = df.drop_duplicates()


In [None]:
# Checking and displaying the count of missing values in each column, sorted in descending order.
df.isnull().sum().sort_values(ascending=False)


### IV.2) Null/empty treatment

In [None]:
# Checking for the presence of null or empty values in the DataFrame
df.isnull().sum()


### IV.3) Evaluation of distribution of numerical variables

In [None]:
# Extracting a subset of numeric features from the DataFrame.
x_numeric = df[['Income', 'Recency', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
                'NumStorePurchases', 'NumWebVisitsMonth', 'Age', 'Customer_Days', 'MntTotal', 'MntRegularProds']]


In [None]:
# Creating subplots for boxplots of numeric columns
fig, axes = plt.subplots(6, 2, figsize=(12, 40))

# Iterating through each numeric column and creating a boxplot
for i, column in enumerate(x_numeric.columns):
    row, col = divmod(i, 2)
    axes[row, col].boxplot(x_numeric[column])
    axes[row, col].set_title(column)

# Adjusting layout and displaying the plot
plt.tight_layout()
plt.show()


### IV.4) Scaling numerical variables

In [None]:
# Creating a RobustScaler instance
rb_scaler = RobustScaler()

# Fitting the scaler to the selected numeric columns and transforming the data
rb_scaler.fit(df[['Income','Recency', 'NumDealsPurchases',
                   'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
                   'NumWebVisitsMonth','Age', 'Customer_Days', 'MntTotal',
                   'MntRegularProds']])

df[['Income','Recency', 'NumDealsPurchases',
    'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth','Age', 'Customer_Days', 'MntTotal',
    'MntRegularProds']] = rb_scaler.transform(df[['Income','Recency', 'NumDealsPurchases',
                                                   'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
                                                   'NumWebVisitsMonth','Age', 'Customer_Days', 'MntTotal',
                                                   'MntRegularProds']])

df.head()


### IV.5) Encoding categorical variables where one class has more value than another

### IV.6) Encoding categorical variables where there is no difference between classes

# V. Baseline modeling
##### Baseline modeling establishes an initial predictive model using simple algorithms or default parameters. This serves as a benchmark for more complex models, helping evaluate their performance.

# VI. Modeling
##### In this stage, various machine learning models are implemented and fine-tuned to achieve optimal predictive performance. The goal is to choose the model that best fits the dataset and problem requirements.

# VII. Pipeline creation
##### A data processing pipeline is constructed to automate and streamline the entire workflow, from data preprocessing to model deployment. Pipelines enhance reproducibility and facilitate collaboration.