# EasyMoney preprocessing (Segmentation Model)

## 1. Imports

### 1.1. Libraries

In [54]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import sklearn
from sklearn import set_config
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, RepeatedKFold, KFold
from sklearn.metrics import accuracy_score
import category_encoders as ce

import folium
import plotly.express as px
from scipy import stats
from sklearn.tree import export_graphviz
import graphviz

import statsmodels.api as sm

set_config(transform_output = "pandas")

### 1.2. Data

In [None]:
PATH = r'C:\Users\Usuario\Desktop\Proyects\Easy Money\data_compressed'
file_name =  r"\customer_commercial_activity.csv"
file_name1 = r"\customer_products.csv"
file_name2 = r"\customer_sociodemographics.csv"
file_name3 = r"\product_description.csv"
file_name4 = r"\sales.csv"
cca = pd.read_csv(PATH + file_name, sep = ",", index_col=0)
cp = pd.read_csv(PATH + file_name1, sep = ",", index_col=0)
cs = pd.read_csv(PATH + file_name2, sep = ",", index_col=0)
prd = pd.read_csv(PATH + file_name3, sep = ",", index_col=0)
sales = pd.read_csv(PATH + file_name4, sep = ",", index_col=0)

dfs = [cca, cp, cs, sales]

In [57]:
print("Working with these versions of libraries\n")
print(f"Numpy version {np.__version__}")
print(f"Pandas version {pd.__version__}")
print(f"Statsmodels version {sm.__version__}")
print(f"Sklearn version {sklearn.__version__}")

Working with these versions of libraries

Numpy version 1.26.4
Pandas version 2.1.4
Statsmodels version 0.14.0
Sklearn version 1.3.0


### 1.3. Custom Functions

In [56]:
## Sampling data

def sampling_xdf(dfs, p_sample):
    sampled_dfs = []
    for df in dfs:
        n_rows = int(len(df)*p_sample)
        sampled_df = df.sample(n = n_rows, random_state = 42)
        sampled_dfs.append(sampled_df)
    return sampled_dfs

## Calculation functions

def calc_moda(series):
    return series.mode().iloc[0]

## Preprocessing

def setOthers(dataframe, column, num_values):
    top_categories = dataframe[column].value_counts().head(num_values)
    top_categories_list = top_categories.index.to_list()
    top_categories_list.append('Others')
    dataframe[column] = pd.Categorical(dataframe[column], categories=top_categories_list)
    return dataframe[column].fillna('Others')

def seniority(df, date_column, threshold_date):
    df[date_column] = pd.to_datetime(df[date_column])

    df['customer_seniority'] = 'Old'
    df.loc[df.groupby('pk_cid')[date_column].transform('min') >= threshold_date, 'customer_seniority'] = 'New'

    return df



### 1.4. Sampling

In [58]:
cca, cp, cs, sales = sampling_xdf(dfs, 1)

## 2. Preprocessing

### 2.1. Sales_products

In [59]:
def get_sales_data(sales, prd):
    prd = prd.rename(columns={'pk_product_ID': 'product_ID'})
    sales = sales.rename(columns={'cid' : 'pk_cid'})
    sales['month_sale_int'] = pd.to_datetime(sales['month_sale']).dt.strftime('%Y%m%d').astype(int)
    sales_prd = sales.merge(prd, on= 'product_ID', how= 'inner')

    sales_prd_merge = sales_prd.groupby('pk_cid').agg(
    T_sales = ('pk_sale', 'count'),
    n_product = ('product_desc', 'count'),
    T_net_margin = ('net_margin', 'sum'),
    Mean_net_margin = ('net_margin', 'mean')
)
    return sales_prd_merge

In [60]:
sales_prd_merge = get_sales_data(sales, prd)

In [61]:
'''
prd = prd.rename(columns={'pk_product_ID': 'product_ID'})
sales = sales.rename(columns={'cid' : 'pk_cid'})
sales['month_sale_int'] = pd.to_datetime(sales['month_sale']).dt.strftime('%Y%m%d').astype(int)
sales_prd = sales.merge(prd, on= 'product_ID', how= 'inner')

sales_prd_merge = sales_prd.groupby('pk_cid').agg(
    T_sales = ('pk_sale', 'count'),
    n_product = ('product_desc', 'count'),
    T_net_margin = ('net_margin', 'sum'),
    Mean_net_margin = ('net_margin', 'mean'),
    first_sale = ('month_sale_int', 'min'),
    last_sale = ('month_sale_int', 'max'),
    product_moda = ('product_ID', calc_moda)
)
'''

"\nprd = prd.rename(columns={'pk_product_ID': 'product_ID'})\nsales = sales.rename(columns={'cid' : 'pk_cid'})\nsales['month_sale_int'] = pd.to_datetime(sales['month_sale']).dt.strftime('%Y%m%d').astype(int)\nsales_prd = sales.merge(prd, on= 'product_ID', how= 'inner')\n\nsales_prd_merge = sales_prd.groupby('pk_cid').agg(\n    T_sales = ('pk_sale', 'count'),\n    n_product = ('product_desc', 'count'),\n    T_net_margin = ('net_margin', 'sum'),\n    Mean_net_margin = ('net_margin', 'mean'),\n    first_sale = ('month_sale_int', 'min'),\n    last_sale = ('month_sale_int', 'max'),\n    product_moda = ('product_ID', calc_moda)\n)\n"

In [62]:
sales_prd_merge.head()

Unnamed: 0_level_0,T_sales,n_product,T_net_margin,Mean_net_margin
pk_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15891,1,1,66.4,66.4
16203,1,1,96.7,96.7
16502,2,2,132.9,66.45
17457,1,1,56.0,56.0
17970,2,2,879.8,439.9


### 2.2. Customers information

In [63]:
def cust_top_features(df):
    customers_merge = df.groupby('pk_cid').agg(
        entry_channel_nunique = ('entry_channel_so', pd.Series.nunique),
        entry_channel_most_freq = ('entry_channel_so', calc_moda),
        act_cust_most_freq = ('active_customer', 'mean'),
        act_cust_std = ('active_customer', 'std'),
        afiliation_time = ('afiliation_time', 'max'),
        salary = ('salary_imp', 'mean'),
        age = ('age', 'max'),
        region_code = ('region_code_so', calc_moda),
        #entry_date = ('entry_date', 'min'),
        segment = ('segment', calc_moda),
        customer_seniority = ('customer_seniority', calc_moda))
    return customers_merge


In [64]:
def get_customers_data(cca, cs):
        cust = cca.merge(cs, on = ['pk_cid', 'pk_partition'], how = 'inner')

        cust = cust[cust['deceased']== 'N'].drop('deceased', axis =  1)
        cust_es = pd.DataFrame(cust[cust['country_id'] == 'ES'].drop('country_id', axis = 1))

        cust_es['active_customer'] = cust_es['active_customer'].astype(int)
        cust_es['pk_partition'] = pd.to_datetime(cust_es['pk_partition'])
        cust_es['entry_date'] = pd.to_datetime(cust_es['entry_date'])
        cust_es['afiliation_time'] = cust_es['pk_partition'] - cust_es['entry_date']

        cust_es[['region_code_mf', 'gender_mf']] = SimpleImputer(strategy =  'most_frequent').fit_transform(
                cust_es[['region_code', 'gender']])
        cust_es['segment'] = SimpleImputer(strategy =  'constant', fill_value = 'Other').fit_transform(
                cust_es[['segment']])
        sal_mn_regage = cust_es.groupby(['age', 'region_code'])['salary'].transform('mean')
        cust_es['salary_imp'] = cust_es['salary'].fillna(sal_mn_regage).round(2)
        cust_es['salary_imp'] = SimpleImputer(strategy = 'mean').fit_transform(cust_es[['salary_imp']])

        cust_es['entry_channel_so'] = setOthers(cust_es, 'entry_channel', 10)
        cust_es['region_code_so'] = setOthers(cust_es, 'region_code', 40)
        cust_es['region_code_so'] = cust_es['region_code_so'].replace('Others' , 99.0)
        threshold_date = pd.to_datetime('2018-01-01')
        cust_es = seniority(cust_es, 'entry_date', threshold_date)
        return cust_top_features(cust_es)

In [65]:
customers_merge = get_customers_data(cca, cs)

In [66]:
customers_merge.head().T

pk_cid,15891,16063,16203,16502,17457
entry_channel_nunique,1,1,1,2,1
entry_channel_most_freq,KAT,KAT,KAT,KHN,KAT
act_cust_most_freq,0.5,0.714286,0.833333,0.888889,1.0
act_cust_std,0.707107,0.48795,0.408248,0.333333,0.0
afiliation_time,31 days 00:00:00,181 days 00:00:00,151 days 00:00:00,242 days 00:00:00,607 days 00:00:00
salary,176628.78,163192.43,152281.83,176423.122222,102405.75
age,59,62,70,58,54
region_code,28.0,28.0,8.0,28.0,28.0
segment,02 - PARTICULARES,02 - PARTICULARES,01 - TOP,02 - PARTICULARES,02 - PARTICULARES
customer_seniority,New,New,New,New,Old


### 2.3. Customers products

In [67]:
def getproductdata(dataset):
  dataset.fillna(0, inplace = True)
  #products = dataset.drop(['pk_cid', 'pk_partition'], axis=1).columns
  #dataset['Products_contracted'] = dataset[products].sum(axis = 1)
  dataset['Products_contracted'] = dataset.select_dtypes(exclude = object).sum(axis = 1)
  cust_products_merge = dataset.groupby('pk_cid').agg(
    mean_products_contracted = ('Products_contracted','mean'),
    std_products_contracted = ('Products_contracted','std'),
    max_products_contracted = ('Products_contracted','max'),
    min_products_contracted = ('Products_contracted','min'))
  cust_products_merge['std_products_contracted'].fillna(-1, inplace = True)
  return cust_products_merge

In [68]:
cust_products_merge = getproductdata(cp)

In [69]:
cust_products_merge

Unnamed: 0_level_0,mean_products_contracted,std_products_contracted,max_products_contracted,min_products_contracted
pk_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15891,1.589150e+04,0.707107,15892.0,15891.0
16063,1.606300e+04,0.000000,16063.0,16063.0
16203,1.620383e+04,0.408248,16204.0,16203.0
16502,1.650344e+04,0.726483,16504.0,16502.0
17457,1.745924e+04,0.562296,17460.0,17458.0
...,...,...,...,...
1553685,1.553685e+06,-1.000000,1553685.0,1553685.0
1553686,1.553686e+06,-1.000000,1553686.0,1553686.0
1553687,1.553687e+06,-1.000000,1553687.0,1553687.0
1553688,1.553688e+06,-1.000000,1553688.0,1553688.0


### 2.4. Building dataset

In [73]:
def mergedataset(customers_merge, cust_products_merge, sales_prd_merge):
  full_df = pd.merge(customers_merge, cust_products_merge, how = 'inner', left_index = True, right_index = True)
  full_df = pd.merge(full_df, sales_prd_merge, how = 'left', left_index = True, right_index = True)

  full_df['afiliation_days'] = full_df['afiliation_time'].dt.days
  full_df.drop(columns = 'afiliation_time', axis = 1, inplace = True)
  full_df['act_cust_most_freq'] = full_df['act_cust_most_freq'].astype(float)
  full_df.fillna(0, inplace = True)
  #full_df['entry_date_int'] = pd.to_datetime(full_df['entry_date']).dt.strftime('%Y%m%d').astype(int)
  #full_df.drop('entry_date', axis= 1, inplace = True)

  return full_df

In [78]:
def prep_transform(df):
  #numerical_columns_v2 = df.select_dtypes(exclude = object).drop(['region_code', 'product_moda'], axis=1).columns.to_list()
  numerical_columns_v2 = df.select_dtypes(exclude = object).drop(['region_code'], axis=1).columns.to_list()
  transform_pipe = ColumnTransformer(transformers = [
    ("scaler", MinMaxScaler(), numerical_columns_v2),
    ("encoder", OneHotEncoder(sparse_output = False), ['entry_channel_most_freq', 'segment']),
    #("encoder", OneHotEncoder(sparse_output = False), ['entry_channel_most_freq', 'segment', 'product_moda']),
    ('ordinal', OrdinalEncoder(), ['region_code', 'customer_seniority'])
    ])
  full_df_trans = transform_pipe.fit_transform(df)
  return full_df_trans

In [79]:
full_df = mergedataset(customers_merge, cust_products_merge, sales_prd_merge)
full_df_trans = prep_transform(full_df)

In [80]:
em_segmentation = full_df
em_segmentation_trans = full_df_trans

In [81]:
full_df_trans.shape
#full_df.shape

(456172, 31)

In [82]:
em_segmentation_trans.head().T

pk_cid,15891,16063,16203,16502,17457
scaler__entry_channel_nunique,0.0,0.0,0.0,0.333333,0.0
scaler__act_cust_most_freq,0.5,0.714286,0.833333,0.888889,1.0
scaler__act_cust_std,1.0,0.690066,0.57735,0.471405,0.0
scaler__salary,0.006072,0.005607,0.005229,0.006064,0.003503
scaler__age,0.553398,0.582524,0.660194,0.543689,0.504854
scaler__mean_products_contracted,0.0,0.000112,0.000203,0.000398,0.001019
scaler__std_products_contracted,0.376385,0.220481,0.310492,0.380657,0.344457
scaler__max_products_contracted,0.0,0.000111,0.000203,0.000398,0.00102
scaler__min_products_contracted,0.0,0.000112,0.000203,0.000397,0.001019
scaler__T_sales,0.1,0.0,0.1,0.2,0.1


#### 2.5. Pickle

In [83]:
pd.to_pickle(em_segmentation, "C:/Users/Usuario/Desktop/Proyects/Easy Money/EasyMoney_/pickles/em_segmentation.pkl")
pd.to_pickle(em_segmentation_trans, "C:/Users/Usuario/Desktop/Proyects/Easy Money/EasyMoney_/pickles/em_segmentation_trans.pkl")