In [7]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [8]:
# Load the raw customer data and preview the first five rows.
# The path is relative to the notebook, matching the '../data/' and '../models/'
# locations this notebook writes its outputs to, so it no longer depends on one
# machine's drive layout.
# NOTE(review): assumes the notebook lives in a sibling folder of data/ — confirm.
data = pd.read_csv('../data/customer_data.csv')
data.head(5)

Unnamed: 0,id,age,gender,income,education,region,loyalty_status,purchase_frequency,purchase_amount,product_category,promotion_usage,satisfaction_score
0,1,27,Male,40682,Bachelor,East,Gold,frequent,18249,Books,0,6
1,2,29,Male,15317,Masters,West,Regular,rare,4557,Clothing,1,6
2,3,37,Male,38849,Bachelor,West,Silver,rare,11822,Clothing,0,6
3,4,30,Male,11568,HighSchool,South,Regular,frequent,4098,Food,0,7
4,5,31,Female,46952,College,North,Regular,occasional,19685,Clothing,1,5


### Getting to know the data

In [9]:
# Unpack the frame's dimensions once, then report each of them.
row_count, column_count = data.shape
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

Number of rows: 100000
Number of columns: 12


In [10]:
# Print a per-column summary (dtype and non-null count) plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  100000 non-null  int64 
 1   age                 100000 non-null  int64 
 2   gender              100000 non-null  object
 3   income              100000 non-null  int64 
 4   education           100000 non-null  object
 5   region              100000 non-null  object
 6   loyalty_status      100000 non-null  object
 7   purchase_frequency  100000 non-null  object
 8   purchase_amount     100000 non-null  int64 
 9   product_category    100000 non-null  object
 10  promotion_usage     100000 non-null  int64 
 11  satisfaction_score  100000 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 9.2+ MB


Next, let's inspect every column individually to see whether it needs any preprocessing.

In [11]:
# Print the frequency table of every column, one column at a time.
# Iterating over a DataFrame yields its column labels.
for column_name in data:
    print(data[column_name].value_counts())

id
99984    1
99983    1
99982    1
99981    1
99980    1
        ..
5        1
4        1
3        1
2        1
1        1
Name: count, Length: 100000, dtype: int64
age
30    8867
31    8645
29    8636
28    8266
32    8034
33    7214
27    7022
34    5894
26    5882
35    4873
25    4856
36    3583
24    3576
37    2697
23    2636
38    1791
22    1790
39    1200
21    1187
20     743
40     682
41     481
19     457
42     242
18     233
17     143
43     125
16      66
44      58
15      39
45      35
47      13
14      13
46      11
13       4
12       3
48       2
49       1
Name: count, dtype: int64
gender
Female    50074
Male      49926
Name: count, dtype: int64
income
39596    12
28944    10
36828    10
12803    10
43031     9
         ..
20448     1
34754     1
13016     1
28786     1
38794     1
Name: count, Length: 40003, dtype: int64
education
College       39874
Bachelor      30279
HighSchool    20031
Masters        9816
Name: count, dtype: int64
region
East     30074
Wes

In [12]:
# Remove the 'id' and 'education' columns from the working frame.
# NOTE(review): the reason for dropping 'education' is not stated in this notebook — confirm it is intentional.
data = data.drop(columns=['id', 'education'])

In [13]:
# Display the column labels currently present in the frame.
data.columns

Index(['age', 'gender', 'income', 'region', 'loyalty_status',
       'purchase_frequency', 'purchase_amount', 'product_category',
       'promotion_usage', 'satisfaction_score'],
      dtype='object')

### One-Hot Encoding & Scaling Features

In [14]:
# Feature groups: each list is routed to its own preprocessing step.
categorical_columns = [
    'gender',
    'region',
    'loyalty_status',
    'product_category',
]
numerical_columns = [
    'age',
    'income',
    'purchase_amount',
    'promotion_usage',
    'satisfaction_score',
]

In [15]:
# Preprocessing pipelines, one per feature type.

# Categorical features: one-hot encode.
# handle_unknown='ignore' makes a category that was not present when the encoder
# was fitted encode as all zeros instead of raising an error, so the fitted
# encoder can still transform data that contains new labels.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical features: standardise with StandardScaler.
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [16]:
# Combine both pipelines into a single transformer that sends each column list
# to its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],
    # Always return a dense array. The one-hot step can emit sparse output, and by
    # default the combined result becomes sparse when its density is low; the
    # result is later wrapped with pd.DataFrame(...) and written to CSV, so it
    # should not depend on that density heuristic.
    sparse_threshold=0
)

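Train/test split of the data. The split is done before the preprocessor is fitted, so the scaler and encoder are fitted on the training rows only.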

In [17]:
# Separate the target ('purchase_frequency') from the features, then hold out
# 20% of the rows as a test set using a fixed random seed.
y = data['purchase_frequency']
X = data.drop(columns=['purchase_frequency'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Fit the preprocessor on the training split and transform it, then apply the
# already-fitted preprocessor to the test split. The right-hand side is evaluated
# left to right, so fitting happens before the test transform.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [19]:
# Persist the preprocessor (fitted on the training split in the previous cell) to disk with joblib.
# joblib.dump returns the list of written file names, which the notebook displays as the cell output.
joblib.dump(preprocessor, '../models/preprocessor.pkl')

['../models/preprocessor.pkl']

In [20]:
# Rebuild each transformed split as a DataFrame and append the target as the last
# column. The target values are attached by position, which matches the fresh
# 0..n-1 index of the new frames.
train_data = pd.DataFrame(X_train).assign(purchase_frequency=y_train.to_numpy())
test_data = pd.DataFrame(X_test).assign(purchase_frequency=y_test.to_numpy())

# Write both splits to CSV without the index column.
train_data.to_csv('../data/train_data.csv', index=False)
test_data.to_csv('../data/test_data.csv', index=False)