In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the raw customer dataset.
# The path is relative to the notebook's working directory, matching the
# '../data' and '../models' locations this notebook writes its outputs to,
# so the notebook is not tied to one machine's drive layout.
data = pd.read_csv("../data/customer_data.csv")
data.head(5)

### Knowing the data

In [None]:
# Report the dataset dimensions: row count first, then column count.
n_rows, n_cols = data.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

In [None]:
# Print the DataFrame summary (column dtypes and non-null counts) to spot
# columns that may need cleaning or type conversion.
data.info()

Let's check every column individually now, for any preprocessing required.

In [None]:
# Show the frequency of each distinct value, one column at a time.
for column_name in data.columns:
    counts = data[column_name].value_counts()
    print(counts)

In [None]:
# Drop the columns that are not part of the feature lists used below.
# `errors='ignore'` keeps this cell safe to re-run: `data` is rebound to the
# result, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
data = data.drop(columns=['id', 'education'], errors='ignore')

In [None]:
# Display the column labels that remain in `data` at this point.
data.columns

### One-Hot Encoding & Scaling Features

In [None]:
# Feature groups routed to the preprocessing pipelines:
# categorical columns go to the one-hot encoder, numerical columns to the scaler.
categorical_columns = [
    'gender',
    'region',
    'loyalty_status',
    'product_category',
]
numerical_columns = [
    'age',
    'income',
    'purchase_amount',
    'promotion_usage',
    'satisfaction_score',
]

In [None]:
# Creating Pipelines for preprocessing

# Categorical features: one-hot encode.
# `handle_unknown='ignore'` makes a category that was not present when the
# encoder was fitted encode as an all-zero row at transform time instead of
# raising an error. This matters because the fitted preprocessor is applied to
# the held-out test split and is also saved to disk for later reuse.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical features: standardize with StandardScaler (default settings).
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [None]:
# Combine preprocessing steps
# Each transformer is applied to its own column group and the results are
# concatenated in the order listed ('num' columns first, then 'cat').
#
# `sparse_threshold=0` forces a dense array as output. The one-hot encoder
# emits a sparse matrix by default, and ColumnTransformer would otherwise
# return a sparse result whenever the overall density falls below its default
# threshold of 0.3 -- a format the later `pd.DataFrame(X_train)` conversion
# does not expect.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],
    sparse_threshold=0,
)

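Train/test split of the data. The split is done before the preprocessor is fitted, so the scaler and encoder learn their parameters from the training set only.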

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for testing. The fixed random_state makes the split reproducible.
y = data['purchase_frequency']
X = data.drop(columns=['purchase_frequency'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the preprocessor on the training split only, then apply the already
# fitted preprocessor to the test split (it is not refitted on test data).
# NOTE(review): X_train / X_test are rebound to the transformed output here, so
# this cell presumably cannot be re-run on its own -- re-run the split cell
# first to restore the original DataFrames.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Persist the fitted preprocessor to disk with joblib.
joblib.dump(value=preprocessor, filename='../models/preprocessor.pkl')

In [None]:
# Wrap the transformed feature arrays in DataFrames and append the target.
# The target's index is reset so it lines up with the fresh 0..n-1 row index
# of each new frame rather than the row labels left over from the split.
train_data = pd.DataFrame(X_train).assign(
    purchase_frequency=y_train.reset_index(drop=True)
)
test_data = pd.DataFrame(X_test).assign(
    purchase_frequency=y_test.reset_index(drop=True)
)

# Write both splits to CSV without the row index.
train_data.to_csv('../data/train_data.csv', index=False)
test_data.to_csv('../data/test_data.csv', index=False)