In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Directory that holds the cleaned input CSVs.
# Set the CUSTOMER_DATA_DIR environment variable to run this notebook on a
# different machine; when it is unset the original author's local directory
# is used, so the default behaviour is unchanged.
DATA_DIR = os.environ.get("CUSTOMER_DATA_DIR", "/Users/gillianmondero")

# Define file paths (kept as plain strings, built from DATA_DIR)
customer_transactions_path = os.path.join(DATA_DIR, "cleaned_customer_transactions.csv")
customer_feedback_path = os.path.join(DATA_DIR, "cleaned_customer_feedback.csv")

# Read the CSV files
customer_transactions = pd.read_csv(customer_transactions_path)
customer_feedback = pd.read_csv(customer_feedback_path)

# Keep the first few rows of each dataset for a quick visual sanity check
customer_transactions_head = customer_transactions.head()
customer_feedback_head = customer_feedback.head()

# Last expression of the cell: shown as a (transactions, feedback) tuple
customer_transactions_head, customer_feedback_head

(   Customer_ID  Transactions_Per_Customer  Average_Transaction_Amount  \
 0            1                   0.416667                    0.535373   
 1            2                   0.083333                    0.460484   
 2            3                   0.000000                    0.265987   
 3            4                   0.083333                    0.820374   
 4            5                   0.333333                    0.568005   
 
    Total_Transaction_Amount   Recency  
 0                  0.447358  0.131980  
 1                  0.124909  0.340102  
 2                  0.033842  0.761421  
 3                  0.216489  0.142132  
 4                  0.392269  0.005076  ,
    Customer_ID  Satisfaction_Score  Likelihood_to_Recommend  \
 0            1                10.0                        9   
 1            2                 3.0                        3   
 2            3                10.0                        1   
 3            4                 7.0                

In [6]:
# Inner-join the transaction table with the feedback table on the shared
# 'Customer_ID' key, so only customers present in both tables are kept.
# The preview below shows that a customer with several feedback rows
# produces several merged rows.
merged_df = customer_transactions.merge(
    customer_feedback,
    how='inner',
    on='Customer_ID',
)

# Preview the top rows of the joined table
merged_df_head = merged_df.head()
merged_df_head

Unnamed: 0,Customer_ID,Transactions_Per_Customer,Average_Transaction_Amount,Total_Transaction_Amount,Recency,Satisfaction_Score,Likelihood_to_Recommend,Feedback_Comments_Encoded,Feedback_Comment,Satisfaction_Score_Normalized,Likelihood_to_Recommend_Normalized
0,1,0.416667,0.535373,0.447358,0.13198,10.0,9,4,Excellent,1.0,0.888889
1,1,0.416667,0.535373,0.447358,0.13198,7.0,9,1,Good service,0.666667,0.888889
2,2,0.083333,0.460484,0.124909,0.340102,3.0,3,4,Needs improvement,0.222222,0.222222
3,2,0.083333,0.460484,0.124909,0.340102,7.0,5,3,Good service,0.666667,0.444444
4,2,0.083333,0.460484,0.124909,0.340102,3.0,7,4,Needs improvement,0.222222,0.666667


In [8]:
# Flag every row that exactly repeats an earlier row
# (the first occurrence of each row is not flagged)
duplicate_mask = merged_df.duplicated()

# Count the flagged rows before they are discarded
duplicates = duplicate_mask.sum()

# Keep only the rows that were not flagged as repeats
merged_df = merged_df[~duplicate_mask]

# Show how many duplicate rows were removed
duplicates

2

In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# merged dataset; the quartiles requested here are pandas' defaults
merged_summary = merged_df.describe(percentiles=[0.25, 0.5, 0.75])
merged_summary

Unnamed: 0,Customer_ID,Transactions_Per_Customer,Average_Transaction_Amount,Total_Transaction_Amount,Recency,Satisfaction_Score,Likelihood_to_Recommend,Feedback_Comments_Encoded,Satisfaction_Score_Normalized,Likelihood_to_Recommend_Normalized
count,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0
mean,502.111899,0.338283,0.466414,0.332065,0.194064,5.600649,5.566592,2.106832,0.511183,0.507399
std,288.730479,0.184347,0.158536,0.175173,0.18405,2.796754,2.867632,1.294656,0.31075,0.318626
min,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,253.0,0.166667,0.368069,0.203379,0.055838,3.0,3.0,1.0,0.222222,0.222222
50%,504.0,0.333333,0.457829,0.312826,0.137056,6.0,6.0,2.0,0.555556,0.555556
75%,751.0,0.5,0.556618,0.439222,0.279188,8.0,8.0,3.0,0.777778,0.777778
max,1000.0,1.0,1.0,1.0,1.0,10.0,10.0,4.0,1.0,1.0


In [12]:
# Count the distinct non-null values in each column (column-wise, NaN ignored)
unique_values = merged_df.nunique(axis=0, dropna=True)
unique_values

Customer_ID                           993
Transactions_Per_Customer              13
Average_Transaction_Amount            982
Total_Transaction_Amount              980
Recency                               148
Satisfaction_Score                     10
Likelihood_to_Recommend                10
Feedback_Comments_Encoded               5
Feedback_Comment                        5
Satisfaction_Score_Normalized          10
Likelihood_to_Recommend_Normalized     10
dtype: int64

In [14]:
# Transaction features to rescale with MinMaxScaler (default range [0, 1])
numerical_columns = [
    'Transactions_Per_Customer',
    'Average_Transaction_Amount',
    'Total_Transaction_Amount',
    'Recency',
]

# NOTE(review): the describe() output recorded earlier in this notebook
# already reports min 0.0 and max 1.0 for these four columns, so this step
# looks like it leaves the values unchanged -- confirm whether it is needed.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(merged_df[numerical_columns])

# Write the scaled values back over the original columns
merged_df[numerical_columns] = scaled_values

# Preview the first rows after normalization
merged_df.head()

Unnamed: 0,Customer_ID,Transactions_Per_Customer,Average_Transaction_Amount,Total_Transaction_Amount,Recency,Satisfaction_Score,Likelihood_to_Recommend,Feedback_Comments_Encoded,Feedback_Comment,Satisfaction_Score_Normalized,Likelihood_to_Recommend_Normalized
0,1,0.416667,0.535373,0.447358,0.13198,10.0,9,4,Excellent,1.0,0.888889
1,1,0.416667,0.535373,0.447358,0.13198,7.0,9,1,Good service,0.666667,0.888889
2,2,0.083333,0.460484,0.124909,0.340102,3.0,3,4,Needs improvement,0.222222,0.222222
3,2,0.083333,0.460484,0.124909,0.340102,7.0,5,3,Good service,0.666667,0.444444
4,2,0.083333,0.460484,0.124909,0.340102,3.0,7,4,Needs improvement,0.222222,0.666667


In [18]:
# Persist the merged table as CSV (relative path, so it lands in the current
# working directory); the row index is not written
merged_df.to_csv(path_or_buf="merged_customer_data.csv", index=False)