In [2]:
import pandas as pd

# Load the dataset
feedback = pd.read_csv('/Users/samleonor/Desktop/ITDA/DMP:MLPA/CFeedback.csv')

# Inspect the dataset
print(feedback.head())
print(feedback.info())
print(feedback.describe())
print(feedback.isnull().sum()) 

# Fill missing values in Satisfaction_Score with the median
feedback["Satisfaction_Score"] = feedback["Satisfaction_Score"].fillna(feedback["Satisfaction_Score"].median())

# Cap Satisfaction_Score at 10
feedback["Satisfaction_Score"] = feedback["Satisfaction_Score"].clip(upper=10)

# Clean Feedback_Comments
feedback["Feedback_Comments"] = (
    feedback["Feedback_Comments"]
    .str.lower()  # Convert to lowercase
    .str.strip()  # Remove leading/trailing spaces
    .str.replace(r"[^\w\s]", "", regex=True)  # Remove special characters
)

# Categorize Satisfaction_Score into levels
bins = [0, 4, 7, 10]
labels = ["Low", "Medium", "High"]
feedback["Satisfaction_Level"] = pd.cut(
    feedback["Satisfaction_Score"], bins=bins, labels=labels, include_lowest=True
)

# Dataset check
print(feedback.head())
print(feedback.info())
print(feedback.describe())
print(feedback.isnull().sum())

   Customer_ID  Satisfaction_Score  Feedback_Comments  Likelihood_to_Recommend
0            1                10.0     Very satisfied                        9
1            2                 3.0     Very satisfied                        3
2            3                10.0     Very satisfied                        1
3            4                 7.0  Needs improvement                        4
4            5                 8.0     Unsatisfactory                        7
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5050 entries, 0 to 5049
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Customer_ID              5050 non-null   int64  
 1   Satisfaction_Score       4949 non-null   float64
 2   Feedback_Comments        5050 non-null   object 
 3   Likelihood_to_Recommend  5050 non-null   int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 157.9+ KB
None
       Customer_ID  Satisfa

In [4]:
import pandas as pd

# Load the dataset
proffering = pd.read_csv('/Users/samleonor/Desktop/ITDA/DMP:MLPA/PrOffering.csv')

# Clean up column names
proffering.columns = proffering.columns.str.strip()

# Remove duplicates based on Product_ID
proffering.drop_duplicates(subset=["Product_ID"], inplace=True)

# Drop the 'Target_Age_Group' column
if 'Target_Age_Group' in proffering.columns:
    proffering.drop(columns=['Target_Age_Group'], inplace=True)

# Map Risk_Level to numeric values
risk_map = {'Low': 1, 'Medium': 2, 'High': 3}
proffering['Risk_Level_numeric'] = proffering['Risk_Level'].map(risk_map)

print(proffering.head())

   Product_ID                   Product_Name     Product_Type Risk_Level  \
0           1           Platinum Credit Card      Credit Card     Medium   
1           2           Gold Savings Account  Savings Account        Low   
2           3  High-Yield Investment Account       Investment       High   
3           4                  Mortgage Loan             Loan     Medium   
4           5                      Auto Loan             Loan     Medium   

  Target_Income_Group  Risk_Level_numeric  
0              Medium                   2  
1                 Low                   1  
2                High                   3  
3                High                   2  
4              Medium                   2  


In [6]:
import pandas as pd

# Load the dataset
transactions = pd.read_csv('/Users/samleonor/Desktop/ITDA/DMP:MLPA/Transactions.csv')

# Ensure Product_ID is correctly mapped based on Transaction_Type
transaction_type_map = {
    'Purchase': 1,
    'Bill Payment': 2,
    'Investment': 3
}
transactions['Product_ID'] = transactions['Transaction_Type'].map(transaction_type_map)

# Fill missing values in Transaction_Amount with 0
transactions['Transaction_Amount'] = transactions['Transaction_Amount'].fillna(0)

# Replace 0 values in Transaction_Amount with the mean (excluding zeros)
mean_value = transactions[transactions['Transaction_Amount'] != 0]['Transaction_Amount'].mean()
transactions['Transaction_Amount'] = transactions['Transaction_Amount'].replace(0.0, mean_value)

# Convert Transaction_Date to datetime format
transactions['Transaction_Date'] = pd.to_datetime(transactions['Transaction_Date'])

# Extract month and weekday from Transaction_Date
transactions['Transaction_Month'] = transactions['Transaction_Date'].dt.month
transactions['Transaction_Weekday'] = transactions['Transaction_Date'].dt.day_name()

# Dataset check
print(transactions.head())
print(transactions[['Transaction_Date', 'Transaction_Month', 'Transaction_Weekday']].head())
print(transactions.isnull().sum())
print(transactions.dtypes)

   Transaction_ID  Customer_ID    Transaction_Date  Transaction_Amount  \
0               1          393 2023-01-01 00:00:00         3472.000000   
1               2          826 2023-01-01 01:00:00         3094.726465   
2               3          916 2023-01-01 02:00:00           10.000000   
3               4          109 2023-01-01 03:00:00           72.000000   
4               5          889 2023-01-01 04:00:00         1793.000000   

  Transaction_Type  Product_ID  Transaction_Month Transaction_Weekday  
0         Purchase         1.0                  1              Sunday  
1     Bill Payment         2.0                  1              Sunday  
2         Purchase         1.0                  1              Sunday  
3       Investment         3.0                  1              Sunday  
4       Investment         3.0                  1              Sunday  
     Transaction_Date  Transaction_Month Transaction_Weekday
0 2023-01-01 00:00:00                  1              Sunday
1

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Check class distribution
print("Class Distribution:\n", merged_data['Purchased'].value_counts())

# Feature Engineering
features = [
    'Satisfaction_Score', 'Likelihood_to_Recommend', 'Transaction_Amount',
    'Transaction_Month', 'Risk_Level_numeric'
]

# Add one-hot encoded weekday columns
weekday_columns = [col for col in merged_data.columns if 'Transaction_Weekday_' in col]
features.extend(weekday_columns)

# Define X (features) and y (target variable)
x = merged_data[features]
y = merged_data['Purchased']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the Logistic Regression Model
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"ROC-AUC: {roc_auc}")

# Interpret results
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

Class Distribution:
 Purchased
0    12831
1     6314
Name: count, dtype: int64
Accuracy: 0.5411334552102377
Precision: 0.3603504928806134
Recall: 0.5276663993584603
F1-Score: 0.428246013667426
ROC-AUC: 0.5330969384617582
Model Coefficients: [[-0.00074671  0.00341052 -0.02935525  0.09008418 -0.01796472 -0.06384409
  -0.00674278 -0.06503396  0.01369161 -0.06550374 -0.09473139]]
Model Intercept: [-0.00313538]
