In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_error as mae

%matplotlib inline
pd.options.display.max_rows = 50

# Load the dataset
df = pd.read_csv("Data_Marketing_Customer_Analysis_Round3.csv")

# Separate numerical and categorical columns
numerical = df.select_dtypes(include=[np.number])
categorical = df.select_dtypes(include=[object])

X_df = numerical.drop("total_claim_amount", axis=1)
X = X_df
y = df["total_claim_amount"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply linear regression
lm = LinearRegression()
model = lm.fit(X_train_scaled, y_train)

# Model Interpretation
print(f'Intercept: {model.intercept_}')
coeff_df = pd.DataFrame(model.coef_, X_df.columns, columns=['Coefficient'])
print(coeff_df)

# Model Evaluation
print(f'MSE: {mse(y_test, y_pred)}')
print(f'RMSE: {np.sqrt(mse(y_test,y_pred))}')
print(f'MAE: {mae(y_test, y_pred)}')
print(f'R2: {r2_score(y_test, y_pred)}')
n = len(y_test)
p = X_test_scaled.shape[1]
adj_r2 = 1 - (1-r2_score(y_test, y_pred))*(n-1)/(n-p-1)
print(f'Adjusted R2: {adj_r2}')

# Feature Importance
feature_importance = coeff_df.sort_values(by='Coefficient', ascending=False)
print("Feature Importance:")
print(feature_importance)

# Rerun the model with hot encoded categorical variables
categorical_encoded = pd.get_dummies(categorical, drop_first=True)
X_df = pd.concat([numerical.drop("total_claim_amount", axis=1), categorical_encoded], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Rescale the new data
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Retrain and predict
model = lm.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Model Evaluation for new features
print(f'MSE after One-Hot Encoding: {mse(y_test, y_pred)}')
print(f'RMSE after One-Hot Encoding: {np.sqrt(mse(y_test,y_pred))}')
print(f'MAE after One-Hot Encoding: {mae(y_test, y_pred)}')
print(f'R2 after One-Hot Encoding: {r2_score(y_test, y_pred)}')
n = len(y_test)
p = X_test_scaled.shape[1]
adj_r2 = 1 - (1-r2_score(y_test, y_pred))*(n-1)/(n-p-1)
print(f'Adjusted R2 after One-Hot Encoding: {adj_r2}')

# (Optional) Rerun the model after removing outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df_no_outliers = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

X_no_outliers = df_no_outliers.drop("total_claim_amount", axis=1)
y_no_outliers = df_no_outliers["total_claim_amount"]
X_train_no_outliers, X_test_no_outliers, y_train_no_outliers, y_test_no_outliers = train_test_split(X_no_outliers, y_no_outliers, test_size=0.2, random_state=42)

# Scaling without outliers
scaler.fit(X_train_no_outliers)
X_train_scaled_no_outliers = scaler.transform(X_train_no_outliers)
X_test_scaled_no_outliers = scaler.transform(X_test_no_outliers)

# Training without outliers
model_no_outliers = lm.fit(X_train_scaled_no_outliers, y_train_no_outliers)
y_pred_no_outliers = model_no_outliers.predict(X_test_scaled_no_outliers)

# Model Evaluation after removing outliers
print(f'R2 without outliers: {r2_score(y_test_no_outliers, y_pred_no_outliers)}')


Intercept: 322.69400522001615
                               Coefficient
customer_lifetime_value         -78.617213
income                         -122.106613
monthly_premium_auto           1293.822169
months_since_last_claim           0.441837
months_since_policy_inception   -10.676884
number_of_open_complaints         1.593050
number_of_policies                8.029217


NameError: name 'y_pred' is not defined