In [24]:
# Goal: perform predictive analysis using Random Forest and heterogeneous ensemble models.

In [None]:
# HETEROGENEOUS ENSEMBLE 

In [None]:
# churn_data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix



In [4]:
# Load the dataset

In [8]:
# Read the churn records from the CSV file into a DataFrame.
churn_data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")

In [6]:
# Preprocess the dataset

In [9]:
# --- Encoding (category mappings only; no scaling statistics are learned here) ---
# Drop the row number, customer id and surname columns.
churn_data = churn_data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# Integer-encode Gender.
label_encoder = LabelEncoder()
churn_data['Gender'] = label_encoder.fit_transform(churn_data['Gender'])

# One-hot encode Geography into geo_* columns and drop the original column.
geo_dummies = pd.get_dummies(churn_data['Geography'], prefix='geo')
churn_data = pd.concat([churn_data, geo_dummies], axis=1)
churn_data = churn_data.drop(['Geography'], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(churn_data.drop(['Exited'], axis=1), churn_data['Exited'], test_size=0.2, random_state=42)

# --- Scaling ---
# The scaler is fitted on the training rows only and then applied to the test
# rows. Fitting it on the full frame before the split (as was done previously)
# lets the test set's mean/variance leak into the training features.
# Consequence: the numeric columns of `churn_data` itself stay unscaled; only
# X_train / X_test hold standardised values.
numeric_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
scaler = StandardScaler()

# Work on explicit copies so the column assignment below never writes into
# (or warns about) a slice of `churn_data`.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [12]:
# Train a Random Forest model

In [13]:
# Build a 100-tree random forest with a fixed seed, then fit it on the
# training split. The fit call is left as the cell's last expression.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X=X_train, y=y_train)

In [14]:
# Evaluate the Random Forest model

In [15]:
# Predict on the held-out split, then report accuracy and the confusion matrix.
y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("Random Forest model accuracy:", rf_accuracy)
print("Random Forest model confusion matrix:\n", rf_confusion)


Random Forest model accuracy: 0.8655
Random Forest model confusion matrix:
 [[1547   60]
 [ 209  184]]


In [16]:
# Train a Heterogeneous Ensemble model

In [17]:
# Two further base learners alongside the random forest: a linear-kernel SVC
# (probability=True so it can supply class probabilities for soft voting)
# and a 5-nearest-neighbours classifier.
svc_model = SVC(kernel='linear', probability=True, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Named (label, estimator) pairs handed to the soft-voting ensemble.
base_learners = [
    ('rf', rf_model),
    ('svc', svc_model),
    ('knn', knn_model),
]
het_model = VotingClassifier(estimators=base_learners, voting='soft')
het_model.fit(X_train, y_train)

In [18]:
# Evaluate the Heterogeneous Ensemble model

In [23]:
# Predict on the held-out split with the voting ensemble, then report the
# same two metrics used for the random forest.
y_pred_het = het_model.predict(X_test)

het_accuracy = accuracy_score(y_test, y_pred_het)
het_confusion = confusion_matrix(y_test, y_pred_het)

print("Heterogeneous Ensemble model accuracy:", het_accuracy)
print("Heterogeneous Ensemble model confusion matrix:\n", het_confusion)

Heterogeneous Ensemble model accuracy: 0.8595
Heterogeneous Ensemble model confusion matrix:
 [[1576   31]
 [ 250  143]]


In [29]:
# insurance dataset

In [None]:
# Import the required libraries
# Load the insurance dataset
# Convert categorical variables to numerical variables
# Split the data into features (X) and target (y)
# Split the data into training and testing sets
# Define the three models to use in the ensemble
# Create a heterogeneous ensemble of the three models
# Fit the heterogeneous ensemble on the training data
# Predict on the test data
# Evaluate the performance of the ensemble

In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor

# Read the insurance records from the CSV file.
insurance_data = pd.read_csv("insurance.csv")

# One-hot encode the categorical columns.
categorical_columns = ['sex', 'smoker', 'region']
insurance_data = pd.get_dummies(insurance_data, columns=categorical_columns)

# Target is the 'charges' column; every other column is a feature.
y = insurance_data['charges']
X = insurance_data.drop('charges', axis=1)

# Hold out 20% of the rows for testing (fixed seed).
# NOTE(review): X_train / X_test / y_train / y_test are the same names the
# churn cells use, so after this cell runs they refer to the insurance split.
# Re-running an earlier churn cell afterwards would pick up the wrong data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

# Three different regressors, each with 100 estimators and the same seed.
model1 = RandomForestRegressor(n_estimators=100, random_state=1)
model2 = GradientBoostingRegressor(n_estimators=100, random_state=1)
model3 = XGBRegressor(n_estimators=100, random_state=1)

# Combine the three regressors in a voting ensemble and fit it on the
# training split.
base_regressors = [('rf', model1), ('gb', model2), ('xgb', model3)]
heterogeneous_ensemble = VotingRegressor(estimators=base_regressors).fit(X_train, y_train)

# Predict on the held-out rows.
# NOTE(review): the "_hom" suffix is misleading; these are the heterogeneous
# ensemble's predictions. Name kept so any later cell that uses it still works.
y_pred_hom = heterogeneous_ensemble.predict(X_test)

# Report the mean squared error on the held-out rows.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_hom)
print("MSE: ", mse)


MSE:  20865929.64334971


In [None]:
# VotingRegressor is used here instead of VotingClassifier because this task is regression rather than classification. Accordingly, the three base models are regressors rather than classifiers: RandomForestRegressor, GradientBoostingRegressor, and XGBRegressor.