In [41]:
# import
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [42]:
df = pd.read_csv(r'../Datasets/autoinsurance_churn_small.csv')
df.head(5)

Unnamed: 0,individual_id,address_id,days_tenure,cust_orig_date,date_of_birth,city,county,marital_status,home_market_value,home_owner,college_degree,good_credit,acct_suspd_date,Churn
0,221300017872,521300005982,1454,2018-12-09,1978-06-23,Kaufman,Kaufman,Married,50000 - 74999,1,1,1,,0
1,221300134410,521300051115,1795,2018-01-02,1950-05-30,Grand Prairie,Dallas,Single,50000 - 74999,1,0,0,,0
2,221300673028,521300247929,4818,2009-09-23,1967-07-07,Dallas,Dallas,Married,75000 - 99999,1,0,0,,0
3,221301573419,521300570147,130,2022-07-25,1969-05-25,Arlington,Tarrant,Married,175000 - 199999,1,0,1,2021-12-22,1
4,221301622347,521300588399,5896,2006-10-11,1972-09-25,Fort Worth,Tarrant,Married,225000 - 249999,1,1,1,,0


In [43]:
# Preprocessing steps (adjust based on your dataset)
X = df.drop(['Churn'], axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [45]:
# Define and train the Logistic Regression model
lr_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(random_state=42))])
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
print('Logistic Regression Accuracy:', accuracy_score(y_test, lr_predictions))

Logistic Regression Accuracy: 0.99965


In [46]:
# Define and train the Decision Tree model
dt_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', DecisionTreeClassifier(random_state=42))])
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
print('Decision Tree Accuracy:', accuracy_score(y_test, dt_predictions))

Decision Tree Accuracy: 0.99925


In [47]:
# Define and train the Random Forest model
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print('Random Forest Accuracy:', accuracy_score(y_test, rf_predictions))