In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier

# Load your dataset
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Convert 'TotalCharges' column to numeric and fill missing values with 0
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(0, inplace=True)

# Convert 'Churn' column to binary values
data['Churn'] = data['Churn'].map({'No': 0, 'Yes': 1})

# Split the data into a train-test split with a random state of 1
X = data[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
          'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
          'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure', 'MonthlyCharges', 'TotalCharges']]
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Feature Engineering: Scaling numerical features and one-hot encoding categorical features
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
               'Contract', 'PaperlessBilling', 'PaymentMethod']

# Standardize numerical features
scaler = StandardScaler()
X_train[numerical] = scaler.fit_transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])

# One-hot encode categorical features
# One-hot encode categorical features and retrieve column names
encoder = OneHotEncoder(sparse=False, drop='first')
X_train_encoded = encoder.fit_transform(X_train[categorical])
X_test_encoded = encoder.transform(X_test[categorical])
# Get the column names for the one-hot encoded features
encoded_feature_names = encoder.get_feature_names_out(input_features=categorical)

X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoded_feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoded_feature_names)

# Combine scaled numerical and one-hot encoded categorical features
X_train_final = pd.concat([X_train_encoded, X_train[numerical]], axis=1)
X_test_final = pd.concat([X_test_encoded, X_test[numerical]], axis=1)

# Train and evaluate models
random_state = 1

# Random Forest
rf_model = RandomForestClassifier(random_state=random_state)
rf_model.fit(X_train_final, y_train)
rf_accuracy = rf_model.score(X_test_final, y_test)

# Extra Trees
et_model = ExtraTreesClassifier(random_state=random_state)
et_model.fit(X_train_final, y_train)
et_accuracy = et_model.score(X_test_final, y_test)

# XGBoost
xgb_model = XGBClassifier(random_state=random_state)
xgb_model.fit(X_train_final, y_train)
xgb_accuracy = xgb_model.score(X_test_final, y_test)

# LightGBM
lgbm_model = LGBMClassifier(random_state=random_state)
lgbm_model.fit(X_train_final, y_train)
lgbm_accuracy = lgbm_model.score(X_test_final, y_test)

print("Random Forest Accuracy:", rf_accuracy)
print("Extra Trees Accuracy:", et_accuracy)
print("XGBoost Accuracy:", xgb_accuracy)
print("LightGBM Accuracy:", lgbm_accuracy)




ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values