In [None]:
# Import necessary packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Read the training and test tables and replace every missing value with 0.
# The fill is applied to all columns, whatever their dtype.
train_df = pd.read_csv("/home/jovyan/work/train.csv").fillna(0)
test_df = pd.read_csv("/home/jovyan/work/test.csv").fillna(0)

# Identify the categorical (object-dtype) columns from the training data,
# leaving out the identifier and the target so they are never one-hot encoded.
categorical_columns = [
    col
    for col in train_df.select_dtypes(include=['object']).columns
    if col not in ('CustomerID', 'Churn')
]

# Pin every categorical column, in BOTH frames, to the levels seen in training.
# Why: get_dummies(drop_first=True) drops the first level of whatever levels it
# finds in the frame it is given. Encoding train and test independently can
# therefore drop a *different* level in each frame; the later reindex would then
# silently turn those test rows into all-zero (baseline) rows. With shared
# levels the same dummy columns are produced, and the same one dropped, on both
# sides. A test value never seen in training becomes missing and encodes as all
# zeros for that feature.
for col in categorical_columns:
    train_levels = pd.Categorical(train_df[col]).categories
    train_df[col] = pd.Categorical(train_df[col], categories=train_levels)
    test_df[col] = pd.Categorical(test_df[col], categories=train_levels)

# Convert categorical features to numeric using one-hot encoding
train_df = pd.get_dummies(train_df, columns=categorical_columns, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_columns, drop_first=True)

# Separate the feature matrix from the identifier and the target column
X = train_df.drop(['CustomerID', 'Churn'], axis=1)
y = train_df['Churn']

# Give the test features exactly the training columns, in the same order;
# any column absent from the test frame is created and filled with 0.
X_test = test_df.drop(['CustomerID'], axis=1)
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion only
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows: keep column 1 of the probability matrix and
# measure how well it ranks the validation labels (ROC AUC).
val_probabilities = rf_model.predict_proba(X_val)
y_val_pred = val_probabilities[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred)
print("Validation ROC AUC Score:", roc_auc)

# Score the test set; column 1 of predict_proba is the probability of the
# second class listed in rf_model.classes_.
predicted_probability = rf_model.predict_proba(X_test)[:, 1]

# Build the submission frame. The IDs come from the test frame that is already
# in memory (CustomerID is excluded from one-hot encoding, so the column is
# still present) instead of parsing the whole test CSV from disk a second time.
prediction_df = pd.DataFrame({
    'CustomerID': test_df['CustomerID'],
    'predicted_probability': predicted_probability,
})

# Save the submission file without the index column
prediction_df.to_csv("/home/jovyan/work/prediction_submission.csv", index=False)

# Show the size and the first rows of the submission for a quick visual check
print(prediction_df.shape)
print(prediction_df.head())

# Sanity checks on the submission layout: 104480 rows and both named columns
row_count, column_count = prediction_df.shape
submission_columns = prediction_df.columns
assert row_count == 104480, 'The dataframe prediction_df should have 104480 rows.'
assert column_count == 2, 'The dataframe prediction_df should have 2 columns.'
assert 'CustomerID' in submission_columns, 'The first column name should be CustomerID.'
assert 'predicted_probability' in submission_columns, 'The second column name should be predicted_probability.'
