In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import ttest_ind
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer



In [None]:
# Location of the Telco customer-churn CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw table into `df` and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Draw histograms of the frame's columns on a 12x8 inch canvas.
# NOTE(review): 'TotalCharges' is only converted to numeric in a later cell,
# so it is presumably absent from these histograms -- confirm.
HIST_FIGSIZE = (12, 8)
df.hist(figsize=HIST_FIGSIZE)
plt.show()

In [None]:
# Parse 'TotalCharges' as numbers; entries that cannot be parsed become NaN.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

# Store 'Churn' with a categorical dtype.
df['Churn'] = df['Churn'].astype('category')

# Box plot of 'TotalCharges' (y-axis) for each 'Churn' value (x-axis),
# drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='Churn', y='TotalCharges', ax=ax)
plt.show()


In [None]:
# Bar chart of the number of rows per 'PaymentMethod' value.
payment_ax = sns.countplot(data=df, x='PaymentMethod')
# Tilt the x tick labels by 45 degrees.
plt.setp(payment_ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Keep only the numeric columns and compute their pairwise correlations.
numeric_columns = df.select_dtypes(include='number')
corr_matrix = numeric_columns.corr()

# Annotated heatmap of the correlation matrix on a 5x5 inch figure.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
plt.show()


In [None]:
# Print the value frequencies of every numeric column.
cols = numeric_columns
for col in cols.columns:
    counts = df[col].value_counts()
    print(f"Value counts of {col} column")
    print(counts, '\n')

In [None]:
# Convert 'TotalCharges' to numeric; unparseable entries are coerced to NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many values are missing after the conversion.
missing_values = df['TotalCharges'].isnull().sum()
print(f"Number of missing values in 'TotalCharges': {missing_values}")

# Split 'TotalCharges' by churn outcome and drop the missing values in each
# group. ttest_ind propagates NaN by default, so a single missing value
# would turn both the statistic and the p-value into NaN.
churned = df.loc[df['Churn'] == 'Yes', 'TotalCharges'].dropna()
not_churned = df.loc[df['Churn'] == 'No', 'TotalCharges'].dropna()

# Two-sample t-test comparing the mean 'TotalCharges' of the two groups.
t_statistic, p_value = ttest_ind(churned, not_churned)
print(f'T-statistic: {t_statistic}, p-value: {p_value}')

In [None]:
# Columns used as clustering inputs.
cluster_features = ['MonthlyCharges', 'TotalCharges']

# Replace missing values in the clustering columns with the column mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = df[cluster_features].copy()
X_imputed[cluster_features] = imputer.fit_transform(X_imputed[cluster_features])

# Fit 3 clusters and store each row's cluster label on the frame.
# random_state pins the centroid initialisation so the labels are the same
# on every run; the 'Cluster' column is later used as a model feature.
kmeans = KMeans(n_clusters=3, n_init=4, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_imputed)

In [None]:
# Features are every column except the target.
X = df.drop(columns=['Churn'])
y = df['Churn']

# Drop the per-customer identifier when present: it is a non-numeric column,
# so get_dummies would expand it into one dummy column per distinct value,
# which adds noise features without carrying any generalisable signal.
# NOTE(review): assumes the identifier column is named 'customerID' -- confirm.
X = X.drop(columns=['customerID'], errors='ignore')

# Handle missing values: numeric columns are filled with the column mean,
# non-numeric columns with their most frequent value.
# NOTE(review): the imputers are fit on the full dataset, before the
# train/test split below.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

X[numeric_cols] = numeric_imputer.fit_transform(X[numeric_cols])
X[categorical_cols] = categorical_imputer.fit_transform(X[categorical_cols])

# One-hot encode the non-numeric columns, dropping the first level of each.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split data into training and testing sets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Build and evaluate a Random Forest classifier; the seed makes the reported
# metrics reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))




In [None]:
# One-hot encode 'gender', 'Churn' and 'PaymentMethod' in a single pass,
# dropping the first level of each. The dummy columns are appended after the
# untouched columns in the order listed here.
encoded_columns = ['gender', 'Churn', 'PaymentMethod']
df_encoded = pd.get_dummies(df, columns=encoded_columns, drop_first=True)


In [None]:
# Show only the columns each label refers to. Printing the whole frame under
# both labels produced two identical outputs.
print("After encoding 'Churn':")
print(df_encoded.filter(like='Churn').head())

print("After encoding 'PaymentMethod':")
print(df_encoded.filter(like='PaymentMethod').head())

In [None]:
# Check the column names in the DataFrame.
print(df.columns)

# Numeric columns expected to be present in the frame.
numerical_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

# Check the data types of the columns.
print(df.dtypes)

# List any expected column that is absent from the DataFrame.
# (The dtype print and this check were previously repeated verbatim.)
missing_columns = [col for col in numerical_features if col not in df.columns]
print("Missing columns:", missing_columns)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the numerical features to impute before fitting.
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Define the target variable.
target_variable = 'Churn'

# Work on copies so the imputation does not write into the frames returned
# by train_test_split (which can trigger SettingWithCopyWarning).
X_train_lr = X_train.copy()
X_test_lr = X_test.copy()

# Impute missing values with the mean; the imputer is fit on the training
# rows only and then applied to the test rows.
imputer = SimpleImputer(strategy='mean')
X_train_lr[numerical_features] = imputer.fit_transform(X_train_lr[numerical_features])
X_test_lr[numerical_features] = imputer.transform(X_test_lr[numerical_features])

# Initialize and train the Logistic Regression model. max_iter is raised
# above the default of 100 to give the solver more room to converge.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_lr, y_train)

# Predict with THIS model. Without this line the metrics below would score
# the `y_pred` left over from the earlier random-forest cell.
y_pred = model.predict(X_test_lr)

# Probability of the positive class 'Yes', looked up by label rather than by
# assuming its column position.
yes_index = list(model.classes_).index('Yes')
y_pred_proba = model.predict_proba(X_test_lr)[:, yes_index]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Yes')
recall = recall_score(y_test, y_pred, pos_label='Yes')
f1 = f1_score(y_test, y_pred, pos_label='Yes')
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the evaluation metrics.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(f'ROC AUC Score: {roc_auc:.2f}')

In [None]:
# Fit a seeded Random Forest on the training split (fit returns the estimator).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Pair every training column with its importance score, highest first.
feature_importances = model.feature_importances_
feature_names = X_train.columns
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the top N features, most important at the top.
top_n = 4
top_features = feature_importance_df.head(top_n)
bar_positions = range(top_n)

plt.figure(figsize=(5, 6))
plt.barh(bar_positions, top_features['Importance'], align='center')
plt.yticks(bar_positions, top_features['Feature'])
plt.xlabel('Feature Importance')
plt.title(f'Top {top_n} Most Important Features')
plt.gca().invert_yaxis()
plt.show()
