In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from scipy.stats import ttest_ind
from sklearn.cluster import KMeans



In [None]:
# Location of the Telco customer-churn CSV inside the Kaggle input directory.
telco_csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw data into `df` and preview the first rows as the cell output.
df = pd.read_csv(telco_csv_path)
df.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes) to spot
# missing values and columns that were not parsed as numbers.
df.info()

In [None]:
# Histograms of every numeric column.
# DataFrame.hist only plots numeric columns. 'TotalCharges' is coerced to numeric
# later in the notebook, so when it is still stored as text at this point it would
# be silently left out of the grid. Coerce it on a temporary frame (df itself is
# not modified here); values that cannot be parsed become NaN and are not plotted.
hist_frame = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
hist_frame.hist(figsize=(12, 8))
plt.tight_layout()  # keep subplot titles from overlapping neighbouring axes
plt.show()

In [None]:
# Parse 'TotalCharges' as numbers; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Store the 'Churn' labels as a categorical column.
churn_as_category = df['Churn'].astype('category')
df['Churn'] = churn_as_category

# Box plot of 'TotalCharges' (y-axis) for each 'Churn' value (x-axis).
sns.boxplot(data=df, x='Churn', y='TotalCharges')
plt.show()


In [None]:
# Bar chart of the number of rows per 'PaymentMethod' value.
payment_method_ax = sns.countplot(data=df, x='PaymentMethod')

# Rotate the x tick labels by 45 degrees before rendering the figure.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of df.
numeric_columns = df.select_dtypes(include=['number'])
corr_matrix = numeric_columns.corr()

# Annotated heatmap: each cell shows the coefficient rounded to two decimals.
heatmap_options = dict(annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.show()


In [None]:
# Two-sample t-test: does mean 'TotalCharges' differ between churned and retained customers?

# Convert 'TotalCharges' to numeric, coercing unparseable entries to NaN.
# This is a no-op when the column is already numeric, so the cell can be re-run safely.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many values are missing after coercion.
missing_values = df['TotalCharges'].isnull().sum()
print(f"Number of missing values in 'TotalCharges': {missing_values}")

# Drop missing values within each group before testing. ttest_ind propagates NaN
# by default, so a single missing entry would make both the statistic and the
# p-value come out as nan.
churned = df.loc[df['Churn'] == 'Yes', 'TotalCharges'].dropna()
not_churned = df.loc[df['Churn'] == 'No', 'TotalCharges'].dropna()

# Perform the t-test on the complete observations only.
t_statistic, p_value = ttest_ind(churned, not_churned)
print(f'T-statistic: {t_statistic}, p-value: {p_value}')

In [None]:
# Segment customers into three clusters based on their charges and store the
# cluster label of every row in df['Cluster'].

# Replace missing values in 'MonthlyCharges' and 'TotalCharges' with the column mean
# (KMeans does not accept NaN). Work on a copy so the original columns keep their NaNs.
imputer = SimpleImputer(strategy='mean')
X_imputed = df[['MonthlyCharges', 'TotalCharges']].copy()
X_imputed[['MonthlyCharges', 'TotalCharges']] = imputer.fit_transform(X_imputed[['MonthlyCharges', 'TotalCharges']])

# KMeans starts from random centroids; without a fixed random_state the cluster
# labels can change from run to run, which would also change any later step that
# uses 'Cluster' as a feature. Seed it so Restart & Run All is reproducible.
# NOTE(review): the two features are clustered unscaled, so the column with the
# larger numeric range dominates the distances - consider standardising first.
kmeans = KMeans(n_clusters=3, n_init=4, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_imputed)

In [None]:
# Train and evaluate a Random Forest classifier that predicts 'Churn'.

# Separate features and target. 'customerID' is dropped together with the target:
# as an identifier column it would be one-hot encoded into one dummy column per
# distinct ID, inflating the feature matrix without adding generalisable signal.
# errors='ignore' keeps the cell working if the ID column was already removed.
X = df.drop(columns=['Churn', 'customerID'], errors='ignore')
y = df['Churn']

numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Decide the train/test rows BEFORE imputing so that the imputation statistics
# are learned from training rows only (fitting on all rows leaks test data).
# Positions are split instead of the frame itself so the same rows can be
# selected again after encoding.
train_pos, test_pos = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Handle missing values: numeric columns get the training mean, categorical
# columns get the most frequent training value.
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

if len(numeric_cols) > 0:
    numeric_imputer.fit(X.iloc[train_pos][numeric_cols])
    X[numeric_cols] = numeric_imputer.transform(X[numeric_cols])
if len(categorical_cols) > 0:
    categorical_imputer.fit(X.iloc[train_pos][categorical_cols])
    X[categorical_cols] = categorical_imputer.transform(X[categorical_cols])

# One-hot encode on the full frame so train and test share identical columns.
X_encoded = pd.get_dummies(X, drop_first=True)

# Split data into training and testing sets using the positions chosen above.
X_train, X_test = X_encoded.iloc[train_pos], X_encoded.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Build and evaluate a Random Forest classifier; the seed makes the reported
# metrics reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


