## Step 1: Import Required Libraries
We will use pandas and numpy for data handling, matplotlib and seaborn for visualization, and scikit-learn for machine learning.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


## Step 2: Load Dataset
We will use a publicly available Telco Customer Churn dataset.


In [2]:
# Read the raw Telco Customer Churn CSV into a DataFrame.
# pandas is already imported as `pd` by the imports cell at the top of the
# notebook, so it is not imported a second time here.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists. Consider a path relative to the notebook.
data = pd.read_csv(r"C:\Users\HARI\Downloads\WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Preview the first five rows.
data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Step 3: Explore the Dataset
In this step, we analyze the dataset to understand its structure, data types, and target variable distribution.


In [3]:
# `data` was loaded by the previous cell. It is deliberately not re-read here:
# re-running this exploration cell must not reset `data` to the raw frame.

# Number of rows and columns. A bare expression is only rendered when it is the
# last line of a cell, so intermediate results are passed to display(), which
# IPython/Jupyter kernels provide as a builtin.
display(data.shape)

# Column names, non-null counts and dtypes (info() prints its report itself).
data.info()

# Statistical summary of the numerical columns.
display(data.describe())

# Target variable distribution.
data['Churn'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Churn
No     5174
Yes    1869
Name: count, dtype: int64

## Step 4: Data Preprocessing
We clean the data by removing unnecessary columns, handling missing values, and converting categorical data into numerical format.


In [5]:
# Rebuild the modelling frame from the raw CSV in one chained expression, so
# re-running this cell always starts from the same input.
data = (
    pd.read_csv(r"C:\Users\HARI\Downloads\WA_Fn-UseC_-Telco-Customer-Churn.csv")
    # customerID is dropped because it does not help in prediction.
    .drop(columns='customerID')
    .assign(
        # Target as numeric: 'Yes' -> 1, 'No' -> 0.
        Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}),
        # Entries that cannot be parsed as numbers become NaN.
        TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'),
    )
    # Discard every row that contains a missing value.
    .dropna()
)


## Step 5: Encode Categorical Features
Machine learning models require numerical input. We use one-hot encoding to convert categorical features into numbers.


In [None]:
# The object-dtype columns are the ones that still need encoding.
categorical_cols = data.select_dtypes(include=['object']).columns

# One-hot encode those columns; drop_first=True omits the first level of each.
data = pd.get_dummies(data, drop_first=True, columns=categorical_cols)

# Preview the encoded frame.
data.head()


## Step 6: Split Features and Target
X contains input features and y contains the target variable.


In [None]:
# Target: the 'Churn' column.
y = data['Churn']
# Features: every other column.
X = data.drop(columns='Churn')


## Step 7: Train-Test Split
We split the dataset into training and testing sets to evaluate model performance.


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


## Step 8: Model Training
We use Logistic Regression for binary classification.


In [None]:
# Create the classifier (at most 1000 solver iterations) and fit it on the
# training split. fit() returns the estimator itself, so `model` is the fitted
# classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
model


## Step 9: Model Evaluation
We evaluate the model using accuracy score and confusion matrix.


In [None]:
# Predicted labels for the held-out test features.
y_pred = model.predict(X_test)

# Confusion matrix and overall accuracy against the true test labels.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Show both results as the cell output.
(accuracy, conf_matrix)


## Step 10: Prediction
The trained model predicts whether a customer will churn or not.


In [None]:
# Take the first test row, kept as a one-row frame so predict() gets 2-D input.
sample = X_test.iloc[:1]
prediction = model.predict(sample)

# Label 1 is the churn class ('Yes' was mapped to 1 during preprocessing).
print("Customer will churn" if prediction[0] == 1 else "Customer will not churn")


## Step 11: Data Visualization
Visualizing churn distribution helps understand customer behavior.


In [None]:
# Bar chart of how many customers fall into each Churn class.
# An explicit figure/axes pair is used so the plot can be labelled directly.
fig, ax = plt.subplots()
sns.countplot(x='Churn', data=data, ax=ax)

# Title and axis labels let the figure be read on its own. The 0/1 legend
# follows the preprocessing step's mapping ('Yes' -> 1, 'No' -> 0) and assumes
# that step has already been run on `data`.
ax.set(
    title='Customer churn distribution',
    xlabel='Churn (0 = No, 1 = Yes)',
    ylabel='Number of customers',
)
plt.show()
