<a href="https://colab.research.google.com/github/noureldeenhossam/uneeq-internship/blob/main/uneeq_task_2_customer_churn_prediction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Read the training split of the churn data into a DataFrame
training_csv = 'customer_churn_dataset-training-master.csv'
df = pd.read_csv(training_csv)

In [None]:
# Count the missing values in each column of the raw frame
df.isna().sum()

Unnamed: 0,0
CustomerID,1
Age,1
Gender,1
Tenure,1
Usage Frequency,1
Support Calls,1
Payment Delay,1
Subscription Type,1
Contract Length,1
Total Spend,1


In [None]:
# Remove every row that has at least one missing value (df is modified in place)
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-count missing values per column after the incomplete rows were dropped
df.isna().sum()

Unnamed: 0,0
CustomerID,0
Age,0
Gender,0
Tenure,0
Usage Frequency,0
Support Calls,0
Payment Delay,0
Subscription Type,0
Contract Length,0
Total Spend,0


In [None]:
# Encode categorical variables using LabelEncoder
# Fit a separate encoder for each categorical column and keep all of them.
# Refitting a single shared encoder would discard the label -> code mapping of
# every column except the last one, making it impossible to apply the exact
# same encoding to another dataset later.
categorical_columns = ['Gender', 'Subscription Type', 'Contract Length']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    # Replace the raw labels in df with their integer codes
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep the original name bound: it refers to the encoder fitted on the last
# column ('Contract Length'), exactly as it did before.
label_encoder = label_encoders['Contract Length']


In [None]:
# Target vector (y): the churn label.
# Feature matrix (X): every column except the customer ID and the label itself.
y = df['Churn']
X = df.drop(['CustomerID', 'Churn'], axis='columns')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit a logistic regression classifier on the training portion
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

In [None]:
# Predicted churn labels for the held-out rows
y_pred = log_reg.predict(X=X_test)


In [None]:

# Overall accuracy on the held-out split, printed with two decimals
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 / support summary
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.85
              precision    recall  f1-score   support

         0.0       0.81      0.85      0.83     38063
         1.0       0.88      0.85      0.87     50104

    accuracy                           0.85     88167
   macro avg       0.85      0.85      0.85     88167
weighted avg       0.85      0.85      0.85     88167



In [None]:
# Function to test the model on another dataset
def test_model_on_new_data(model, new_data_path, encoders=None):
    """Evaluate a fitted classifier on a labelled CSV file and print the results.

    The file is read, rows containing missing values are dropped, the columns
    'Gender', 'Subscription Type' and 'Contract Length' are label-encoded, and
    the model's predictions are compared with the 'Churn' column. Accuracy and
    a classification report are printed.

    Parameters
    ----------
    model : object with a ``predict`` method
        An already-fitted classifier. It is called on every column of the file
        except 'CustomerID' and 'Churn'.
    new_data_path : str or path-like
        CSV file to evaluate on. It must contain 'CustomerID', 'Churn' and the
        three categorical columns listed above.
    encoders : mapping of column name -> fitted encoder, optional
        Encoders that were fitted on the training data, keyed by column name.
        Each must expose ``transform``. Passing them guarantees that labels are
        mapped to exactly the codes the model was trained on. When omitted, a
        fresh ``LabelEncoder`` is fitted on the new file's raw labels for each
        column; because ``LabelEncoder`` numbers classes in sorted order, this
        reproduces the training codes only if the new file contains the same
        set of labels as the training data.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    categorical_columns = ['Gender', 'Subscription Type', 'Contract Length']

    # Load the new dataset and drop rows with any missing values
    new_df = pd.read_csv(new_data_path).dropna()

    # Encode categorical variables.
    # The encoder must never be fitted on a mix of raw labels and values from a
    # frame whose columns already hold integer codes: the resulting codes would
    # not match the ones the model saw during training.
    for column in categorical_columns:
        if encoders is not None:
            column_encoder = encoders[column]
        else:
            column_encoder = LabelEncoder().fit(new_df[column])
        new_df[column] = column_encoder.transform(new_df[column])

    # Split the new data into features and target
    X_new = new_df.drop(columns=['CustomerID', 'Churn'])
    y_new = new_df['Churn']

    # Predict on the new dataset
    y_new_pred = model.predict(X_new)

    # Calculate accuracy and print classification report
    new_accuracy = accuracy_score(y_new, y_new_pred)
    print(f"Accuracy on Test Data: {new_accuracy:.2f}")
    print(classification_report(y_new, y_new_pred))

# Location of the separate testing CSV
test_file_path = '/content/customer_churn_dataset-testing-master.csv'

# Evaluate the trained model on that file; the function prints accuracy and the report
test_model_on_new_data(model=log_reg, new_data_path=test_file_path)

Accuracy on Test Data: 0.69
              precision    recall  f1-score   support

           0       0.81      0.54      0.65     33881
           1       0.63      0.86      0.72     30493

    accuracy                           0.69     64374
   macro avg       0.72      0.70      0.69     64374
weighted avg       0.72      0.69      0.68     64374

