In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Create a sample customer dataset
# All random columns are drawn from one explicitly seeded generator so that
# the synthetic data (and every number derived from it further down) is
# identical on each Restart & Run All. Change SEED to get a different sample.
SEED = 42
rng = np.random.default_rng(SEED)

N_CUSTOMERS = 100  # number of rows to generate

data = {
    'customer_id': np.arange(1, N_CUSTOMERS + 1),        # sequential ids 1..N_CUSTOMERS
    'age': rng.integers(18, 65, size=N_CUSTOMERS),       # upper bound is exclusive: 18..64
    'tenure': rng.integers(1, 10, size=N_CUSTOMERS),     # upper bound is exclusive: 1..9
    'churn': rng.choice([0, 1], size=N_CUSTOMERS)        # 0: No churn, 1: Churn
}

# Load the data into a pandas DataFrame
df = pd.DataFrame(data)

# Step 2: Data Aggregation - mean age and mean tenure for each churn label.
# Grouping on 'churn' yields one row per label value; reset_index() turns the
# group key back into an ordinary 'churn' column.
churn_agg = df.groupby('churn')[['age', 'tenure']].mean().reset_index()

print("Aggregated data (average age and tenure of churned vs non-churned customers):\n", churn_agg)

# Step 3: Data Splitting
# Predictors are the 'age' and 'tenure' columns; the label to predict is 'churn'.
feature_columns = ['age', 'tenure']
X = df[feature_columns]
y = df['churn']

# Hold out 20% of the rows for testing. The fixed random_state makes the
# partition repeatable for a given input frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Step 4: Model Training - Logistic Regression classifier with default settings.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Predicted churn labels for the held-out rows
y_pred = model.predict(X_test)

# Step 5: Evaluate - share of held-out rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the Logistic Regression model: {accuracy:.2f}")

Aggregated data (average age and tenure of churned vs non-churned customers):
    churn        age    tenure
0      0  40.313725  5.058824
1      1  39.938776  5.795918
Training set size: (80, 2)
Testing set size: (20, 2)
Accuracy of the Logistic Regression model: 0.70
