<a href="https://colab.research.google.com/github/opeokupe/capstone-ai-ml/blob/main/data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Churn Rate Analysis & Data Preprocessing
## Imperial College Capstone Project


In [2]:
# 1. Initial Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read both CSV splits from the shared data directory (train first, then test)
train_data, test_data = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

FileNotFoundError: [Errno 2] No such file or directory: '../data/train.csv'

In [1]:


# 2. Initial Data Exploration
## 2.1 Basic Data Overview
def explore_data(df, title="Dataset"):
    """
    Print a quick textual overview of a DataFrame.

    Writes, in order: the shape, the first rows, the ``DataFrame.info()``
    summary, the per-column missing-value counts and the ``describe()``
    statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarise.
    title : str, optional
        Label used in the report heading.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"\n{title} Exploration:")
    print("-" * 50)
    print(f"Shape: {df.shape}")
    print("\nFirst few rows:")
    print(df.head())
    print("\nData Info:")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nBasic Statistics:")
    print(df.describe())

# Print the overview report for the training split
explore_data(df=train_data, title="Training Dataset")

## 2.2 Data Distribution Analysis
def analyze_distributions(df):
    """
    Plot a histogram (with KDE) and a boxplot for every numerical column.

    Columns of dtype int64/float64 are laid out one per row: the histogram
    on the left and the boxplot on the right. The figure is shown with
    ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset whose numerical columns are plotted.

    Returns
    -------
    None
        If there are no numerical columns a short message is printed and
        no figure is created.
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if numerical_cols.empty:
        # The figure height is proportional to the number of columns, so
        # without this guard a zero-height, empty figure would be created.
        print("No numerical columns to plot.")
        return

    n_rows = len(numerical_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, n_rows * 4), squeeze=False)
    for (hist_ax, box_ax), col in zip(axes, numerical_cols):
        sns.histplot(df[col], kde=True, ax=hist_ax)
        hist_ax.set_title(f'Distribution of {col}')

        sns.boxplot(y=df[col], ax=box_ax)
        box_ax.set_title(f'Boxplot of {col}')

    fig.tight_layout()
    plt.show()

# Draw the histogram/boxplot grid for the training split
analyze_distributions(df=train_data)

# 3. Bias Analysis
"""
Document potential biases in the dataset:
- Selection bias: [Your analysis here]
- Measurement bias: [Your analysis here]
- Demographic representation: [Your analysis here]
- Time-related biases: [Your analysis here]
"""

# 4. Feature Engineering
def engineer_features(df, reference_date=None):
    """
    Create new features from existing columns.

    The input is never modified; a copy with the extra columns is returned.
    Each feature is only added when the columns it depends on are present:

    - ``days_since_login``: whole days between ``reference_date`` and
      ``last_login``.
    - ``engagement_score``: ``visit_frequency * duration``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    reference_date : str, datetime-like or None, optional
        Instant that ``last_login`` is measured against. ``None`` (default)
        uses the current local time. Pass a fixed value to get reproducible
        results and the same reference for every dataset. It must be
        timezone-naive or timezone-aware in the same way as ``last_login``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.
    """
    df_processed = df.copy()

    # Example feature engineering (modify based on your specific needs):

    # 4.1 Time-based features
    if 'last_login' in df_processed.columns:
        if reference_date is None:
            reference = pd.Timestamp.now()
        else:
            reference = pd.Timestamp(reference_date)
        last_login = pd.to_datetime(df_processed['last_login'])
        df_processed['days_since_login'] = (reference - last_login).dt.days

    # 4.2 Interaction features
    if 'visit_frequency' in df_processed.columns and 'duration' in df_processed.columns:
        df_processed['engagement_score'] = (
            df_processed['visit_frequency'] * df_processed['duration']
        )

    # 4.3 Categorical encoding
    # Add your categorical encoding logic here

    return df_processed

# Derive the engineered features for both splits (train first, then test)
train_data_processed, test_data_processed = (
    engineer_features(split) for split in (train_data, test_data)
)

# 5. Data Preprocessing
def preprocess_data(df, scaler=None, fit=True):
    """
    Scale the numerical columns of a DataFrame.

    Only scaling is implemented; missing-value handling and categorical
    encoding are still placeholders below. The input is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to preprocess.
    scaler : object or None, optional
        Scaler exposing ``fit_transform`` / ``transform``. ``None`` (default)
        creates a new ``StandardScaler``.
    fit : bool, optional
        ``True`` (default) fits ``scaler`` on ``df`` before transforming.
        Pass ``False`` together with a scaler fitted on the training data to
        apply the training statistics to another dataset; fitting a separate
        scaler per dataset puts them on different scales. In that case the
        int64/float64 columns of ``df`` must be the ones the scaler was
        fitted on.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with int64/float64 columns replaced by scaled values.

    Raises
    ------
    ValueError
        If ``fit`` is ``False`` and no scaler is supplied.
    """
    if scaler is None and not fit:
        raise ValueError("fit=False requires an already fitted scaler")

    df_processed = df.copy()

    # 5.1 Handle missing values
    # Add your missing value handling logic here

    # 5.2 Scale numerical features
    numerical_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns
    # Skip scaling entirely when there is nothing to scale: fitting a scaler
    # on a selection with zero columns is an error.
    if not numerical_cols.empty:
        if scaler is None:
            scaler = StandardScaler()
        scale = scaler.fit_transform if fit else scaler.transform
        df_processed[numerical_cols] = scale(df_processed[numerical_cols])

    # 5.3 Additional preprocessing steps
    # Add any additional preprocessing steps here

    return df_processed

# Run the preprocessing step on each engineered split (train first, then test)
train_data_final, test_data_final = map(
    preprocess_data, (train_data_processed, test_data_processed)
)

# 6. Save Processed Data
# Write each processed split to the data directory, without the index column
for frame, destination in (
    (train_data_final, '../data/processed_train.csv'),
    (test_data_final, '../data/processed_test.csv'),
):
    frame.to_csv(destination, index=False)

# 7. Correlation Analysis
# Correlation is only defined for numeric data. Categorical columns are not
# encoded yet, and DataFrame.corr() raises on string columns in pandas >= 2.0,
# so restrict the frame to numeric/bool columns first.
numeric_features = train_data_final.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

"""
Final Notes and Observations:

1. Data Quality:
   - [Document your findings about data quality]
   - [Note any concerning patterns or issues]

2. Feature Engineering Decisions:
   - [Document why you created each new feature]
   - [Explain the expected impact on the model]

3. Preprocessing Decisions:
   - [Explain your choice of scaling methods]
   - [Document handling of outliers]
   - [Explain treatment of missing values]

4. Next Steps:
   - [List any additional analyses needed]
   - [Note potential model considerations based on this analysis]
"""

FileNotFoundError: [Errno 2] No such file or directory: '../data/train.csv'