In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Helpers for this cell. They are pure (no file I/O) so they can be exercised
# on small in-memory frames.
def fill_missing_values(frame, categorical, placeholder="Unknown"):
    """Return a copy of ``frame`` with infinities and missing values filled.

    +/-inf is converted to NaN first. Missing entries in the non-numeric
    columns listed in ``categorical`` are replaced by ``placeholder`` so those
    columns stay uniformly string-typed (OneHotEncoder rejects columns that
    mix strings and numbers). Every other missing entry is replaced by 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    categorical : list of str
        Names of the columns that will be one-hot encoded.
    placeholder : str, default "Unknown"
        Category label used for missing values in text categorical columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without NaN or infinite values.
    """
    cleaned = frame.replace([np.inf, -np.inf], np.nan)
    text_columns = [name for name in categorical
                    if not pd.api.types.is_numeric_dtype(cleaned[name])]
    if text_columns:
        cleaned[text_columns] = cleaned[text_columns].fillna(placeholder)
    return cleaned.fillna(0)


def absolute_z_scores(frame, columns):
    """Return |z| for ``columns`` of ``frame`` using the sample std (ddof=1).

    A column with zero variance produces 0/0 = NaN for every row; those
    entries are reported as 0.0 so that a constant column can never mark a
    row as an outlier.

    Parameters
    ----------
    frame : pandas.DataFrame
    columns : list-like of str
        Numeric columns to score.

    Returns
    -------
    pandas.DataFrame
        Same index as ``frame``, one column per entry in ``columns``.
    """
    values = frame[columns]
    scores = ((values - values.mean()) / values.std()).abs()
    return scores.fillna(0.0)


# In a notebook kernel __name__ is "__main__", so the pipeline below runs as
# usual; the guard only keeps the file I/O from firing when the helpers above
# are imported from elsewhere.
if __name__ == "__main__":
    # Load dataset
    data = pd.read_csv(r"C:\Users\ACER\Documents\BAN6800\churn_80_airtelnigeria.csv")

    # Step 1: Check for missing or irregular values
    # Text categoricals get an explicit placeholder category; everything else
    # is filled with 0 (assumes 0 is an acceptable stand-in for missing
    # numeric values for now).
    categorical_columns = ['Region', 'Roaming Plan', 'Data Plan']
    data = fill_missing_values(data, categorical_columns)

    # Step 2: Standardize data types
    bool_columns = ['Churn (Inactive >30 Days)']
    # NOTE(review): astype(bool) maps every non-empty string to True, so this
    # assumes the churn column is stored as 0/1 or True/False -- confirm.
    data[bool_columns] = data[bool_columns].astype(bool)

    # Step 3: Encode categorical variables
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_columns = pd.DataFrame(encoder.fit_transform(data[categorical_columns]),
                                   columns=encoder.get_feature_names_out(categorical_columns),
                                   index=data.index)
    data = pd.concat([data.drop(columns=categorical_columns), encoded_columns], axis=1)

    # Step 4: Feature Engineering (if needed)
    data['Total Data Usage (MB)'] = (data['Total Day Data (MB)'] +
                                     data['Total Evening Data (MB)'] +
                                     data['Total Night Data (MB)'] +
                                     data['Total Roaming Data (MB)'])

    # Step 5: Scaling numerical features
    # The one-hot dummies are float64 too, so they are excluded explicitly:
    # they must stay 0/1 and must not take part in the outlier filter, where
    # any rare category would otherwise be removed wholesale.
    numerical_columns = (data.select_dtypes(include=['int64', 'float64'])
                             .columns.drop(encoded_columns.columns))
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

    # Step 6: Outlier detection (basic z-score based removal)
    # A row is kept only if every numeric column is within 3 sample standard
    # deviations of its mean.
    z_scores = absolute_z_scores(data, numerical_columns)
    data = data[(z_scores < 3).all(axis=1)]

    # Save the cleaned and processed dataset
    cleaned_file_path = r"C:\Users\ACER\Documents\BAN6800\cleaned_churn_data.csv"
    data.to_csv(cleaned_file_path, index=False, header=True)

    print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to C:\Users\ACER\Documents\BAN6800\cleaned_churn_data.csv


In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Helpers for this cell. They are pure (no file I/O) so they can be exercised
# on small in-memory frames.
def fill_missing_values(frame, categorical, placeholder="Unknown"):
    """Return a copy of ``frame`` with infinities and missing values filled.

    +/-inf is converted to NaN first. Missing entries in the non-numeric
    columns listed in ``categorical`` are replaced by ``placeholder`` so those
    columns stay uniformly string-typed (OneHotEncoder rejects columns that
    mix strings and numbers). Every other missing entry is replaced by 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    categorical : list of str
        Names of the columns that will be one-hot encoded.
    placeholder : str, default "Unknown"
        Category label used for missing values in text categorical columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without NaN or infinite values.
    """
    cleaned = frame.replace([np.inf, -np.inf], np.nan)
    text_columns = [name for name in categorical
                    if not pd.api.types.is_numeric_dtype(cleaned[name])]
    if text_columns:
        cleaned[text_columns] = cleaned[text_columns].fillna(placeholder)
    return cleaned.fillna(0)


def outlier_fraction(z_scores, z_limit=3):
    """Return, per row, the share of columns whose z-score exceeds ``z_limit``.

    NaN z-scores never count as outliers because ``NaN > z_limit`` is False.
    When ``z_scores`` has no columns the share is defined as 0.0 for every
    row (instead of 0/0 = NaN, which would fail any ``<=`` comparison).

    Parameters
    ----------
    z_scores : pandas.DataFrame
        Absolute z-scores, one column per feature.
    z_limit : float, default 3
        A value strictly greater than this is counted as an outlier.

    Returns
    -------
    pandas.Series
        Values in [0, 1], indexed like ``z_scores``.
    """
    n_columns = z_scores.shape[1]
    if n_columns == 0:
        return pd.Series(0.0, index=z_scores.index)
    return (z_scores > z_limit).sum(axis=1) / n_columns


# In a notebook kernel __name__ is "__main__", so the pipeline below runs as
# usual; the guard only keeps the file I/O from firing when the helpers above
# are imported from elsewhere.
if __name__ == "__main__":
    # Load dataset
    data = pd.read_csv(r"C:\Users\ACER\Documents\BAN6800\churn_80_airtelnigeria.csv")

    # Step 1: Check for missing or irregular values
    # Text categoricals get an explicit placeholder category; everything else
    # is filled with 0 (assumes 0 is an acceptable stand-in for missing
    # numeric values for now).
    categorical_columns = ['Region', 'Roaming Plan', 'Data Plan']
    data = fill_missing_values(data, categorical_columns)

    # Step 2: Standardize data types
    bool_columns = ['Churn (Inactive >30 Days)']
    # NOTE(review): astype(bool) maps every non-empty string to True, so this
    # assumes the churn column is stored as 0/1 or True/False -- confirm.
    data[bool_columns] = data[bool_columns].astype(bool)

    # Step 3: Encode categorical variables
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_columns = pd.DataFrame(encoder.fit_transform(data[categorical_columns]),
                                   columns=encoder.get_feature_names_out(categorical_columns),
                                   index=data.index)
    data = pd.concat([data.drop(columns=categorical_columns), encoded_columns], axis=1)

    # Step 4: Feature Engineering (if needed)
    data['Total Data Usage (MB)'] = (data['Total Day Data (MB)'] +
                                     data['Total Evening Data (MB)'] +
                                     data['Total Night Data (MB)'] +
                                     data['Total Roaming Data (MB)'])

    # Step 5: Scaling numerical features
    # The one-hot dummies are float64 too, so they are excluded explicitly:
    # they must stay 0/1 and must not be counted in the per-row outlier share.
    numerical_columns = (data.select_dtypes(include=['int64', 'float64'])
                             .columns.drop(encoded_columns.columns))
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

    # Step 6: Outlier detection (adjusted logic)
    # A row is dropped only when more than `outlier_threshold` of its numeric
    # columns lie beyond 3 sample standard deviations.
    z_scores = np.abs((data[numerical_columns] - data[numerical_columns].mean()) / data[numerical_columns].std())
    outlier_threshold = 0.5  # Allow up to 50% of columns in a row to be outliers
    outlier_mask = outlier_fraction(z_scores, z_limit=3) <= outlier_threshold
    print(f"Rows before outlier removal: {data.shape[0]}")
    data = data[outlier_mask]
    print(f"Rows after outlier removal: {data.shape[0]}")

    # Save the cleaned and processed dataset
    cleaned_file_path = r"C:\Users\ACER\Documents\BAN6800\cleaned_churn_data.csv"
    data.to_csv(cleaned_file_path, index=False, header=True)

    print(f"Cleaned dataset saved to {cleaned_file_path}")


Rows before outlier removal: 2666
Rows after outlier removal: 2666
Cleaned dataset saved to C:\Users\ACER\Documents\BAN6800\cleaned_churn_data.csv
