<a href="https://colab.research.google.com/github/moni-sarah/-GitHub-Actions/blob/main/prepare_data_for_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# Set seed for reproducibility
np.random.seed(42)
n_samples = 1000  # increased number of samples

# Generate synthetic dataset
data = {
    'income': np.random.normal(50000, 15000, n_samples).clip(10000, 200000),
    'credit_score': np.random.normal(650, 50, n_samples).clip(300, 850),
    'age': np.random.randint(21, 70, n_samples),
    'loan_amount': np.random.normal(20000, 10000, n_samples).clip(1000, 100000),
    'job_title': np.random.choice(['Engineer', 'Teacher', 'Doctor', 'Artist', 'Lawyer', 'Nurse', 'Salesperson'], n_samples),
    'marital_status': np.random.choice(['Single', 'Married', 'Divorced', 'Widowed'], n_samples),
    'education_level': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples),
    'num_dependents': np.random.poisson(1.5, n_samples),
    'target': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])  # Example: 30% positive class
}

# Convert to DataFrame for preview or export
df = pd.DataFrame(data)
print(df.head())


         income  credit_score  age   loan_amount job_title marital_status  \
0  57450.712295    719.967772   50   5993.260089  Engineer         Single   
1  47926.035482    696.231684   51  20143.320598    Lawyer       Divorced   
2  59715.328072    652.981518   58  24176.736985   Teacher        Married   
3  72845.447846    617.653161   31  28469.970877    Doctor       Divorced   
4  46487.699379    684.911166   55  21748.177688   Teacher       Divorced   

  education_level  num_dependents  target  
0          Master               0       1  
1             PhD               1       0  
2        Bachelor               1       1  
3          Master               1       0  
4        Bachelor               2       0  


In [2]:
# Step 2: Handle missing values

# Handle missing values for numeric columns by filling with the median
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df[column].fillna(df[column].median(), inplace=True)

# Handle missing values for categorical columns by filling with the mode
for column in df.select_dtypes(include=['object', 'category']).columns:
    df[column].fillna(df[column].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


In [3]:
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
income             0
credit_score       0
age                0
loan_amount        0
job_title          0
marital_status     0
education_level    0
num_dependents     0
target             0
dtype: int64


In [4]:
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows removed: {num_duplicates}")
df.drop_duplicates(inplace=True)


Number of duplicate rows removed: 0


In [5]:
# Step 4: Encode categorical variables using one-hot encoding
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [6]:
print(f"Original shape: {df.shape}")
print(f"Encoded shape: {df_encoded.shape}")


Original shape: (1000, 9)
Encoded shape: (1000, 18)


In [7]:
from scipy.stats import zscore

# Step 5: Detect and remove outliers using Z-score

# Compute Z-scores for numeric columns
numeric_cols = df_encoded.select_dtypes(include=['float64', 'int64']).columns
z_scores = np.abs(zscore(df_encoded[numeric_cols]))

# Set a threshold for identifying outliers (commonly 3)
threshold = 3
mask = (z_scores < threshold).all(axis=1)

# Filter out the outliers
df_cleaned = df_encoded[mask]
print(f"Original rows: {df_encoded.shape[0]}")
print(f"Rows after outlier removal: {df_cleaned.shape[0]}")


Original rows: 1000
Rows after outlier removal: 984


In [8]:
# Step 6: Address skewed data with log transformation

# Identify numeric columns
numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Check skewness
skewed_cols = df_cleaned[numeric_cols].apply(lambda x: x.skew()).sort_values(ascending=False)
print("Skewness before transformation:\n", skewed_cols)

# Choose features with high skewness (e.g., abs(skew) > 1)
high_skew = skewed_cols[abs(skewed_cols) > 1].index

# Apply log1p (log(x + 1)) to handle zero or negative values
df_cleaned[high_skew] = df_cleaned[high_skew].apply(lambda x: np.log1p(x))

# Check skewness again
print("\nSkewness after transformation:\n", df_cleaned[high_skew].apply(lambda x: x.skew()))


Skewness before transformation:
 target            0.995682
num_dependents    0.507427
loan_amount       0.114780
income            0.080624
age               0.002627
credit_score     -0.055397
dtype: float64

Skewness after transformation:
 Series([], dtype: float64)


In [9]:
from sklearn.model_selection import train_test_split

# Step 7: Split the dataset into features and target
X = df_cleaned.drop('target', axis=1)
y = df_cleaned['target']

# Split into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Show the result
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 787
Testing samples: 197
