<a href="https://colab.research.google.com/github/megmarv/Client-Deposit-Prediction/blob/main/DatasetPreparationForML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git status

fatal: not a git repository (or any of the parent directories): .git


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

# ---------------------------------------------------------------------------
# Training data: load -> clean -> encode/scale -> rebalance -> save.
# Names defined here (df, missing_cols, preprocessor, X_balanced, y_balanced)
# are reused by the statements further down in this script.
# ---------------------------------------------------------------------------

# The input CSV lives on Google Drive, so Drive must be mounted *before* the
# first read; on a fresh runtime the read below would otherwise fail.
# The guard makes this a no-op when Drive is already mounted.
import os
from google.colab import drive

if not os.path.exists('/content/drive/MyDrive'):
    drive.mount('/content/drive')

# Load the training dataset (semicolon-separated file)
data_path = '/content/drive/MyDrive/ML/bank-additional-full.csv'
df = pd.read_csv(data_path, sep=';')

# Display initial information
print("Initial Training Dataset Info:")
df.info()
print("\nSample Data:")
print(df.head())

# 1. Handle missing values. They are encoded as the literal string "unknown",
#    so impute each listed column with its most frequent *known* value.
missing_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan']
for col in missing_cols:
    # Hide the placeholder before taking the mode; otherwise a column in which
    # "unknown" is the most common value would be "imputed" with itself.
    known_modes = df[col].mask(df[col] == 'unknown').mode()
    if known_modes.empty:
        # No known value to impute from; leave the column untouched.
        continue
    df[col] = df[col].replace('unknown', known_modes.iloc[0])

# 2. Remove duplicate rows
df.drop_duplicates(inplace=True)

# 3. Exclude the `duration` column from the features
# NOTE(review): presumably dropped because call duration is only known after
# the call has taken place - confirm the intent.
df.drop(columns=['duration'], inplace=True)

# 4. One-hot encode categorical variables and scale numeric features
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
]
numeric_columns = [
    'age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
    'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]

# sparse_threshold=0 forces a dense array. With the default threshold (0.3)
# ColumnTransformer may return a SciPy sparse matrix depending on how dense the
# encoded data happens to be, and the pd.DataFrame(...).to_csv calls below
# need a plain 2-D array to write a proper feature table.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns),
    ],
    sparse_threshold=0,
)

X = df.drop(columns=['y'])
# Binary target: 1 for 'yes', 0 for anything else.
y = (df['y'] == 'yes').astype(int)

# Fit the scaler/encoder on the training features and transform them.
X_preprocessed = preprocessor.fit_transform(X)

# 5. Handle class imbalance by oversampling with SMOTE
# NOTE(review): resampling is applied to the whole training set; if a
# validation split is later carved out of X_balanced, synthetic samples can
# leak across that split - confirm downstream usage.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_preprocessed, y)

# Save processed training data (columns are written with positional headers)
pd.DataFrame(X_balanced).to_csv('/content/X_balanced.csv', index=False)
pd.DataFrame({'y': y_balanced}).to_csv('/content/y_balanced.csv', index=False)

print("Training dataset preprocessing complete. Datasets saved for modeling.")

# ---------------------------------------------------------------------------
# Testing data: apply the same cleaning steps, then the transformer that was
# already fitted on the training frame (`preprocessor`, `df`, `missing_cols`).
# ---------------------------------------------------------------------------
# NOTE(review): the row counts suggest this file may be a sample of the
# training file; if so, test rows also appear in training - confirm.
test_data_path = '/content/drive/MyDrive/ML/bank-additional.csv'
df_test = pd.read_csv(test_data_path, sep=';')

# Quick look at the raw testing frame
print("Initial Testing Dataset Info:")
df_test.info()
print("\nSample Data (Testing Dataset):")
print(df_test.head())

# 1. Replace the "unknown" placeholder with the most frequent value of the
#    same column in the *training* frame (never the test frame's own mode).
train_fill_values = {col: df[col].mode()[0] for col in missing_cols}
for col, fill_value in train_fill_values.items():
    df_test[col] = df_test[col].replace('unknown', fill_value)

# 2. Remove `duration` in place, mirroring the training frame
del df_test['duration']

# Split features from the target; the target becomes 1 for 'yes', else 0.
X_test_original = df_test.drop('y', axis=1)
y_test_original = df_test['y'].map(lambda label: int(label == 'yes'))

# Transform only - the scaler/encoder statistics come from the training data.
X_test_preprocessed = preprocessor.transform(X_test_original)

# Write the processed testing data to the local Colab filesystem
for out_path, payload in (
    ('/content/X_test_preprocessed.csv', X_test_preprocessed),
    ('/content/y_test_preprocessed.csv', {'y': y_test_original}),
):
    pd.DataFrame(payload).to_csv(out_path, index=False)

print("Testing dataset preprocessing complete. Processed datasets saved.")

# ---------------------------------------------------------------------------
# Persist the four processed datasets to Google Drive.
# ---------------------------------------------------------------------------
import os
from google.colab import drive

# Mount Drive only when its folder is not already visible.
if not os.path.exists("/content/drive/MyDrive"):
    drive.mount('/content/drive')

# Target folder on Drive; created if it does not exist yet.
save_dir = '/content/drive/MyDrive/ML'
os.makedirs(save_dir, exist_ok=True)

# One destination path per dataset, all inside save_dir.
X_balanced_path, y_balanced_path, X_test_path, y_test_path = (
    os.path.join(save_dir, filename)
    for filename in (
        'X_balanced.csv',
        'y_balanced.csv',
        'X_test_preprocessed.csv',
        'y_test_preprocessed.csv',
    )
)

# (destination, data) pairs; each is wrapped in a DataFrame and written
# without the index column, in the order listed.
exports = (
    (X_balanced_path, X_balanced),
    (y_balanced_path, {'y': y_balanced}),
    (X_test_path, X_test_preprocessed),
    (y_test_path, {'y': y_test_original}),
)
for destination, payload in exports:
    pd.DataFrame(payload).to_csv(destination, index=False)

# Report where everything was written, one path per line.
print(
    "Datasets saved to Google Drive:\n"
    + "\n".join(destination for destination, _ in exports)
)






Initial Training Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  



Training dataset preprocessing complete. Datasets saved for modeling.
Initial Testing Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 no