[GitHub source](https://github.com/ElvisCasco/process_data)

In [1]:
# Record the directory the kernel is running in. The directory is only read
# here, never changed; a later cell builds the data path relative to it.
import os
from pathlib import Path

wd = os.getcwd()
print(f"Working directory: {wd}")

Working directory: c:\EC\BSE\DSDM\Term 1\21DM004 Computing for Data Science\HW7


In [2]:
!pip install --upgrade git+https://github.com/ElvisCasco/process_data.git

Collecting git+https://github.com/ElvisCasco/process_data.git
  Cloning https://github.com/ElvisCasco/process_data.git to c:\users\ecasc\appdata\local\temp\pip-req-build-gszftprf
  Resolved https://github.com/ElvisCasco/process_data.git to commit eb1c7ae7ed08fcadfa10928b1014f5545750a38f
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/ElvisCasco/process_data.git 'C:\Users\ecasc\AppData\Local\Temp\pip-req-build-gszftprf'

[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Cell 1: Import the package (installed from GitHub)
import process_data as pdlib
import pandas as pd
import numpy as np
import inspect
from pathlib import Path

# Report which build of the package the kernel actually imported.
separator = "=" * 60
print(separator)
print("process_data package - Installed from GitHub")
print(separator)
print(f"Version: {getattr(pdlib, '__version__', 'unknown')}")
print(f"Location: {pdlib.__file__}")

# dir() also returns submodules, constants and re-exported imports; keep only
# public callables so the list matches its "Available functions" heading.
print("\nAvailable functions:")
exports = [
    name
    for name in dir(pdlib)
    if not name.startswith("_") and callable(getattr(pdlib, name))
]
for i, func in enumerate(exports, 1):
    print(f"  {i}. {func}")
print(separator)

process_data package - Installed from GitHub
Version: 0.3.0
Location: c:\Users\ecasc\AppData\Local\Programs\Python\Python313\Lib\site-packages\process_data\__init__.py

Available functions:
  1. data_binary
  2. data_encoding
  3. data_fill_nans
  4. data_loader
  5. data_remove_nans
  6. data_split
  7. model_predict
  8. model_train_models
  9. pred_auc_score


## a. Load the data

a. Load the data.

In [4]:
# Cell 2: Locate the sample CSV under <working directory>/data and load it
# Path components are joined with pathlib rather than a hand-written "/" so the
# separator is always correct for the host OS. The result is converted back to
# str so csv_path keeps the type it had before (plain string).
csv_path = str(Path(wd) / "data" / "sample_diabetes_mellitus_data.csv")
df = pdlib.data_loader(csv_path)

Data loaded successfully. Shape: (10000, 53)


## b. Test data_loader and data_split:

b. Split the data into train and test sets (you can use `train_test_split` from sklearn or any other method).

In [5]:
# Cell 3: Exercise data_loader and data_split on the same CSV file
banner = "=" * 60
print("\n" + banner)
print("TEST 1: data_loader and data_split")
print(banner)

# Load the full file once so the split sizes can be expressed as a share of it.
df_loaded = pdlib.data_loader(csv_path)
print(f"data_loader: Loaded {df_loaded.shape}")

train_df, test_df = pdlib.data_split(csv_path, test_size=0.3, random_state=42)
total_rows = df_loaded.shape[0]
print("data_split:")
for label, part in (("Train", train_df), ("Test", test_df)):
    share = part.shape[0] / total_rows * 100
    print(f"{label}: {part.shape} ({share:.1f}%)")


TEST 1: data_loader and data_split


Data loaded successfully. Shape: (10000, 53)
data_loader: Loaded (10000, 53)
Data loaded successfully. Shape: (10000, 53)
data_split:
Train: (7000, 53) (70.0%)
Test: (3000, 53) (30.0%)


## c. Test data_remove_nans:

c. Remove those rows that contain NaN values in the columns: age, gender, ethnicity.

In [6]:
# Cell 4: Test data_remove_nans
print("\n" + "=" * 60)
print("TEST 2: data_remove_nans")
print("=" * 60)

cols_nan = ["age", "gender", "ethnicity"]
print(f"Before: Train {train_df.shape}, NaNs={train_df[cols_nan].isna().sum().sum()}")
print(f"        Test {test_df.shape}, NaNs={test_df[cols_nan].isna().sum().sum()}")

train_df = pdlib.data_remove_nans(train_df, columns=cols_nan)
test_df = pdlib.data_remove_nans(test_df, columns=cols_nan)

# Count what is left once, so the printed figure and the check below agree.
train_nans = int(train_df[cols_nan].isna().sum().sum())
test_nans = int(test_df[cols_nan].isna().sum().sum())
print(f"After:  Train {train_df.shape}, NaNs={train_nans}")
print(f"        Test {test_df.shape}, NaNs={test_nans}")

# Only report success when it has actually been verified.
assert train_nans == 0, f"train set still has {train_nans} NaNs in {cols_nan}"
assert test_nans == 0, f"test set still has {test_nans} NaNs in {cols_nan}"
print("Passed: No NaNs in specified columns")


TEST 2: data_remove_nans
Before: Train (7000, 53), NaNs=465
        Test (3000, 53), NaNs=185
After:  Train (6547, 53), NaNs=0
        Test (2821, 53), NaNs=0
Passed: No NaNs in specified columns


## d. Test data_fill_nans:

d. Fill NaN with the mean value of the column in the columns: height, weight.

In [7]:
# Cell 5: Test data_fill_nans
rule = "=" * 60
print("\n" + rule)
print("TEST 3: data_fill_nans")
print(rule)

cols_fill = ["height", "weight"]
missing_before = train_df[cols_fill].isna().sum().to_dict()
print(f"Before: {missing_before}")

# NOTE(review): each split is passed on its own, so the test set is presumably
# filled with its own column means rather than the training means — confirm.
train_df, test_df = [
    pdlib.data_fill_nans(frame, columns=cols_fill) for frame in (train_df, test_df)
]

missing_after = train_df[cols_fill].isna().sum().to_dict()
mean_height = train_df["height"].mean()
mean_weight = train_df["weight"].mean()
print(f"After:  {missing_after}")
print(f"Mean height: {mean_height:.2f}, weight: {mean_weight:.2f}")


TEST 3: data_fill_nans
Before: {'height': 86, 'weight': 1077}
After:  {'height': 0, 'weight': 0}
Mean height: 169.98, weight: 86.91


## e. Test data_encoding:

e. Generate dummies for ethnicity column (One hot encoding).

In [8]:
# Cell 6: Test data_encoding (one-hot encoding)
rule = "=" * 60
print("\n" + rule)
print("TEST 4: data_encoding")
print(rule)

n_cols_before = train_df.shape[1]
print(f"Before: {n_cols_before} columns")

# NOTE(review): the two splits are encoded independently; if a category were
# missing from one of them the dummy columns would presumably differ — confirm.
train_df = pdlib.data_encoding(train_df, columns=["ethnicity"])
test_df = pdlib.data_encoding(test_df, columns=["ethnicity"])

# Collect the indicator columns produced for the encoded variable.
ethnicity_cols = list(filter(lambda col: col.startswith("ethnicity_"), train_df.columns))
n_cols_after = train_df.shape[1]
print(f"After:  {n_cols_after} columns")
print(f"New columns: {ethnicity_cols}")


TEST 4: data_encoding
Before: 53 columns
After:  58 columns
New columns: ['ethnicity_African American', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown']


## f. Test data_binary:

f. Create a binary variable for gender M/F.

In [9]:
# Cell 7: Test data_binary
rule = "=" * 60
print("\n" + rule)
print("TEST 5: data_binary")
print(rule)

# Show the raw column first so the effect of the conversion is visible.
gender_before = train_df["gender"]
print(f"Before: gender dtype={gender_before.dtype}, unique={gender_before.unique()}")

train_df = pdlib.data_binary(train_df, column="gender")
test_df = pdlib.data_binary(test_df, column="gender")

gender_after = train_df["gender"]
gender_codes = sorted(gender_after.dropna().unique())
print(f"After:  gender dtype={gender_after.dtype}, unique={gender_codes}")
print(f"Value counts:\n{gender_after.value_counts()}")


TEST 5: data_binary
Before: gender dtype=object, unique=['F' 'M']
After:  gender dtype=Int64, unique=[np.int64(0), np.int64(1)]
Value counts:
gender
1    3599
0    2948
Name: count, dtype: Int64


## g. Test model_train_models:

g. Train a model (for instance `LogisticRegression` or `RandomForestClassifier` from sklearn) on the train data.

Use as features the columns: `age`, `height`, `weight`, `aids`, `cirrhosis`, `hepatic_failure`, `immunosuppression`, `leukemia`, `lymphoma`, `solid_tumor_with_metastasis`.

Use as target the column: `diabetes_mellitus`

In [10]:
# Cell 8: Train models with model_train_models
# The banner names the function that is actually called below
# (model_train_models); the previous label referred to a name that is not used.
print("\n" + "=" * 60)
print("TEST 6: model_train_models")
print("=" * 60)

# Feature and target columns; later cells reuse FEATURES and TARGET.
FEATURES = [
    "age", "height", "weight",
    "aids", "cirrhosis", "hepatic_failure",
    "immunosuppression", "leukemia", "lymphoma",
    "solid_tumor_with_metastasis",
]
TARGET = "diabetes_mellitus"

X_train = train_df[FEATURES]
y_train = train_df[TARGET]

print(f"Training with {len(FEATURES)} features, {len(X_train)} samples")
# Fit one model per supported model_type on the same training data.
model_lr = pdlib.model_train_models(X_train, y_train, model_type="logreg")
model_rf = pdlib.model_train_models(X_train, y_train, model_type="rf")

for fitted_model in (model_lr, model_rf):
    print(f"Trained: {type(fitted_model).__name__}")


TEST 6: data_train_models
Training with 10 features, 6547 samples


Trained: LogisticRegression
Trained: RandomForestClassifier


## h. Test add_predictions:

h. Predict the targets for both the train and test sets and add the prediction as a new column (use `predict_proba` from the model to get the predicted probabilities). Name the new column something
like `predictions`.

In [11]:
# Cell 9: Add predictions to train and test sets
print("\n" + "=" * 60)
print("TEST 7: add_predictions")
print("=" * 60)

# Use the package-level attribute when it exists; otherwise fall back to
# importing add_predictions from the pred_auc_score submodule.
try:
    add_predictions = pdlib.add_predictions
except AttributeError:
    from process_data.pred_auc_score import add_predictions

# Add predictions using the LogisticRegression model. inplace=False is passed so
# the results are taken from the returned frames.
train_with_pred, test_with_pred = add_predictions(
    model_lr,
    train_df,
    test_df,
    FEATURES,
    pred_col="predictions",
    inplace=False,
)

# Verify the new column exists BEFORE it is indexed below. Placed after the
# prints, these checks could never fail: a missing column would already have
# raised KeyError.
assert "predictions" in train_with_pred.columns, "train frame has no 'predictions' column"
assert "predictions" in test_with_pred.columns, "test frame has no 'predictions' column"

print("Added 'predictions' column to train and test sets")
print("\nTrain predictions sample:")
print(train_with_pred[[TARGET, "predictions"]].head())
print("\nTest predictions sample:")
print(test_with_pred[[TARGET, "predictions"]].head())

print("\nAssertion passed")


TEST 7: add_predictions
Added 'predictions' column to train and test sets

Train predictions sample:
      diabetes_mellitus  predictions
9069                  0     0.133033
2603                  0     0.235128
7738                  0     0.305614
1579                  0     0.168839
5058                  0     0.156940

Test predictions sample:
      diabetes_mellitus  predictions
6252                  1     0.317016
1731                  0     0.282181
4742                  0     0.117664
4521                  0     0.130718
6340                  1     0.331450

Assertion passed


## i. Test pred_auc_score:

i. Compute the train and test roc_auc metric using roc_auc_score from sklearn.

In [12]:
# Cell 10: Compute ROC AUC scores
rule = "=" * 60
print("\n" + rule)
print("TEST 8: pred_auc_score (ROC AUC)")
print(rule)

# Score each split from its true target and the stored predicted probabilities.
auc_train = pdlib.pred_auc_score(train_with_pred[TARGET], train_with_pred["predictions"])
auc_test = pdlib.pred_auc_score(test_with_pred[TARGET], test_with_pred["predictions"])
auc_gap = abs(auc_train - auc_test)

print("ROC AUC (LogisticRegression):")
print(f"   Train AUC: {auc_train:.4f}")
print(f"   Test AUC:  {auc_test:.4f}")
print(f"   Difference: {auc_gap:.4f}")

# 0.5 is the threshold this cell uses for "better than random".
verdict = "Model performs better than random" if auc_test > 0.5 else "Model needs improvement"
print(verdict)


TEST 8: pred_auc_score (ROC AUC)
ROC AUC (LogisticRegression):
   Train AUC: 0.6761
   Test AUC:  0.6559
   Difference: 0.0202
Model performs better than random


## Test model_predict:

In [13]:
# Cell 11: Test model_predict function
# Labels and messages use the name of the function actually called
# (model_predict) instead of the previous, unused name.
print("\n" + "=" * 60)
print("TEST 9: model_predict")
print("=" * 60)

# Select the feature columns once and reuse them for both calls.
X_test = test_df[FEATURES]

# Test predict_proba
proba_test = pdlib.model_predict(model_lr, X_test, proba=True)
print(f"Probabilities shape: {proba_test.shape}")
print(f"   Sample: {proba_test[:5]}")

# Verify the probabilities match the column written by add_predictions.
# np is already imported at the top of the notebook, so no extra import is needed.
np.testing.assert_allclose(proba_test, test_with_pred["predictions"].to_numpy(), atol=1e-9)
print("model_predict matches add_predictions")

# Test class predictions
class_pred = pdlib.model_predict(model_lr, X_test, proba=False)
print(f"Class predictions: {class_pred[:10]}")
print(f"   Unique classes: {sorted(set(class_pred))}")


TEST 9: data_predict
Probabilities shape: (2821,)
   Sample: [0.31701586 0.28218064 0.11766365 0.13071773 0.33144975]
data_predict matches add_predictions
Class predictions: [0 0 0 0 0 0 0 0 0 0]
   Unique classes: [np.int64(0), np.int64(1)]


## Summary

In [14]:
# Cell 12: Summary
# Reached only if every earlier cell ran without raising.
print("\n" + "=" * 60)
print("ALL TESTS COMPLETED SUCCESSFULLY")
print("=" * 60)

print("\nFunctions tested:")
# Names match the functions actually called in this notebook:
# model_train_models and model_predict replace the earlier, incorrect
# "data_train_models" and "data_predict" labels.
functions_tested = [
    "data_loader", "data_split", "data_remove_nans",
    "data_fill_nans", "data_encoding", "data_binary",
    "model_train_models", "add_predictions",
    "pred_auc_score", "model_predict",
]
for i, func in enumerate(functions_tested, 1):
    print(f"  {i:2d}. {func}")

# Row counts are taken from the frames that carry the predictions column.
n_train = len(train_with_pred)
n_test = len(test_with_pred)
print("\nFinal Results:")
print(f"  Dataset size: {n_train + n_test} samples")
print(f"  Train: {n_train} samples")
print(f"  Test:  {n_test} samples")
print(f"  Features: {len(FEATURES)}")
print(f"  Train AUC: {auc_train:.4f}")
print(f"  Test AUC:  {auc_test:.4f}")
print("=" * 60)


ALL TESTS COMPLETED SUCCESSFULLY

Functions tested:
   1. data_loader
   2. data_split
   3. data_remove_nans
   4. data_fill_nans
   5. data_encoding
   6. data_binary
   7. data_train_models
   8. add_predictions
   9. pred_auc_score
  10. data_predict

Final Results:
  Dataset size: 9368 samples
  Train: 6547 samples
  Test:  2821 samples
  Features: 10
  Train AUC: 0.6761
  Test AUC:  0.6559
