### AI/ML – Improving Model Performance with Clean Data

**Task 1**: Data Preprocessing for Models

**Objective**: Enhance data quality for better AI/ML outcomes.

**Steps**:
1. Choose a dataset for training an AI/ML model.
2. Identify common data issues like null values, redundant features, or noisydata.
3. Apply preprocessing methods such as imputation, normalization, or feature engineering.

In [6]:
def preprocess_data(X_train, X_test):
    try:
        numeric_features = ["Age", "Income"]
        categorical_features = ["Gender"]

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features)
            ]
        )

        preprocessor.fit(X_train)
        X_train_processed = preprocessor.transform(X_train)
        X_test_processed = preprocessor.transform(X_test)
        return X_train_processed, X_test_processed

    except Exception as e:
        print(f"Error during preprocessing: {e}")
        raise

def train_model(X_train, y_train):
    try:
        model = LogisticRegression()
        model.fit(X_train, y_train)
        return model
    except Exception as e:
        print(f"Error during model training: {e}")
        raise

In [7]:
import unittest
import numpy as np
import pandas as pd

class TestDataQualityPipeline(unittest.TestCase):

    def test_imputation(self):
        from sklearn.impute import SimpleImputer
        data = pd.DataFrame({"Age": [25, np.nan, 30]})
        imputer = SimpleImputer(strategy="mean")
        imputed = imputer.fit_transform(data)
        self.assertAlmostEqual(imputed[1][0], 27.5)

    def test_model_training(self):
        from sklearn.linear_model import LogisticRegression
        X = np.array([[1, 2], [2, 3], [3, 4]])
        y = np.array([0, 1, 0])
        model = LogisticRegression()
        model.fit(X, y)
        self.assertEqual(len(model.coef_[0]), 2)

if __name__ == "__main__":
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

..
----------------------------------------------------------------------
Ran 2 tests in 0.007s

OK


In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Simulate dataset with nulls, redundant features, and mixed data types
data = {
    "Age": [25, np.nan, 35, 40, np.nan],
    "Income": [50000, 60000, np.nan, 80000, 55000],
    "Gender": ["Male", "Female", "Female", None, "Male"],
    "Purchased": [1, 0, 1, 0, 1],
    "RedundantFeature": [1, 1, 1, 1, 1]  # constant feature, redundant
}

df = pd.DataFrame(data)

# Step 1: Drop redundant feature
df_clean = df.drop(columns=["RedundantFeature"])

# Step 2: Define preprocessing for numeric and categorical features
numeric_features = ["Age", "Income"]
categorical_features = ["Gender"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),       # fill nulls with mean
    ("scaler", StandardScaler())                        # normalize numeric data
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # fill nulls with mode
    ("onehot", OneHotEncoder(handle_unknown="ignore"))     # one-hot encode
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Apply preprocessing
X = df_clean.drop(columns=["Purchased"])
y = df_clean["Purchased"]

X_processed = preprocessor.fit_transform(X)

print("Processed feature matrix (numpy array):")
print(X_processed)

Processed feature matrix (numpy array):
[[-1.7251639  -1.10448156  0.          1.          0.        ]
 [ 0.         -0.12272017  1.          0.          0.        ]
 [ 0.34503278  0.          1.          0.          0.        ]
 [ 1.38013112  1.8408026   0.          0.          1.        ]
 [ 0.         -0.61360087  0.          1.          0.        ]]


In [9]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Simulate dataset with nulls, redundant features, and mixed data types
data = {
    "Age": [25, np.nan, 35, 40, np.nan, 29, 33, 38, 45, 50],
    "Income": [50000, 60000, np.nan, 80000, 55000, 52000, 48000, 75000, np.nan, 82000],
    "Gender": ["Male", "Female", "Female", None, "Male", "Female", "Male", "Female", "Male", None],
    "Purchased": [1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
    "RedundantFeature": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # constant redundant feature
}

df = pd.DataFrame(data)

# Split features and target
X = df.drop(columns=["Purchased"])
y = df["Purchased"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# === Without Preprocessing ===
# We drop the redundant feature but do not handle missing data or encoding
X_train_no_prep = X_train.drop(columns=["RedundantFeature"])
X_test_no_prep = X_test.drop(columns=["RedundantFeature"])

# For model input without preprocessing, fill missing numerics with -1 and categorical with 'missing'
X_train_no_prep = X_train_no_prep.copy()
X_test_no_prep = X_test_no_prep.copy()

X_train_no_prep["Age"] = X_train_no_prep["Age"].fillna(-1)
X_test_no_prep["Age"] = X_test_no_prep["Age"].fillna(-1)

X_train_no_prep["Income"] = X_train_no_prep["Income"].fillna(-1)
X_test_no_prep["Income"] = X_test_no_prep["Income"].fillna(-1)

X_train_no_prep["Gender"] = X_train_no_prep["Gender"].fillna("missing")
X_test_no_prep["Gender"] = X_test_no_prep["Gender"].fillna("missing")

# Convert categorical gender to numeric codes simply (without one-hot)
X_train_no_prep["Gender"] = X_train_no_prep["Gender"].astype('category').cat.codes
X_test_no_prep["Gender"] = X_test_no_prep["Gender"].astype('category').cat.codes

model_no_prep = LogisticRegression()
model_no_prep.fit(X_train_no_prep, y_train)
y_pred_no_prep = model_no_prep.predict(X_test_no_prep)

acc_no_prep = accuracy_score(y_test, y_pred_no_prep)
print("=== Model performance WITHOUT preprocessing ===")
print(f"Accuracy: {acc_no_prep:.3f}")
print(classification_report(y_test, y_pred_no_prep))


# === With Preprocessing ===
# Drop redundant feature
X_train_prep = X_train.drop(columns=["RedundantFeature"])
X_test_prep = X_test.drop(columns=["RedundantFeature"])

numeric_features = ["Age", "Income"]
categorical_features = ["Gender"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Create a pipeline that applies preprocessing then LogisticRegression
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])

model_pipeline.fit(X_train_prep, y_train)
y_pred_prep = model_pipeline.predict(X_test_prep)

acc_prep = accuracy_score(y_test, y_pred_prep)
print("\n=== Model performance WITH preprocessing ===")
print(f"Accuracy: {acc_prep:.3f}")
print(classification_report(y_test, y_pred_prep))

=== Model performance WITHOUT preprocessing ===
Accuracy: 0.000
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0


=== Model performance WITH preprocessing ===
Accuracy: 0.000
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0



**Task 2**: Evaluate Model Performance

**Objective**: Assess the impact of data quality improvements on model performance.

**Steps**:
1. Train a simple ML model with and without preprocessing.
2. Analyze and compare model performance metrics to evaluate the impact of data quality strategies.

In [10]:
# Write your code from here
