preprocess.py

In [4]:
# src/preprocess.py

import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(filepath):
    """Read the CSV at ``filepath`` and return it as a pandas DataFrame.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts as its first
            argument (for example a path string).

    Returns:
        The parsed ``DataFrame``.
    """
    return pd.read_csv(filepath)

def preprocess_data(data, *, target='species', test_size=0.2, random_state=420):
    """Split a DataFrame into train/test feature and label sets.

    The defaults reproduce the original hard-coded behaviour, so
    ``preprocess_data(data)`` returns the same split as before.

    Args:
        data: DataFrame containing the feature columns and the label column.
        target: Name of the label column. It is removed from the features
            and returned separately as ``y``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``; keeping it
            fixed makes the split repeatable across calls on the same data.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``target`` is not a column of ``data``.
    """
    features = data.drop(columns=[target])
    labels = data[target]
    return train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

# Script entry point: load the iris CSV and split it into train/test sets.
# The results are bound at module level and are not written anywhere.
if __name__ == "__main__":
    # Path is relative to the current working directory.
    data = load_data('data/iris_data.csv')
    X_train, X_test, y_train, y_test = preprocess_data(data)
    # Save preprocessed data if needed (nothing is persisted at the moment).

In [6]:
# src/train.py

import joblib
from sklearn.ensemble import RandomForestClassifier
# from preprocess import load_data, preprocess_data            #loading from preprocessing script

def train_model(X_train, y_train):
    """Fit a random forest classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels, row-aligned with ``X_train``.

    Returns:
        The fitted ``RandomForestClassifier`` (100 trees, seed 420).
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly from the chained call.
    return RandomForestClassifier(n_estimators=100, random_state=420).fit(
        X_train, y_train
    )

# Script entry point: train the classifier on the iris data and persist it.
if __name__ == "__main__":
    import os  # local import: only this entry point needs it

    # load_data / preprocess_data must already be in scope here; the
    # import from the preprocessing script above is commented out.
    data = load_data('data/iris_data.csv')
    X_train, X_test, y_train, y_test = preprocess_data(data)
    model = train_model(X_train, y_train)

    # joblib.dump does not create missing parent directories, so make sure
    # the output directory exists before writing the model file.
    os.makedirs('model', exist_ok=True)
    joblib.dump(model, 'model/random_forest_model.pkl')

In [8]:
# src/evaluate.py

import joblib
from sklearn.metrics import accuracy_score
# from preprocess import load_data, preprocess_data                #loading from preprocessing script

def evaluate_model(model, X_test, y_test):
    """Score ``model`` on the held-out split.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The value of ``accuracy_score(y_test, model.predict(X_test))``.
    """
    return accuracy_score(y_test, model.predict(X_test))

# Script entry point: reload the saved model and report its test accuracy.
if __name__ == "__main__":
    # load_data / preprocess_data must already be in scope here; the
    # import from the preprocessing script above is commented out.
    data = load_data('data/iris_data.csv')
    # Re-derive the split rather than loading it from disk. preprocess_data
    # uses a fixed random_state, so this is the same hold-out set that was
    # excluded during training, as long as the CSV is unchanged.
    X_train, X_test, y_train, y_test = preprocess_data(data)
    # NOTE(review): joblib.load unpickles the file; only load model files
    # from a trusted source.
    model = joblib.load('model/random_forest_model.pkl')
    accuracy = evaluate_model(model, X_test, y_test)
    # Report the accuracy as a percentage with two decimals.
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 100.00%


In [11]:
# tests/test_preprocess.py

import pytest
# from src.preprocess import load_data, preprocess_data              #loading from preprocessing script

def test_load_data():
    """The iris CSV loads into a non-empty DataFrame."""
    assert not load_data('data/iris_data.csv').empty

def test_preprocess_data():
    """preprocess_data yields a non-empty, consistent train/test split."""
    data = load_data('data/iris_data.csv')
    X_train, X_test, y_train, y_test = preprocess_data(data)

    assert len(X_train) > 0
    assert len(X_test) > 0

    # Features and labels must stay row-aligned within each partition.
    assert len(X_train) == len(y_train)
    assert len(X_test) == len(y_test)

    # Every input row must land in exactly one partition.
    assert len(X_train) + len(X_test) == len(data)

    # The label column must not leak into the feature matrices.
    assert 'species' not in X_train.columns
    assert 'species' not in X_test.columns