In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# functions

In [48]:
def read_csv(file_path):
    """Reads a CSV file into a Pandas DataFrame."""
    return pd.read_csv(file_path)


def create_feature():
    """Creates feature functions (currently empty)."""
    pass


def train_classifier(
    data, n_estimators=100, max_depth=None, min_samples_split=2, random_state=42
):
    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return model, accuracy


def objective(params):
    """
    Calculates the cross-validation score for a RandomForestClassifier.
    """

    model = RandomForestClassifier()
    scores = cross_val_score(model, X, Y, cv=5).mean()
    return scores


def evaluate_model(model, X_test, Y_test):
    """
    Evaluates the trained model on the given data.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    return accuracy

In [49]:
data = read_csv("C:/Users/asus/Desktop/MLOOPS/session-datasets/session-datasets/datasets-session-17/diabetes.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# pipeline


In [50]:
# Assume you have your data in a pandas DataFrame called 'data'
# and target variable in 'target'

# Example data (replace with your actual data)
data = data

# Split the data into training and testing sets
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


In [None]:
from sklearn import set_config

set_config(display="diagram")

p1 = ColumnTransformer(
    [("preprocessor", StandardScaler(), X.columns)], remainder="passthrough"
)
# from hyper parameters that are find at the bottom of this page

p2 = RandomForestClassifier(n_estimators=175, max_depth=12)

pipeline = Pipeline([("p1", p1), ("p2", p2)])
pipeline.fit(X_train, y_train)

In [69]:
from sklearn.model_selection import cross_val_score

# Make predictions
y_pred = pipeline.predict(X_test)

# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

overall_accuracy = cross_val_score(pipeline, X, Y, cv=5).mean()
print(overall_accuracy)

Accuracy: 0.7337662337662337
0.7760971055088701


# random sample test (my generated)

In [71]:
my_sample = np.array([6, 148, 72, 35, 0, 33.6, 0.627, 50]).reshape(1, -1)
my_sample_df = pd.DataFrame(my_sample , columns = X.columns)

prediction = pipeline.predict(my_sample_df)

print(prediction)
print(my_sample_df)

[1]
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0      0.0  33.6   

   DiabetesPedigreeFunction   Age  
0                     0.627  50.0  


# hyper parameter tuning

In [None]:
# from hyperopt import hp

# space = {
#     "nestimator": hp.quniform("nestimator", 50, 500, 25),
#     "maxdepth": hp.quniform("maxdepth", 4, 15, 1),
# }

# best_paramas = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

# best_paramas

100%|██████████| 100/100 [01:01<00:00,  1.61trial/s, best loss: 0.7539767422120363]


{'maxdepth': 12.0, 'nestimator': 175.0}