In [16]:
import pickle

import numpy as np
import pandas as pd
import yaml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
# --- Data ---------------------------------------------------------------
DATA: str = 'iris.csv'    # CSV file that is read into the working DataFrame
target: str = 'target'    # name of the label column inside that file

# --- Split / model ------------------------------------------------------
TEST_SIZE: float = 0.2    # passed as test_size to train_test_split
SEED: int = 666           # random_state for both the split and the forest

In [18]:
# Load the full dataset from the CSV path configured in DATA.
df = pd.read_csv(filepath_or_buffer=DATA)

In [19]:
# Every column except the label column is used as a model input.
features = [*df.columns]
features.remove(target)  # raises ValueError if the label column is absent

# Feature matrix and (still un-encoded) label vector.
X, y = df[features], df[target]

In [20]:
# Encode the class labels as integers for the classifier.
#
# The encoder is fitted on the raw label column (df[target]) rather than on
# `y`, because this cell overwrites `y` with the encoded values. Fitting on
# `y` would make a second run of the cell learn the integer codes as the
# "classes", and le.inverse_transform() in later cells would then return
# integers instead of the original labels.
#
# fit_transform() already fits the encoder, so no separate fit() call is needed.
le = LabelEncoder()
y = le.fit_transform(df[target])

In [21]:
# Hold out TEST_SIZE of the rows for evaluation; the seed pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SEED,
    test_size=TEST_SIZE,
)


In [22]:
# Random forest with library-default hyper-parameters; only the seed is set.
random_forest = RandomForestClassifier(random_state=SEED)
# Train on the training partition only.
random_forest.fit(X=X_train, y=y_train)

In [23]:
# Encoded class predictions for the held-out rows.
preds = random_forest.predict(X=X_test)

In [24]:
# Score the held-out predictions against the true (encoded) labels.
accuracy = accuracy_score(y_true=y_test, y_pred=preds)

# Precision, recall and F1 all use the same micro-averaging.
# NOTE(review): for single-label targets, micro-averaged precision, recall and
# F1 are each equal to accuracy, so these four numbers will coincide. Consider
# 'macro' or 'weighted' if per-class behaviour matters - confirm with whoever
# consumes the metrics file before changing it.
precision, recall, f1 = (
    scorer(y_true=y_test, y_pred=preds, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

In [25]:
# Collect the scores as plain Python floats (not NumPy scalars) so that they
# serialise as ordinary numbers when written out.
metrics = {
    'accuracy_rf': float(accuracy),
    'precision_rf': float(precision),
    'recall_rf': float(recall),
    'f1_rf': float(f1),
}

In [26]:
# Write the metrics mapping to disk as block-style YAML.
with open('metrics_random_forest.yaml', mode='w') as metrics_file:
    yaml.dump(metrics, stream=metrics_file, default_flow_style=False)

In [27]:
# Actual vs. predicted classes for the test partition, decoded back to the
# original label values with the fitted encoder.
test_classes = pd.DataFrame(
    {
        'actual_class': le.inverse_transform(y_test),
        'predicted_class': le.inverse_transform(preds),
    }
)

In [28]:
# Save the test-set comparison table without the row index column.
test_classes.to_csv(path_or_buf='test_classes_random_forest.csv', index=False)


In [29]:
# Actual vs. predicted classes for the training partition; the model is asked
# to re-predict the rows it was fitted on, and both columns are decoded back
# to the original label values.
train_preds = random_forest.predict(X_train)
train_classes = pd.DataFrame(
    {
        'actual_class': le.inverse_transform(y_train),
        'predicted_class': le.inverse_transform(train_preds),
    }
)


In [30]:
# Save the training-set comparison table without the row index column.
train_classes.to_csv(path_or_buf='train_classes_random_forest.csv', index=False)


In [31]:
# Persist the fitted classifier to disk (`pickle` is imported in the
# notebook's top import cell together with the other dependencies).
#
# NOTE: unpickling executes arbitrary code, so this file should only ever be
# loaded from a trusted location.
with open('model_random_forest.pckl', 'wb') as file:
    pickle.dump(random_forest, file)