# # Data Splitting and Model Evaluation

# # Data Splitting Methods
# - Random Train-Test Split
# - Stratified Train-Test Split
# - Holdout Method
# - Cross Validation
# - Leave One Out Cross Validation

In [None]:
# Prepare the data
import pandas as pd

# Read the Iris CSV from disk into a DataFrame
df = pd.read_csv('data/Iris.csv')

# Summary statistics for every column (include='all' covers non-numeric columns too)
summary = df.describe(include='all')
print(summary)

# Count how many rows fall into each Species class
target_count = df['Species'].value_counts()
print(target_count)

In [ ]:
# # Random Train-Test Split
from sklearn.model_selection import train_test_split

# Every column except the last is used as a feature; the last column is the target
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed seed makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

# Report the dimensions of each partition
print('Random Train-Test Split, Training set shape:', X_train.shape, y_train.shape)
print('Random Train-Test Split, Test set shape:', X_test.shape, y_test.shape)

# Report how the target classes are distributed within each partition
for heading, labels in (
    ('Training set class distribution:', y_train),
    ('Test set class distribution:', y_test),
):
    print(heading)
    print(labels.value_counts())

In [ ]:
# # Stratified Train-Test Split
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified sampling object: one shuffled 70/30 split, seeded for repeatability
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# Every column except the last is used as a feature; the last column is the target
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# n_splits=1, so the splitter yields exactly one pair of positional index arrays
train_index, test_index = next(stratified_split.split(features, target))

# Materialise the training and test partitions from those positions
X_train_strat = features.iloc[train_index]
X_test_strat = features.iloc[test_index]
y_train_strat = target.iloc[train_index]
y_test_strat = target.iloc[test_index]

# Report the dimensions of each partition
print('Stratified Train-Test Split, Training set shape:', X_train_strat.shape, y_train_strat.shape)
print('Stratified Train-Test Split, Test set shape:', X_test_strat.shape, y_test_strat.shape)

# Report how the target classes are distributed within each partition
for heading, labels in (
    ('Training set class distribution:', y_train_strat),
    ('Test set class distribution:', y_test_strat),
):
    print(heading)
    print(labels.value_counts())

In [ ]:
# # # Holdout Validation with Validation Set
# Target proportions: 60% training, 20% validation, 20% test
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# First cut: keep 60% of the rows for training and set the other 40% aside
X_train, X_temp, y_train, y_temp = train_test_split(
    features, target, test_size=0.4, random_state=42
)
# Second cut: halve the 40% remainder into validation and test (20% each overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the dimensions of the three partitions
print('Holdout Validation with Validation Set, Training set shape:', X_train.shape, y_train.shape)
print('Holdout Validation with Validation Set, Validation set shape:', X_val.shape, y_val.shape)
print('Holdout Validation with Validation Set, Test set shape:', X_test.shape, y_test.shape)

# Report how the target classes are distributed within each partition
for heading, labels in (
    ('Training set class distribution:', y_train),
    ('Validation set class distribution:', y_val),
    ('Test set class distribution:', y_test),
):
    print(heading)
    print(labels.value_counts())

In [ ]:
# # # K-Fold Cross-Validation
from sklearn.model_selection import KFold

# Create K-Fold cross-validation object: 5 shuffled folds, seeded for repeatability
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Every column except the last is used as a feature; the last column is the target
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Walk through every fold and report on each one. Reporting inside the loop
# matters: the fold variables are overwritten on each iteration, so reporting
# after the loop would only ever describe the final fold.
for fold, (train_index, test_index) in enumerate(k_fold.split(features, target), start=1):
    X_train_kfold, X_test_kfold = features.iloc[train_index], features.iloc[test_index]
    y_train_kfold, y_test_kfold = target.iloc[train_index], target.iloc[test_index]

    print('Fold:', fold)
    # Display the shape of the training and test sets
    print('K-Fold Cross-Validation, Training set shape:', X_train_kfold.shape, y_train_kfold.shape)
    print('K-Fold Cross-Validation, Test set shape:', X_test_kfold.shape, y_test_kfold.shape)

    # Display the class distribution of the training and test sets
    print('Training set class distribution:')
    print(y_train_kfold.value_counts())
    print('Test set class distribution:')
    print(y_test_kfold.value_counts(), '\n')

# After the loop, the *_kfold variables still hold the last fold's partitions.

In [ ]:
# # Stratified K-Fold Cross-Validation
from sklearn.model_selection import StratifiedKFold

# Create stratified K-Fold cross-validation object: 5 shuffled folds, seeded for repeatability
stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Every column except the last is used as a feature; the last column is the target
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Walk through every fold and report on each one. Reporting inside the loop
# matters: the fold variables are overwritten on each iteration, so reporting
# after the loop would only ever describe the final fold.
for fold, (train_index, test_index) in enumerate(stratified_k_fold.split(features, target), start=1):
    X_train_strat_kfold, X_test_strat_kfold = features.iloc[train_index], features.iloc[test_index]
    y_train_strat_kfold, y_test_strat_kfold = target.iloc[train_index], target.iloc[test_index]

    print('Fold:', fold)
    # Display the shape of the training and test sets
    print('Stratified K-Fold Cross-Validation, Training set shape:', X_train_strat_kfold.shape, y_train_strat_kfold.shape)
    print('Stratified K-Fold Cross-Validation, Test set shape:', X_test_strat_kfold.shape, y_test_strat_kfold.shape)

    # Display the class distribution of the training and test sets
    print('Training set class distribution:')
    print(y_train_strat_kfold.value_counts())
    print('Test set class distribution:')
    print(y_test_strat_kfold.value_counts(), '\n')

# After the loop, the *_strat_kfold variables still hold the last fold's partitions.

In [ ]:
# # Leave-One-Out Cross-Validation
from sklearn.model_selection import LeaveOneOut

# Create Leave-One-Out cross-validation object
leave_one_out = LeaveOneOut()

# Every column except the last is used as a feature; the last column is the target
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Split dataset into training and test sets using Leave-One-Out cross-validation.
# enumerate supplies the zero-based iteration number instead of a hand-maintained counter.
for i, (train_index, test_index) in enumerate(leave_one_out.split(features, target)):
    X_train_loo, X_test_loo = features.iloc[train_index], features.iloc[test_index]
    y_train_loo, y_test_loo = target.iloc[train_index], target.iloc[test_index]

    print("Loop: ", i)
    # Display the shape of the training and test sets
    print('Leave-One-Out Cross-Validation, Training set shape:', X_train_loo.shape, y_train_loo.shape)
    print('Leave-One-Out Cross-Validation, Test set shape:', X_test_loo.shape, y_test_loo.shape)

    # Display the class distribution of the training and test sets
    print('Training set class distribution:')
    print(y_train_loo.value_counts())
    print('Test set class distribution:')
    print(y_test_loo.value_counts(), '\n')

# # Model Evaluation Metrics

# # Regression Metrics

In [ ]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import Regression models
from sklearn.linear_model import LinearRegression

# Load the dataset
import pandas as pd
df = pd.read_csv('data/Iris.csv')

# Encode the Species labels as integers so they can serve as a numeric target
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Species'] = label_encoder.fit_transform(df['Species'])

from sklearn.model_selection import train_test_split

# Features are every column except the target.
# NOTE(review): this assumes Species is the last column, as the positional
# split used previously did - confirm against the CSV. Some Iris.csv variants
# also carry a row-number 'Id' column; a row number is not a measurement and
# can leak the target when rows are ordered by class, so it is dropped when
# present (errors='ignore' makes this a no-op otherwise).
X = df.drop(columns=['Species', 'Id'], errors='ignore')
y = df['Species']

# Split the dataset into 70% training and 30% test (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a linear regression on the training partition and predict the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate the regression metrics, rounded to two decimals for display
mse = round(mean_squared_error(y_test, y_pred), 2)
mae = round(mean_absolute_error(y_test, y_pred), 2)
r2 = round(r2_score(y_test, y_pred), 2)
print(f'Mean Squared Error: {mse}, Mean Absolute Error: {mae}, R2 Score: {r2}', '\n')

In [ ]:
# # Classification Metrics
# Import libraries
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Load the dataset
import pandas as pd
df = pd.read_csv('data/Iris.csv')

# Encode the Species labels as integers; label_encoder.classes_ keeps the original names
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Species'] = label_encoder.fit_transform(df['Species'])

from sklearn.model_selection import train_test_split

# Features are every column except the target.
# NOTE(review): this assumes Species is the last column, as the positional
# split used previously did - confirm against the CSV. Some Iris.csv variants
# also carry a row-number 'Id' column; a row number is not a measurement and
# can leak the target when rows are ordered by class, so it is dropped when
# present (errors='ignore' makes this a no-op otherwise).
X = df.drop(columns=['Species', 'Id'], errors='ignore')
y = df['Species']

# Split the dataset into 70% training and 30% test (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Import classifiers, DT
from sklearn.tree import DecisionTreeClassifier

# Seed the tree as well, matching the seeded splits, so the metrics are repeatable
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Calculate the classification report
print('Classification Report:')
print(classification_report(y_test, y_pred), '\n')

# Calculate accuracy plus support-weighted precision, recall and F1
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}', '\n')

# Visualize the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fix the row/column order to the encoder's class order so the tick labels
# line up with the matrix even if a class is missing from the test partition
class_ids = list(range(len(label_encoder.classes_)))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()