In [None]:
# Imports and dataset
# Load libraries and the Iris dataset so every cell can use X, y, and Xs (scaled features)
import warnings

import numpy as np
import numpy.linalg as la
import pandas as pd
from scipy.stats import bernoulli, poisson
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE: this silences every warning for the rest of the session, including
# deprecation notices from the libraries used below; narrow or remove it
# when debugging unexpected results.
warnings.filterwarnings("ignore")

# Load the Iris dataset: X is the feature matrix, y the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Standardized copy of the features (the scaler is fit on the full dataset).
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# Build the preview frame column-wise so `target` keeps the dtype of y;
# stacking with np.c_[X, y] would store the labels as floats next to the
# float features.
df = pd.DataFrame(X, columns=iris['feature_names']).assign(target=y)
df.head()

In [None]:
# K-Fold cross-validation:
# K-Fold splits the data into k parts, trains on k-1 parts and tests on the remaining part,
# repeating so each part is used as the test set once.
# The scaler is placed inside the pipeline so it is re-fit on the training folds only;
# scoring on the pre-scaled Xs would let statistics of the test fold leak into training.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
svc = make_pipeline(StandardScaler(), SVC())
scores = cross_val_score(svc, X, y, cv=kf)
print('K-Fold SVC scores:', scores)
print('Mean accuracy:', scores.mean())

In [None]:
# Bootstrapping:
# Bootstrapping resamples the dataset with replacement to estimate variability (e.g., accuracy),
# and uses out-of-bag samples to test each bootstrap model.
# A dedicated seeded generator makes the resampling (and therefore the printed
# mean and interval) reproducible across kernel restarts.
boot_rng = np.random.default_rng(0)
n_boot = 200  # number of bootstrap resamples
dt = DecisionTreeClassifier(random_state=0)
n = X.shape[0]
all_idx = np.arange(n)
boot_scores = []
for _ in range(n_boot):
    # Draw n row indices with replacement: the bootstrap training sample.
    idx = boot_rng.choice(n, size=n, replace=True)
    # Rows never drawn form the out-of-bag test set (setdiff1d de-duplicates idx itself).
    oob_idx = np.setdiff1d(all_idx, idx)
    if oob_idx.size == 0:
        # No held-out rows for this resample, so there is nothing to score.
        continue
    dt.fit(X[idx], y[idx])
    boot_scores.append(dt.score(X[oob_idx], y[oob_idx]))
boot_scores = np.array(boot_scores)
print('Bootstrap OOB mean accuracy:', boot_scores.mean())
print('95% CI:', np.percentile(boot_scores, [2.5, 97.5]))

In [None]:
# PCA and SVD:
# PCA projects the standardized features onto the directions of greatest variance
# (the principal components); only the first two are kept here.
# SVD factorizes the same matrix into singular vectors and singular values.
pca = PCA(n_components=2)
Xp = pca.fit_transform(Xs)
print('PCA shape:', Xp.shape)

# Reduced ("economy") SVD: full_matrices=False.
u, s, vh = la.svd(Xs, full_matrices=False)
print('SVD shapes U,S,Vh:', *(factor.shape for factor in (u, s, vh)))

In [None]:
# LDA (Linear Discriminant Analysis):
# A supervised projection: LDA uses the class labels to find the linear
# combinations of features that best separate the classes.
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(Xs, y)
Xl = lda.transform(Xs)
print('LDA shape:', Xl.shape)

In [None]:
# Bernoulli and Poisson distributions (random samples):
# Bernoulli models binary outcomes (success/failure).
# Poisson models counts (number of events in a fixed interval).
# random_state pins the draws so the printed samples are identical on every run.
bernoulli_samples = bernoulli.rvs(0.4, size=10, random_state=42)
poisson_samples = poisson.rvs(mu=3, size=10, random_state=42)
print('Bernoulli samples (p=0.4):', bernoulli_samples)
print('Poisson samples (mu=3):', poisson_samples)

In [None]:
# Gaussian Naive Bayes:
# Models each feature as normally distributed within a class and treats the
# features as conditionally independent given the class.
gnb = GaussianNB()
gnb_scores = cross_val_score(gnb, Xs, y, cv=kf)
print('GaussianNB CV mean accuracy:', gnb_scores.mean())

In [None]:
# Support Vector Machine (SVM):
# SVM finds a boundary (hyperplane) that separates classes with maximum margin.
# Scaling happens inside the pipeline so each CV split fits the scaler on its
# training folds only, instead of reusing Xs, which was scaled on the whole dataset.
svm = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='scale'))
print('SVM CV mean accuracy:', cross_val_score(svm, X, y, cv=kf).mean())

In [None]:
# K-Nearest Neighbors (KNN):
# KNN classifies a sample by majority vote among its k nearest neighbors.
# Distances depend on feature scale, so the scaler is part of the pipeline and is
# re-fit on the training folds of every split (no leakage from the test fold).
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
print('KNN CV mean accuracy:', cross_val_score(knn, X, y, cv=kf).mean())

In [None]:
# Random Forest:
# An ensemble of decision trees whose votes are combined, which typically
# improves accuracy and reduces overfitting compared with a single tree.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf_scores = cross_val_score(rf, Xs, y, cv=kf)
print('Random Forest CV mean accuracy:', rf_scores.mean())

In [None]:
# Decision Tree:
# A single tree that reaches a prediction through a sequence of
# feature-threshold (yes/no) splits.
dt2 = DecisionTreeClassifier(random_state=0)
dt2_scores = cross_val_score(dt2, Xs, y, cv=kf)
print('Decision Tree CV mean accuracy:', dt2_scores.mean())

In [None]:
# Simple (univariate) Linear Regression:
# Simple regression predicts a continuous target using a single feature.
# Here petal length (column 2) predicts petal width (column 3). The class labels in y
# are categorical codes, not a continuous quantity, so they are not used as the target.
petal_length = X[:, 2:3]  # 2-D slice: estimators expect shape (n_samples, n_features)
petal_width = X[:, 3]
X_train, X_test, y_train, y_test = train_test_split(
    petal_length, petal_width, test_size=0.2, random_state=1
)
lr_simple = LinearRegression()
lr_simple.fit(X_train, y_train)
y_pred = lr_simple.predict(X_test)
print('Coef:', lr_simple.coef_, 'Intercept:', lr_simple.intercept_)
print('MSE:', mean_squared_error(y_test, y_pred), 'R2:', r2_score(y_test, y_pred))

In [None]:
# Multiple Linear Regression:
# Multiple regression predicts a continuous target using multiple features.
# The first three measurements (columns 0-2) predict petal width (column 3); the class
# labels in y are categorical codes, so they are not used as a regression target.
features_multi = X[:, :3]
target_multi = X[:, 3]
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    features_multi, target_multi, test_size=0.2, random_state=2
)
lr_multi = LinearRegression()
lr_multi.fit(Xm_train, ym_train)
ym_pred = lr_multi.predict(Xm_test)
print('Coefs:', lr_multi.coef_, 'Intercept:', lr_multi.intercept_)
print('MSE:', mean_squared_error(ym_test, ym_pred), 'R2:', r2_score(ym_test, ym_pred))

In [None]:
# Logistic Regression:
# Logistic regression models the probability of class membership (classification) using a logistic function.
# The `multi_class` argument is deprecated in recent scikit-learn releases and is omitted:
# with the lbfgs solver a multiclass target is already fit as a multinomial model.
# The scaler sits inside the pipeline so it is fit on the training folds of each split only.
logreg = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs', max_iter=200))
print('Logistic Regression CV mean accuracy:', cross_val_score(logreg, X, y, cv=kf).mean())