#**Splitting Data in Python**

The `train_test_split()` function in scikit-learn's `sklearn.model_selection` module divides a DataFrame into two subsets as specified by the user. Its parameters are outlined in the scikit-learn documentation. The function uses a random number generator to create the training-validation-test divisions. Passing a seed value through the `random_state` parameter ensures consistent splits across code runs.  

The code below does so for the bad drivers dataset using the proportions 70%/10%/20% for training, validation, and test data, respectively.



In [None]:
# Import packages
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the bad drivers data from the CSV file 'bad-drivers.csv' (relative path) into a DataFrame
badDrivers = pd.read_csv('bad-drivers.csv')

In [None]:
# Display the bad drivers data (the bare expression is the last statement of the cell)
badDrivers

In [None]:
# Set the proportions of the training-validation-test split
trainingProportion = 0.70
validationProportion = 0.10
testProportion = 0.20

# Seed for the random number generator so the same split is produced on every run
splitSeed = 123

# Split off the test data
trainingAndValidationData, testData = train_test_split(
    badDrivers, test_size=testProportion, random_state=splitSeed
)

# Split the remaining into training and validation data.
# train_size is a fraction of the remaining (training + validation) rows, so the
# training proportion is rescaled: 0.70 / (0.70 + 0.10) = 0.875.
trainingData, validationData = train_test_split(
    trainingAndValidationData,
    train_size=trainingProportion / (trainingProportion + validationProportion),
    random_state=splitSeed,
)

In [None]:
# Display the training subset produced by the second split
trainingData

In [None]:
# Display the validation subset produced by the second split
validationData

In [None]:
# Display the test subset that was split off first
testData

In [None]:
# Scatter plot of insurance premiums against collision losses for the entire sample
lossesColumn = 'Losses incurred by insurance companies for collisions per insured driver ($)'
premiumsColumn = 'Car Insurance Premiums ($)'
plt.scatter(badDrivers[[lossesColumn]], badDrivers[[premiumsColumn]])
plt.title('Sample data')
plt.xlabel('Losses incurred by insurance companies', fontsize=14)
plt.ylabel('Car insurance premiums', fontsize=14)
# Axis limits are set explicitly (after plotting) rather than left to autoscaling
plt.xlim(80, 200)
plt.ylim(600, 1400)
plt.show()

In [None]:
# Scatter plot of insurance premiums against collision losses for the training data
lossesColumn = 'Losses incurred by insurance companies for collisions per insured driver ($)'
premiumsColumn = 'Car Insurance Premiums ($)'
plt.scatter(trainingData[[lossesColumn]], trainingData[[premiumsColumn]])
plt.title('Training data')
plt.xlabel('Losses incurred by insurance companies', fontsize=14)
plt.ylabel('Car insurance premiums', fontsize=14)
# Axis limits are set explicitly (after plotting) rather than left to autoscaling
plt.xlim(80, 200)
plt.ylim(600, 1400)
plt.show()

In [None]:
# Scatter plot of insurance premiums against collision losses for the validation data
lossesColumn = 'Losses incurred by insurance companies for collisions per insured driver ($)'
premiumsColumn = 'Car Insurance Premiums ($)'
plt.scatter(validationData[[lossesColumn]], validationData[[premiumsColumn]])
plt.title('Validation data')
plt.xlabel('Losses incurred by insurance companies', fontsize=14)
plt.ylabel('Car insurance premiums', fontsize=14)
# Axis limits are set explicitly (after plotting) rather than left to autoscaling
plt.xlim(80, 200)
plt.ylim(600, 1400)
plt.show()

In [None]:
# Scatter plot of insurance premiums against collision losses for the test data
lossesColumn = 'Losses incurred by insurance companies for collisions per insured driver ($)'
premiumsColumn = 'Car Insurance Premiums ($)'
plt.scatter(testData[[lossesColumn]], testData[[premiumsColumn]])
plt.title('Test data')
plt.xlabel('Losses incurred by insurance companies', fontsize=14)
plt.ylabel('Car insurance premiums', fontsize=14)
# Axis limits are set explicitly (after plotting) rather than left to autoscaling
plt.xlim(80, 200)
plt.ylim(600, 1400)
plt.show()

#**Loss Functions for Regression in Python**

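scikit-learn computes regression model errors with `mean_squared_error()` for MSE and `mean_absolute_error()` for MAE. For root mean squared error (RMSE), take the square root of the MSE; scikit-learn versions before 1.6 also accept `squared=False` in `mean_squared_error()`, and versions 1.4 and later provide `root_mean_squared_error()`. Details are in the respective function documentation.  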

The code below computes all three metrics (MSE, RMSE, and MAE) for the linear and quadratic regression on the tortoise data.

In [None]:
# Import packages
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [None]:
# Load the tortoise data from the CSV file "Tortoises.csv" (relative path) into a DataFrame
tortoise = pd.read_csv("Tortoises.csv")

In [None]:
# Predictor (Length column) and response (Clutch column), each kept as a pandas Series
X, y = tortoise["Length"], tortoise["Clutch"]

In [None]:
# Split data into train and test sets: 30% of the observations are held out for testing,
# and random_state=123 seeds the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [None]:
# Fit a linear model on the training set; the Series are reshaped into (n, 1) columns for scikit-learn
linModel = LinearRegression()
linModel.fit(X_train.values.reshape(-1, 1), y_train.values.reshape(-1, 1))

# Convert the test set to NumPy arrays so it can be indexed by position later on
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

# Predict on the test set and flatten the (n, 1) predictions to one dimension
testPredictions = linModel.predict(X_test.reshape(-1, 1))
y_pred = testPredictions.ravel()

In [None]:
# Display linear model and scatter plot of the test set
plt.scatter(X_test, y_test)
plt.xlabel("Length (mm)", fontsize=14)
plt.ylabel("Clutch size", fontsize=14)
plt.plot(X_test, y_pred, color='red')
plt.ylim([0, 14])

# Draw the residuals (vertical segments from observed to predicted value) of the first
# five test points; slicing avoids an IndexError when the test set has fewer than five points
for xi, yi, yHat in zip(X_test[:5], y_test[:5], y_pred[:5]):
    plt.plot([xi, xi], [yi, yHat], color='grey', linewidth=2)

In [None]:
# Display the mean squared error (MSE) of the linear model's predictions on the test set
metrics.mean_squared_error(y_test, y_pred)

In [None]:
# Display the root mean squared error (RMSE) of the linear model on the test set.
# The squared=False argument of mean_squared_error() was deprecated in scikit-learn 1.4
# and removed in 1.6, so the square root of the MSE is taken explicitly instead.
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Display the mean absolute error (MAE) of the linear model's predictions on the test set
metrics.mean_absolute_error(y_test, y_pred)

In [None]:
# Create a quadratic model using the training set and predictions using the test set
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

# Expand the single feature into polynomial columns; with the default settings of
# PolynomialFeatures these are [1, x, x^2]
quad_features = PolynomialFeatures()
poly = quad_features.fit_transform(X_train.reshape(-1, 1))
poly_reg_model = LinearRegression().fit(poly, y_train)

# Reuse the transformer fitted on the training data: the test data is only transformed,
# never used for fitting
poly_test = quad_features.transform(X_test.reshape(-1, 1))
y_pred = poly_reg_model.predict(poly_test)

In [None]:
# Display quadratic model and scatter plot of the test set
plt.scatter(X_test, y_test)
plt.xlabel("Length (mm)", fontsize=14)
plt.ylabel("Clutch size", fontsize=14)

# Evaluate the fitted quadratic on a dense grid so the curve is drawn smoothly.
# Dedicated names are used so the response variable y defined earlier is not overwritten.
x_curve = np.linspace(X_test.min(), X_test.max(), 100)
y_curve = (
    poly_reg_model.coef_[2] * x_curve**2
    + poly_reg_model.coef_[1] * x_curve
    + poly_reg_model.intercept_
)
plt.plot(x_curve, y_curve, color='red', linewidth=2)
plt.ylim([0, 14])

# Draw the residuals (vertical segments from observed to predicted value) of the first
# five test points; slicing avoids an IndexError when the test set has fewer than five points
for xi, yi, y_hat in zip(X_test[:5], y_test[:5], y_pred[:5]):
    plt.plot([xi, xi], [yi, y_hat], color='grey', linewidth=2)

In [None]:
# Display the mean squared error (MSE) of the quadratic model's predictions on the test set
metrics.mean_squared_error(y_test, y_pred)

In [None]:
# Display the root mean squared error (RMSE) of the quadratic model on the test set.
# The squared=False argument of mean_squared_error() was deprecated in scikit-learn 1.4
# and removed in 1.6, so the square root of the MSE is taken explicitly instead.
np.sqrt(metrics.mean_squared_error(y_test, y_pred))

In [None]:
# Display the mean absolute error (MAE) of the quadratic model's predictions on the test set
metrics.mean_absolute_error(y_test, y_pred)

#**Loss Functions for Classification in Python**

Compute the log loss with `log_loss()` from the `sklearn.metrics` module, passing the true labels as `y_true` and the predicted probabilities as `y_pred`. Other parameters are described in the `log_loss` [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html).  

The code below computes the log loss for a logistic regression model on the Wisconsin breast cancer dataset.

In [None]:
# Import packages and functions
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the Wisconsin Breast Cancer dataset from 'WisconsinBreastCancerDatabase.csv' (relative path) into a DataFrame
WBCD = pd.read_csv('WisconsinBreastCancerDatabase.csv')

In [None]:
# Convert Diagnosis to 0 and 1: 'B' -> 0 and 'M' -> 1.
# The whole column is mapped at once, which yields an integer column instead of writing
# integers element by element into a text column. The identity entries (0 -> 0, 1 -> 1)
# keep this cell safe to re-run after the column has already been converted.
# Any other label becomes NaN.
WBCD['Diagnosis'] = WBCD['Diagnosis'].map({'B': 0, 'M': 1, 0: 0, 1: 1})

In [None]:
# Predictor as an (n, 1) feature matrix and response as an (n, 1) integer column
radiusFrame = WBCD[['Radius mean']]
diagnosisFrame = WBCD[['Diagnosis']]
X = radiusFrame.to_numpy().reshape(-1, 1)
y = diagnosisFrame.to_numpy().reshape(-1, 1).astype(int)

In [None]:
# Split data into train and test sets: 30% of the observations are held out for testing,
# and random_state=123 seeds the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

In [None]:
# Logistic regression predicting diagnosis from tumor radius, fit on the training set only
# The (n, 1) label column is flattened to one dimension before fitting
trainLabels = np.ravel(y_train.astype(int))
logisticModel = LogisticRegression()
logisticModel.fit(X_train, trainLabels)

In [None]:
# Plot the test observations together with the fitted probability curve
plt.scatter(X_test, y_test)

# Evaluate the model on an evenly spaced grid covering the range of the test radii
x_prob = np.linspace(X_test.min(), X_test.max(), 1000)
classProbabilities = logisticModel.predict_proba(x_prob[:, np.newaxis])
# Column 1 holds the predicted probability of class 1
y_prob = classProbabilities[:, 1]

plt.plot(x_prob, y_prob, color='red')
plt.xlabel('Radius mean', fontsize=14)
plt.ylabel('Probability of malignant tumor', fontsize=14)

In [None]:
# Predict the class probabilities for the test set (one column per class)
p_hat = logisticModel.predict_proba(X_test)

In [None]:
# Display the log-loss of the predicted class probabilities against the true test labels
metrics.log_loss(y_test, p_hat)

#**Binary Classification Metrics in Python**


Python computes accuracy, precision, and recall from a binary classifier using `accuracy_score()`, `precision_score()`, and `recall_score()`. Input true and predicted classifications as 0 (neg) and 1 (pos) arrays. Generate ROC curve with `roc_curve()` and `RocCurveDisplay()`.  

The code below calculates metrics for logistic regression on Wisconsin breast cancer data. It also computes scores for varied cutoffs in the same regression.

In [None]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [None]:
# Load breast cancer data and encode the categorical Diagnosis column as integers:
# 'B' -> 0 and 'M' -> 1. The whole column is mapped at once, which yields an integer
# column instead of writing integers element by element into a text column.
# Any other label becomes NaN.
WBCD = pd.read_csv("WisconsinBreastCancerDatabase.csv")
WBCD['Diagnosis'] = WBCD['Diagnosis'].map({'B': 0, 'M': 1})

In [None]:
# Predictor as an (n, 1) feature matrix and response as an (n, 1) integer column
radiusFrame = WBCD[['Radius mean']]
diagnosisFrame = WBCD[['Diagnosis']]
X = radiusFrame.to_numpy().reshape(-1, 1)
y = diagnosisFrame.to_numpy().reshape(-1, 1).astype(int)

In [None]:
# Logistic regression predicting diagnosis from tumor radius (fit on the full data set)
logisticModel = LogisticRegression()
logisticModel.fit(X, np.ravel(y.astype(int)))

# Predicted probability of class 1 for every observation
cutoff = 0.5
yPredictedProb = logisticModel.predict_proba(X)[:, 1]

# Classify as 0 when the probability is below the cutoff and as 1 otherwise
yPredLowCutoff = [0 if prob < cutoff else 1 for prob in yPredictedProb]

In [None]:
# Display accuracy of the cutoff-based classifications (computed on the same data the model was fit to)
metrics.accuracy_score(y, yPredLowCutoff)

In [None]:
# Display precision of the cutoff-based classifications (computed on the same data the model was fit to)
metrics.precision_score(y, yPredLowCutoff)

In [None]:
# Display recall of the cutoff-based classifications (computed on the same data the model was fit to)
metrics.recall_score(y, yPredLowCutoff)

In [None]:
# Plot the ROC curve
# Predicted probability of class 1 for every observation
pred = logisticModel.predict_proba(X)[:, 1]

# False/true positive rates over all thresholds, and the area under the resulting curve
fpr, tpr, thresholds = metrics.roc_curve(y, pred)
roc_auc = metrics.auc(fpr, tpr)

# A dedicated name is used because assigning to `display` would shadow the
# notebook's built-in display() function for all later cells
rocDisplay = metrics.RocCurveDisplay(
    fpr=fpr, tpr=tpr, roc_auc=roc_auc, pos_label='Malignant, 1'
)
rocDisplay.plot()
plt.show()

#**Cross-validation in Python**

`cross_val_score()` estimates model errors with k-fold cross-validation using a specified number of folds and a metric. The default scorer depends on the model, e.g. `r2` for `LinearRegression()`. Note the differences between scorer names and metric functions, such as `neg_mean_squared_error` vs. `mean_squared_error()`. Other pairs, like `r2` and `r2_score()`, match. Find scorer details in the scikit-learn [docs](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html).  

Below, the code loads the bad drivers data, reserves 20% as a test set, and applies 10-fold cross-validation and LOOCV with mean squared error scoring. A leading negative sign undoes the sign flip of `neg_mean_squared_error` in the `cross_val_score()` calls.

In [None]:
# Import packages and functions
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Load the bad drivers data
badDrivers = pd.read_csv('bad-drivers.csv')

# Hold out 20% of the rows as test data; the remaining rows are used for cross-validation
badDriversTrainingdata, testData = train_test_split(badDrivers, test_size=0.20)

# Predictor and response, each as an (n, 1) NumPy array
lossesColumn = 'Losses incurred by insurance companies for collisions per insured driver ($)'
premiumsColumn = 'Car Insurance Premiums ($)'
X = badDriversTrainingdata[[lossesColumn]].to_numpy().reshape(-1, 1)
y = badDriversTrainingdata[[premiumsColumn]].to_numpy().reshape(-1, 1)

In [None]:
# Fit a linear model and compute its predictions on the same data
linModel = LinearRegression().fit(X, y)
yPredicted = linModel.predict(X)

# Plot the observations (black points) and the fitted line (blue)
plt.scatter(X, y, color='black')
plt.plot(X, yPredicted, color='blue', linewidth=1)
plt.xlabel('Losses incurred by insurance companies', fontsize=14)
plt.ylabel('Car insurance premiums', fontsize=14)

In [None]:
# 10-fold cross-validation of the linear model.
# neg_mean_squared_error is the negative MSE, so add a - so the scores are positive.
ten_fold_scores = -cross_val_score(
    linModel, X, y, scoring='neg_mean_squared_error', cv=10
)

In [None]:
# Leave-one-out cross-validation (LOOCV): one fold per training observation.
# len(X) is used instead of a hard-coded fold count so this stays leave-one-out
# whatever the size of the training set.
# neg_mean_squared_error is the negative MSE, so add a - so the scores are positive.
LOOCV_scores = -cross_val_score(
    linModel, X, y, scoring='neg_mean_squared_error', cv=len(X)
)

In [None]:
# Plot the per-fold errors of both schemes side by side:
# 10-fold scores at x = 0 and LOOCV scores at x = 1
tenFoldPositions = np.zeros_like(ten_fold_scores)
loocvPositions = np.zeros_like(LOOCV_scores) + 1
plt.plot(tenFoldPositions, ten_fold_scores, '.')
plt.plot(loocvPositions, LOOCV_scores, '.')
plt.ylabel('Mean squared errors', fontsize=14)
plt.xticks([0, 1], ['10-fold', 'LOOCV'])

#**The bootstrap Method in Python**

`resample()` creates a bootstrap sample when called with `replace=True` and `n_samples` set to the size of the existing dataset. Its parameters are described in the `scikit-learn` [docs](https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html). The out-of-bag sample is the complement of the bootstrap sample, obtained by applying `~` to the mask of dataframe indices that appear in the bootstrap sample.  


The code below fits a linear model to the "bad drivers" dataset and computes the mean squared error for 30 bootstrap samples.

In [None]:
# Import packages and functions
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Load the bad drivers data set from the CSV file 'bad-drivers.csv' (relative path) into a DataFrame
badDrivers = pd.read_csv('bad-drivers.csv')

In [None]:
# Create bootstrap samples and collect errors

# Column names of the predictor and the response, shared by every iteration
lossesColumn = 'Losses incurred by insurance companies for collisions per insured driver ($)'
premiumsColumn = 'Car Insurance Premiums ($)'

bootstrapErrors = []
for i in range(0, 30):
    # Create the bootstrap sample (drawn with replacement, as large as the data set itself)
    # and the out-of-bag sample (the rows whose index never appears in the bootstrap sample)
    boot = resample(badDrivers, replace=True, n_samples=len(badDrivers))
    oob = badDrivers[~badDrivers.index.isin(boot.index)]

    # Fit a linear model to the bootstrap sample
    XBoot = boot[[lossesColumn]].values.reshape(-1, 1)
    yBoot = boot[[premiumsColumn]].values.reshape(-1, 1)
    linModel = LinearRegression()
    linModel.fit(XBoot, yBoot)

    # Predict y values for the out-of-bag sample
    XOob = oob[[lossesColumn]].values.reshape(-1, 1)
    YOob = oob[[premiumsColumn]].values.reshape(-1, 1)
    YOobPredicted = linModel.predict(XOob)

    # Calculate the out-of-bag mean squared error for this bootstrap sample
    bootError = mean_squared_error(YOob, YOobPredicted)
    bootstrapErrors.append(bootError)

In [None]:
# Calculate the mean of the out-of-bag errors over all bootstrap samples
np.mean(bootstrapErrors)

In [None]:
# Calculate the standard deviation of the out-of-bag errors over all bootstrap samples
np.std(bootstrapErrors)

In [None]:
# Plot the errors as points along a single horizontal line (every y value is zero)
zeroHeights = np.zeros_like(bootstrapErrors)
plt.plot(bootstrapErrors, zeroHeights, '.')
plt.xlabel('Bootstrap errors (MSE)', fontsize=14)
# The y axis carries no information here, so its ticks are removed
plt.gca().yaxis.set_ticks([])

#**Model Selection in Python**

The code below loads semiconductor data, assesses the MSE of polynomial regressions (degree 1-6) with 10-fold cross-validation, and plots the MSEs for model selection.

In [None]:
# Import packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the Thurber data
thurber = pd.read_csv('Thurber.csv')

# Hold out 20% of the rows as test data; the remaining rows are used for model selection
thurberTrainingData, test_data = train_test_split(thurber, test_size=0.20)

# Predictor and response, each as an (n, 1) NumPy array
densityFrame = thurberTrainingData[['log(Density)']]
mobilityFrame = thurberTrainingData[['Electron mobility']]
X = densityFrame.to_numpy().reshape(-1, 1)
y = mobilityFrame.to_numpy().reshape(-1, 1)

In [None]:
# Fit a cubic regression model: expand X into the columns [x, x^2, x^3]
# (include_bias=False omits the constant column) and fit a linear model on them
polyFeatures = PolynomialFeatures(degree=3, include_bias=False)
XPoly = polyFeatures.fit_transform(X)
polyModel = LinearRegression()
polyModel.fit(XPoly, y)

# Graph the scatterplot and the polynomial regression
plt.scatter(X, y, color='black')
# Evaluate the fitted curve on an evenly spaced grid covering the range of X
xDelta = np.linspace(X.min(), X.max(), 1000)
# The transformer is already fitted on X, so the plotting grid is only transformed
yDelta = polyModel.predict(polyFeatures.transform(xDelta.reshape(-1, 1)))
plt.plot(xDelta, yDelta, color='blue', linewidth=2)
plt.xlabel('log(Density)', fontsize=14)
plt.ylabel('Electron mobility', fontsize=14)

In [None]:
# Collect the cross-validation mean and standard deviation of the MSE for each polynomial degree
cvMeans = []
cvStdDev = []

for degree in range(1, 7):
    # Build the polynomial features of this degree and fit a linear model on them
    polyFeatures = PolynomialFeatures(degree=degree, include_bias=False)
    XPoly = polyFeatures.fit_transform(X)
    polyModel = LinearRegression().fit(XPoly, y)

    # 10-fold cross-validation; the scorer returns the negative MSE, so the sign is flipped
    negativeMse = cross_val_score(
        polyModel, XPoly, y, scoring='neg_mean_squared_error', cv=10
    )
    polyscore = -negativeMse

    # Summarize the ten fold errors for this degree
    cvMeans.append(polyscore.mean())
    cvStdDev.append(polyscore.std())

In [None]:
# Error-bar chart: mean cross-validation MSE per polynomial degree,
# with the standard deviation across folds as the bar length
degrees = range(1, 7)
plt.errorbar(x=degrees, y=cvMeans, yerr=cvStdDev, marker='o', color='black')
plt.xlabel('Degree of regression polynomial', fontsize=14)
plt.ylabel('Mean squared error', fontsize=14)