# **K-nearest Neighbors Classification in Python**

The Python code below classifies beans by the bean's major and minor axis lengths.

In [None]:
# Import needed packages for classification
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Import packages for visualization of results
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from matplotlib.colors import ListedColormap

# Import packages for evaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Load the dry bean data set and normalize the capitalization of the class names

beans = pd.read_csv(
    "https://raw.githubusercontent.com/mh2t/DS5110/main/data/Dry_Bean_Dataset.csv"
).assign(Class=lambda frame: frame["Class"].str.capitalize())
print(beans.shape)
beans.describe()

In [None]:
# Initialize model: predictions are based on the 5 nearest training beans
beanKnnClassifier = KNeighborsClassifier(n_neighbors=5)
# Split data: the two axis lengths are the inputs, the bean variety is the output
X = beans[["MajorAxisLength", "MinorAxisLength"]]
y = beans[["Class"]]
# Hold out 30% of the beans for testing. Fixing random_state makes the split,
# and every metric computed from it, identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=20220509
)

In [None]:
# Standardize the inputs: learn the scaling from the training set, then apply it
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Train the model on the scaled training inputs, then make predictions for the test set.
# The test inputs go through the scaler fitted on the training set so both use the same scaling.
# np.ravel flattens the one-column y_train data frame into a 1-D array of labels.
beanKnnClassifier.fit(X_train_scaled, np.ravel(y_train))
y_pred = beanKnnClassifier.predict(scaler.transform(X_test))

In [None]:
# Classify a single new bean; its features pass through the same scaler as the training data
bean = pd.DataFrame([[400, 200]], columns=["MajorAxisLength", "MinorAxisLength"])
beanKnnClassifier.predict(scaler.transform(bean))

In [None]:
# Compute metrics. scikit-learn metrics take the true labels first and the predictions second,
# so the confusion matrix has the true classes on its rows and the predicted classes on its columns.
# np.ravel flattens the one-column y_test data frame into a 1-D array of labels.
print(metrics.accuracy_score(np.ravel(y_test), y_pred))
print(metrics.confusion_matrix(np.ravel(y_test), y_pred))

## **Visualizing the Regions of Classification**

In [None]:
# Define function for the plot.


def plot_classification_regions(X, y, classifier, le, with_data=False):
    """Draw the regions of the feature plane that a fitted classifier assigns to each class.

    The classifier is evaluated on a regular grid (about 200 steps in each
    direction) that covers the range of X plus a margin of 10 units on every
    side, and the predicted classes are drawn as filled contours.

    Parameters
    ----------
    X : two feature data frame; the first column is drawn on the x axis and
        the second column on the y axis.
    y : output feature (class labels) for the rows of X. Only used when
        with_data is True.
    classifier : model that has been fit. The grid is passed to
        classifier.predict in the units of X, so the model must have been fit
        on these two features in the same units.
    le : fitted label encoder. Its integer codes select the color of each
        class and its classes_ provide the legend labels.
    with_data : if True, plot the data on top of the regions and add a legend.
    """

    # Predict class on a regular grid

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X.iloc[:, 0].min() - 10, X.iloc[:, 0].max() + 10
    y_min, y_max = X.iloc[:, 1].min() - 10, X.iloc[:, 1].max() + 10
    xh = (x_max - x_min) / 200  # step size in the mesh for the x direction
    yh = (y_max - y_min) / 200  # step size in the mesh for the y direction

    xx, yy = np.meshgrid(np.arange(x_min, x_max, xh), np.arange(y_min, y_max, yh))
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    # Get outputs ready for plotting: class labels -> integer codes, laid out on the grid
    Z = le.transform(Z)
    Z = Z.reshape(xx.shape)
    numClasses = len(le.classes_)
    # Plot the regions classified as different classes.
    # The levels sit halfway between consecutive integer codes so each code gets its own band.
    plt.figure(figsize=(8, 6))
    plt.contourf(
        xx,
        yy,
        Z,
        levels=[i - 0.5 for i in range(numClasses + 1)],
        cmap=ListedColormap(
            sns.color_palette("colorblind", as_cmap=False, n_colors=numClasses)
        ),
    )

    if with_data:
        p1 = sns.scatterplot(
            data=X,
            x=X.columns[0],
            y=X.columns[1],
            hue=le.transform(np.ravel(y)),
            palette="colorblind",
            alpha=1,
            edgecolor="black",
            style=le.transform(np.ravel(y)),
        )
        leg = p1.legend()
        leg.set_title("Variety")
        # Replace the integer codes in the legend with the class names.
        # Use the encoder's own number of classes rather than a fixed count so
        # the function works for any number of classes.
        for t, label in zip(leg.texts, le.inverse_transform(range(numClasses))):
            t.set_text(label)

In [None]:
# Take a sample to keep runtime low while seeing what areas are classified as each bean.
# random_state is fixed so the same 750 beans are drawn on every run.
beanSample = beans.sample(750, random_state=20220509)
beanSample.describe()

In [None]:
# Create a label encoder so colors match between plots:
# it gives each class name in the sample an integer code, and the plotting function
# uses those codes to pick the colors.
le = preprocessing.LabelEncoder()
le.fit(beanSample["Class"])

In [None]:
# Define input and output features
X = beanSample[["MajorAxisLength", "MinorAxisLength"]]
y = beanSample[["Class"]]

# Fit model on the unscaled sample, so the plotted regions are in the original units of X.
# NOTE: this refits beanKnnClassifier, replacing the model trained on the scaled training set above.
beanKnnClassifier.fit(X, np.ravel(y))

In [None]:
# plot_classification_regions is defined in the first cell in this section.
# Set with_data=True to also draw the sampled beans on top of the regions.
plot_classification_regions(X, y, beanKnnClassifier, le, with_data=False)

# **K-nearest Neighbors Regression in Python**

The Python code below predicts a bean's convex area from the bean's major and minor axis lengths.

In [None]:
# Import needed packages for regression
from sklearn.neighbors import KNeighborsRegressor
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Import packages for visualization of results
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from matplotlib.colors import ListedColormap

# Import packages for evaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Read data. The class names are left as they are in the file
# because the Class column is not used in this section.

beans = pd.read_csv("https://raw.githubusercontent.com/mh2t/DS5110/main/data/Dry_Bean_Dataset.csv")
print(beans.shape)
beans.describe()

In [None]:
# Initialize model: predictions are based on the 5 nearest training beans
beanKnnRegressor = KNeighborsRegressor(n_neighbors=5)
# Split data: the two axis lengths are the inputs, the convex area is the output
X = beans[["MajorAxisLength", "MinorAxisLength"]]
y = beans[["ConvexArea"]]
# Hold out 30% of the beans for testing. Fixing random_state makes the split,
# and every metric computed from it, identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=20220509
)

In [None]:
# Standardize the inputs: learn the scaling from the training set, then apply it
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Fit model and predict on test data.
# Both steps use the standardized inputs: k-nearest neighbors is distance based, so the model
# is trained on the scaled training set and queried with test inputs transformed by the same scaler.
# np.ravel flattens the one-column y_train data frame into a 1-D array of targets.
beanKnnRegressor.fit(X_train_scaled, np.ravel(y_train))
y_pred = beanKnnRegressor.predict(scaler.transform(X_test))

In [None]:
# Plot the actual value vs. the predicted value.
# The axes are labeled so the figure can be read on its own.
plt.scatter(x=y_test, y=y_pred)
plt.xlabel("Actual ConvexArea")
plt.ylabel("Predicted ConvexArea")
plt.show()

In [None]:
# Compute metrics. scikit-learn metrics take the true values first and the predictions second.
# The order matters for r2_score, which measures the variation of the true values explained
# by the predictions and is not symmetric in its arguments.
print(metrics.mean_squared_error(y_test, y_pred))
print(metrics.r2_score(y_test, y_pred))

# **Naive Bayes Classification with Python**

The naive Bayes classifier is initialized with `NBModel = MultinomialNB()`. The text for the model must first be converted to counts, for example with `CountVectorizer(ngram_range = (1,2))`, which counts single words and word pairs. The parameters for both can be found in the [sklearn docs](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html). After initialization, fit the model with `NBModel.fit(X, y)`, where `X` is the output of the fitted vectorizer and `y` is the corresponding vector of classes.  

The Python code below builds a naive Bayes model that predicts whether a text message is spam or not (ham).



In [None]:
# Import packages and functions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Read in the data and check. The data is tab separated, so the separator is given explicitly.
# File does not include column headers so they are provided via names.
messages = pd.read_csv(
    'https://raw.githubusercontent.com/mh2t/DS5110/main/data/SMSSpamCollection.csv',
    sep='\t',
    names=['Class', 'Message'],
)
messages.head()

In [None]:
# Split into testing and training sets.
# The message text is the input and the class (spam or ham) is the output;
# random_state is fixed so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    messages['Message'], messages['Class'], random_state=20220530
)

In [None]:
# Learn the words that appear in the training messages.
# ngram_range=(1, 1) means only single words are used; (1, 2) would also include word pairs.
vectorizer = CountVectorizer(ngram_range=(1, 1))
vectorizer.fit(X_train)
# Uncomment the line below to see the words.
#vectorizer.vocabulary_

In [None]:
# Count the words in each training message and store the counts in a matrix,
# using the vectorizer fitted above. The last line displays a summary of the matrix.
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized

In [None]:
# Initialize the model and fit with the training data:
# the word counts are the inputs and the spam/ham classes are the outputs.
NBmodel = MultinomialNB()
NBmodel.fit(X_train_vectorized, Y_train)

In [None]:
# Make predictions on the training and testing sets.
# Both sets are transformed with the vectorizer that was fitted on the training messages.
trainPredictions = NBmodel.predict(vectorizer.transform(X_train))
testPredictions = NBmodel.predict(vectorizer.transform(X_test))

In [None]:
# How does the model work on the training set?
# True classes are passed first and predictions second.
confusion_matrix(Y_train, trainPredictions)

In [None]:
# Display that in terms of proportions:
# normalize='true' expresses each count as a fraction of the messages in its true class.
confusion_matrix(Y_train, trainPredictions, normalize='true')

99.7% of real messages are classified correctly.
Just over 3% of spam messages are thought to be real.

In [None]:
# How does the model work on the test set?
# As above, the values are proportions of each true class.
confusion_matrix(Y_test, testPredictions, normalize='true')

About 7.5% of spam messages are classified as real in the test data, and only 0.4% of real messages are classified as spam.

In [None]:
# Predict some phrases. Add your own.
# New text must go through the same fitted vectorizer before it is given to the model.
NBmodel.predict(
    vectorizer.transform(
        ["Big sale today! Free cash.",
        "I'll be there in 5"]))

# **Support Vector Machine Classification in Python**  


Initializing a linear support vector machine classifier with `SVC(C = 10, kernel = 'linear')` sets the slope of the hinge loss function to 10. The rest of the parameters and their values can be found in the [scikit-learn docs](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html). After initializing, fit the model with `model.fit(X, y)`, where `X` is the input dataframe and `y` is the class vector. Scale `X` with `StandardScaler()` before fitting because SVMs rely on distances.  

The Python code below predicts a penguin's species based on a penguin's measurements using several kernels.

In [None]:
# Load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [None]:
# Load and view data: the penguins data set is loaded by name through seaborn
penguins = sns.load_dataset('penguins')
penguins

In [None]:
# Remove the penguins with missing data (rows without a body mass value are dropped)
penguinsClean = penguins.dropna(subset=['body_mass_g'])

In [None]:
# Only the numeric measurements are used as inputs.
# Categorical values could be encoded as dummy variables.

X = penguinsClean.loc[
    :, ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
]
Y = penguinsClean.loc[:, 'species']

# Training and testing sets; random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=20220621)

# SVM depends on distances, so differences in scale between the inputs matter.
# The scaling is learned from the training set and then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## **Linear SVM**

In [None]:
# Define and fit the model.
# Adjust C from 0.01 to 100 by changing the number of decimal places or zeros.
# C controls the slope of the hinge function. Larger values penalize misclassified
# training points more heavily, so misclassification becomes less frequent.

penguinsSVMlinear = svm.SVC(kernel='linear', C=0.01)
penguinsSVMlinear.fit(X_train_scaled, Y_train)

In [None]:
# Predict for the test set with the linear model (the test inputs were scaled with the training scaler)
Y_pred = penguinsSVMlinear.predict(X_test_scaled)

In [None]:
# Display the confusion matrix for the linear model (true species first, predictions second)
confusion_matrix(Y_test, Y_pred)

## **Radial Basis Function (RBF)**  




In [None]:
# Adjust the number of decimal places in gamma and C.
# gamma affects the distance over which a point has influence:
# smaller values of gamma allow the influence to spread more.

penguinsSVMrbf = svm.SVC(kernel='rbf', C=10, gamma=0.01)
penguinsSVMrbf.fit(X_train_scaled, Y_train)

In [None]:
# Predict for the test set with the RBF model (this overwrites Y_pred from the previous model)
Y_pred = penguinsSVMrbf.predict(X_test_scaled)

In [None]:
# Display the confusion matrix for the RBF model (true species first, predictions second)
confusion_matrix(Y_test, Y_pred)

In [None]:
# Polynomial kernel.
# Adjust the number of decimal places in C and change degree by steps of 1.
# degree sets the degree of the polynomial used for the kernel.

penguinsSVMpoly = svm.SVC(kernel='poly', C=0.1, degree=5)
penguinsSVMpoly.fit(X_train_scaled, Y_train)

In [None]:
# Predict for the test set with the polynomial model (this overwrites Y_pred from the previous model)
Y_pred = penguinsSVMpoly.predict(X_test_scaled)

In [None]:
# Display the confusion matrix for the polynomial model (true species first, predictions second)
confusion_matrix(Y_test, Y_pred)

## **Accessing Information**


In [None]:
# The number of support vectors for each class (RBF model)
penguinsSVMrbf.n_support_

In [None]:
# Which instances in the training set are support vectors (RBF model)
penguinsSVMrbf.support_

In [None]:
# The coefficients of the hyperplanes for each pair of classes. Each row describes a hyperplane of the form
# coefficient1*variable1 + coefficient2*variable2 + ... + intercept = 0,
# where the variables are the scaled inputs the model was fit on.
penguinsSVMlinear.coef_

In [None]:
# The intercept of the hyperplanes for each pair of classes (linear model).
penguinsSVMlinear.intercept_