# Classification

![title](./images/Classification_01.JPG)

In [None]:
# Imports
import matplotlib.pyplot as plt  
import numpy as np  
import pandas as pd  
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns 
import sklearn
from sklearn import datasets, metrics
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
import statsmodels.api as sm



## Logistic Regression

### First, import the dataset

In [None]:
# Load the house listings for Memphis (zip code 38118) and Amarillo (79106 area)
# that are used for Logistic Regression.
# Column names are passed explicitly through `names=`.
cols = [
    "SF", "Price", "Beds", "Baths", "Year_Built",
    "Lot_Size_Acres", "Garage_Size", "Stories", "Brick", "City",
]
City_Log_Reg_df = pd.read_csv(
    'data/Logistic_Regression_Memphis_vs_Amarillo.csv',
    names=cols,
)

# Preview the first three rows of the data
print("Overview of data:", City_Log_Reg_df.head(3), sep="\n")



### Let's create a model to classify whether a house is in Memphis (38118) or Amarillo (79106)
* We will do this using Logistic Regression and the model will use all of the attributes from the Multiple Variable Linear Regression section above.
* We will divide our data up into 2 groups - the training group and testing group.  80% will be in training, 20% in testing.

In [None]:
# Separate the features (every column except "City") from the target ("City")
# and convert both to NumPy arrays. The full arrays get their own names so they
# stay distinct from the training subset created below.
X_all = City_Log_Reg_df.drop("City", axis=1).values
y_all = City_Log_Reg_df["City"].values

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric reported later, reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# Train an L2-regularized logistic regression model on the training rows
LogReg = LogisticRegression(penalty='l2')
LogReg.fit(X_train, y_train)



### Make predictions based on the model we just created and see how well it performs.
* The 2 classes are 0 and 1.  0 means the house is in Memphis, 1 means the house is in Amarillo.
* A confusion matrix will show how each house was classified.

In [None]:
# Predict the class and the per-class probabilities for every test house
y_pred = LogReg.predict(X_test)
y_pred_probs = LogReg.predict_proba(X_test)

# Build the confusion matrix (rows = actual class, columns = predicted class).
# It is stored under its own name so that it does not overwrite the
# `confusion_matrix` function imported from sklearn.metrics in the imports cell.
conf_mat = metrics.confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap. The class names are attached to the
# DataFrame and the heatmap is drawn on an explicit Axes, so the tick labels
# come from the data rather than from pyplot calls that the heatmap overrides.
class_names = ["0", "1"]
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(conf_mat, index=class_names, columns=class_names),
            annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion Matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Spell out each cell of the confusion matrix in words
print("\n" + str(conf_mat[0][0]) + " houses that are in Memphis (class 0) were classified as in Memphis.")
print(str(conf_mat[0][1]) + " houses that are in Memphis were classified as in Amarillo.")
print(str(conf_mat[1][0]) + " houses that are in Amarillo (class 1) were classified as in Memphis.")
print(str(conf_mat[1][1]) + " houses that are in Amarillo were classified as in Amarillo.\n")

# Overall accuracy on the held-out test set
print("The accuracy of the logistic regression classifier on the test set is {:.2f}.\n".format(LogReg.score(X_test, y_test)))

# Per-class precision, recall and F1
print(metrics.classification_report(y_test, y_pred))


### Evaluate the AUC
The AUC is the area under the Receiver Operating Characteristic (ROC) curve, which plots the True Positive Rate against the False Positive Rate at each classification threshold. It is a single number, ranging from 0 to 1, that tells how well a model can distinguish between the classes: the higher the AUC, the better the model. An AUC of 0.5 indicates the model is doing no better than random guessing.

In [None]:
# Compute the ROC curve and the AUC for the test set, then plot the curve

# Second column of predict_proba for every test house
# (the probability of the class labelled 1 in this notebook)
y_pred_auc = LogReg.predict_proba(X_test)[:, 1]

# roc_curve also returns the thresholds, which are not needed here
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_auc)
auc = metrics.roc_auc_score(y_test, y_pred_auc)

# Plot the True Positive Rate against the False Positive Rate
roc_layout = dict(
    title="AUC - {:.3f}".format(auc),
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
)
fig = go.Figure(data=go.Scatter(x=fpr, y=tpr))
fig.update_layout(**roc_layout)
fig.show()

print("\nThe AUC is {:.3f}.".format(auc))

 

### Let's look at the actual classifications.  These show which city the house was really in ("City") and the probability that it was in Memphis or Amarillo.

In [None]:
# Print one line per test house: its size, price, true city, the predicted
# probability of each city, and a marker when the predicted class was right.
counter = 0
for features, actual, predicted, probs in zip(X_test, y_test, y_pred, y_pred_probs):
    # Class 0 is Memphis; anything else is Amarillo
    city = "Memphis" if int(actual) == 0 else "Amarillo"

    if predicted == actual:
        outcome = " - CORRECT"
        counter += 1
    else:
        outcome = ""

    print("SF = {:,d}, Price = {:,d}, City = {}, Prob of Memphis = {:.3f}, Prob of Amarillo = {:.3f}{}".format(
        int(features[0]),
        int(features[1]),
        city,
        probs[0],
        probs[1],
        outcome,
    ))

# Fraction of test houses whose predicted class matched the true class
print("\nAccuracy of classifications is {:.2f}.".format(counter / len(y_test)))

### Could cross validation help?

![title](./images/Cross_Validation_01.JPG)
Image from https://en.wikipedia.org/wiki/Cross-validation_(statistics)

Let's try with Memphis vs. Newport Beach, CA - an incredibly expensive city!

In [None]:
# Load the house listings for Memphis and Newport Beach.
# Column names are passed explicitly through `names=`.
cols = [
    "SF", "Price", "Beds", "Baths", "Year_Built",
    "Lot_Size_Acres", "Garage_Size", "Stories", "Brick", "City",
]
City_Log_Reg_df = pd.read_csv(
    'data/Logistic_Regression_Memphis_vs_Newport_Beach.csv',
    names=cols,
)

# Scatter plot of price against square footage. The City column is cast to
# string before being used as the colour so it is treated as a category.
city_as_text = City_Log_Reg_df["City"].astype(str)
fig = px.scatter(City_Log_Reg_df, x="SF", y="Price", color=city_as_text)
fig.update_xaxes(title_text='SF')
fig.update_yaxes(title_text='Price ($)')
fig.update_layout(
    title_text='House prices as a function of square footage in Memphis and Newport Beach',
    title_x=0.5,
)
fig.show()


In [None]:
# End-to-end run for Memphis vs. Newport Beach: preview the data, split it,
# train a logistic regression model, then report the confusion matrix,
# accuracy, classification report, ROC/AUC and the per-house classifications.

# See the first few lines of data
print("Overview of data:")
print(City_Log_Reg_df.head(3))

# Separate the features (every column except "City") from the target ("City")
# and convert both to NumPy arrays
X_all = City_Log_Reg_df.drop("City", axis=1).values
y_all = City_Log_Reg_df["City"].values

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric below, reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# Train the model
LogReg = LogisticRegression(penalty='l2')
LogReg.fit(X_train, y_train)

# Predict the class and the per-class probabilities for every test house
y_pred = LogReg.predict(X_test)
y_pred_probs = LogReg.predict_proba(X_test)

# Build the confusion matrix (rows = actual class, columns = predicted class).
# It is stored under its own name so that it does not overwrite the
# `confusion_matrix` function imported from sklearn.metrics in the imports cell.
conf_mat = metrics.confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap. The class names are attached to the
# DataFrame and the heatmap is drawn on an explicit Axes, so the tick labels
# come from the data rather than from pyplot calls that the heatmap overrides.
class_names = ["0", "1"]
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(conf_mat, index=class_names, columns=class_names),
            annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion Matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Spell out each cell of the confusion matrix in words
print("\n" + str(conf_mat[0][0]) + " houses that are in Memphis (class 0) were classified as in Memphis.")
print(str(conf_mat[0][1]) + " houses that are in Memphis were classified as in Newport Beach.")
print(str(conf_mat[1][0]) + " houses that are in Newport Beach (class 1) were classified as in Memphis.")
print(str(conf_mat[1][1]) + " houses that are in Newport Beach were classified as in Newport Beach.\n")

print("The accuracy of the logistic regression classifier on the test set is {:.2f}.\n".format(LogReg.score(X_test, y_test)))

print(metrics.classification_report(y_test, y_pred))

# Calculate the AUC and draw the ROC curve.
# The second column of predict_proba is used as the score for class 1.
y_pred_auc = y_pred_probs[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_auc)
auc = metrics.roc_auc_score(y_test, y_pred_auc)

fig = go.Figure(data=go.Scatter(x=fpr, y=tpr))
fig.update_layout(title="AUC - {:.3f}".format(auc),
                  xaxis_title="False Positive Rate",
                  yaxis_title="True Positive Rate")
fig.show()

print("\nThe AUC is {:.3f}.".format(auc))

# Print one line per test house: its size, price, true city, the predicted
# probability of each city, and a marker when the predicted class was right.
counter = 0
for features, actual, predicted, probs in zip(X_test, y_test, y_pred, y_pred_probs):
    # Class 0 is Memphis; anything else is Newport Beach
    city = "Memphis" if int(actual) == 0 else "Newport Beach"

    # Judge correctness from the predicted class, the same way the other
    # listing cells do, so this count always agrees with LogReg.score above.
    if predicted == actual:
        outcome = " - CORRECT"
        counter += 1
    else:
        outcome = ""

    print("SF = {:,d}, Price = {:,d}, City = {}, Prob of Memphis = {:.3f}, Prob of Newport Beach = {:.3f}{}".format(
        int(features[0]),
        int(features[1]),
        city,
        probs[0],
        probs[1],
        outcome,
    ))

print("\nAccuracy of classifications is {:.2f}.".format(counter / len(y_test)))



# Using multinomial regression to classify the city a house is located in using the variables from above. The houses can be located in Memphis, Amarillo, or Newport Beach.

### We are using a new dataset.  The dataset has 50 houses for each city.


In [None]:
# Load the house listings for Memphis, Amarillo, and Newport Beach.
# Column names are passed explicitly through `names=`.
cols = [
    "SF", "Price", "Beds", "Baths", "Year_Built",
    "Lot_Size_Acres", "Garage_Size", "Stories", "Brick", "City",
]
Mult_Reg_df = pd.read_csv('data/Multinomial_Regression.csv', names=cols)

# Preview the first three rows of the data
print("Overview of data:", Mult_Reg_df.head(3), sep="\n")



### Let's create a model to classify where the house is located
* We will do this using Multinomial Regression and the model will use all of the attributes from the Logistic Regression section above.
* We will divide our data up into 2 groups - the training group and testing group. 80% will be in training, 20% in testing.
* There are 50 houses for each city.

In [None]:
# Separate the features (every column except "City") from the target ("City")
# and convert both to NumPy arrays. The full arrays get their own names so they
# stay distinct from the training subset created below.
X_all = Mult_Reg_df.drop("City", axis=1).values
y_all = Mult_Reg_df["City"].values

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric reported later, reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# Create the classifier and train the model.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and with the default solver a target with more than
# two classes is already fitted as a multinomial model.
# max_iter is raised well above the default to give the solver room to converge.
MultReg = LogisticRegression(random_state=0, max_iter=7600)
MultReg.fit(X_train, y_train)

### Make predictions based on the model we just created and see how well it performs.
* The 3 classes are 0, 1, and 2.  0 means the house is in Memphis, 1 means the house is in Amarillo, and 2 means the house is in Newport Beach.
* A confusion matrix will show how each house was classified.

In [None]:
# Predict the class and the per-class probabilities for every test house
y_pred = MultReg.predict(X_test)
y_pred_probs = MultReg.predict_proba(X_test)

# Build the confusion matrix (rows = actual class, columns = predicted class).
# It is stored under its own name so that it does not overwrite the
# `confusion_matrix` function imported from sklearn.metrics in the imports cell.
conf_mat = metrics.confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap. The class names are attached to the
# DataFrame and the heatmap is drawn on an explicit Axes, so the tick labels
# come from the data rather than from pyplot calls that the heatmap overrides.
class_names = ["0", "1", "2"]
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(conf_mat, index=class_names, columns=class_names),
            annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.tight_layout()
ax.set_title('Confusion Matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Overall accuracy on the held-out test set
print("The accuracy of the multinomial regression classifier on the test set is {:.2f}.\n".format(MultReg.score(X_test, y_test)))

# Per-class precision, recall and F1
print(metrics.classification_report(y_test, y_pred))


### Let's look at the actual classifications.  These show which city each house was really in and the predicted probability for each of the three cities.

In [None]:
# Print one line per test house: its size, price, true city, the predicted
# probability of each of the three cities, and a marker when the predicted
# class was right.
known_cities = {0: "Memphis", 1: "Amarillo"}

counter = 0
for features, actual, predicted, probs in zip(X_test, y_test, y_pred, y_pred_probs):
    # Any label other than 0 or 1 is reported as Newport Beach
    city = known_cities.get(int(actual), "Newport Beach")

    if predicted == actual:
        outcome = " - CORRECT"
        counter += 1
    else:
        outcome = ""

    print("SF = {:,d}, Price = {:,d}, City = {}, Prob Mem = {:.3f}, Prob Ama = {:.3f}, Prob NB = {:.3f}{}".format(
        int(features[0]),
        int(features[1]),
        city,
        probs[0],
        probs[1],
        probs[2],
        outcome,
    ))

# Fraction of test houses whose predicted class matched the true class
print("\nAccuracy of classifications is {:.2f}.".format(counter / len(y_test)))

# K-nearest neighbor classification (KNN)

* Supervised method to classify data

* The short version is that you place a new data point in space.  Then you find the K closest points, or nearest neighbors.  The new data point is put in the class that the greatest number of neighbors are in.

### Details:
  
* Choose a value of K. Make sure it is an odd number!
* Find the distance of the new point to each of the training data points.
* Find the K nearest neighbors for the new data point.
* Count the number of data points in each category among the K neighbors.  The new data point will belong to the class that has the most neighbors.

* Differs from K-means in that you already know which class each data point is in before you place the new data point.

## Choosing the value of K
![title](./images/KNN_01.JPG)

### Import the Iris dataset

In [None]:
# Fetch the Iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# Show the class (species) names, then the feature names
print("The class names are:", iris.target_names, sep="\n")
print("\nThe feature names are:", iris.feature_names, "\n", sep="\n")

# Features: the first four columns of the data (all 4 features)
X = iris.data[:, :4]
# Target: the type of flower
y = iris.target


### Create a model to classify new flowers

In [None]:
# Hold out 20% of the flowers for testing. A fixed random_state makes the
# split, and therefore the accuracies reported later, reproducible on
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Instantiate the KNeighborsClassifier with an odd K, as recommended in the
# section above: an even K (such as 2) allows the neighbours' votes to be
# split evenly between two classes.
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model
knn.fit(X_train, y_train)


### See how well our classification model performs.

In [None]:
# Predicted class for every flower in the test set
y_pred = knn.predict(X_test)

# Mean accuracy of the classifier on the test set
test_accuracy = knn.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_accuracy))

### Test the new model by making a prediction based on unseen data

In [None]:
# A single unseen sample with four feature values (2-D: one row, four columns)
x_new = np.array([[0.1, 8.6, 2.3, 4.1]])

# Ask the trained KNN model for the class of this sample and look up its name
prediction = knn.predict(x_new)
predicted_species = iris.target_names[prediction[0]]
print("The predicted class for this sample is " + str(predicted_species) + ".")

# NOTE(review): the "correct" class below is hard-coded text, not derived from
# the data - confirm it matches the intended sample.
print("The correct class is setosa.")


### But what is the best K?  Let's loop through some values and find out.

In [None]:
# Train one KNN classifier per value of K and record its test-set accuracy
accuracy_list = []
k_range = range(1, 21)

# Try values of K from 1 to 20
for k in k_range:

    # Instantiate and train the classifier for this K
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Score once and reuse the value for both the list and the printout
    accuracy = knn.score(X_test, y_test)
    accuracy_list.append(accuracy)

    # Show the accuracy of the predictions
    print("Test set accuracy for K={:,d}: {:.2f}".format(k, accuracy))

# Plot the accuracy vs K
plt.plot(k_range, accuracy_list)
plt.xlabel("K")
plt.ylabel("Accuracy")
plt.title("Accuracy for each value of K")
plt.xticks(list(range(0, 21, 2)))
# Call show() (with parentheses) so the figure is actually rendered
plt.show()

# Decision Trees

* Supervised method to classify data
* Data is split continuously according to parameters, forming a classification tree.


### A decision tree consists of:
* Nodes: Test the value of a certain attribute.
* Edges / Branches: Correspond to the outcome of a test and connect to the next node or leaf.
* Leaf nodes: Terminal nodes that predict the outcome (represent class labels or class distribution).

![title](./images/DT_01.JPG)

### Import the Iris dataset

In [None]:
# Fetch the Iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# Print the class names and then the feature names, each under its own heading
for heading, names in (
    ("The class names are:", iris.target_names),
    ("\nThe feature names are:", iris.feature_names),
):
    print(heading)
    print(names)
print("\n")

# Features: the first four columns of the data (all 4 features)
X = iris.data[:, :4]
# Target: the type of flower
y = iris.target


### Create a model to classify new flowers

In [None]:
# Hold out 20% of the flowers for testing. A fixed random_state makes the
# split reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a decision tree classifier. The tree is seeded too, so the fitted
# tree (and the drawing of it) is the same on every run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

# Mean accuracy on the test set (shown as the cell output)
dtc.score(X_test, y_test)



### Draw the actual decision tree

In [None]:
# Create the decision tree visual
plt.figure(figsize=(25,10))
a = plot_tree(dtc, 
              feature_names=iris.feature_names, 
              class_names=iris.target_names, 
              impurity=False,
              filled=True, 
              rounded=True, 
              fontsize=14)

When building a decision tree classifier, you can set many parameters, such as:
* max_depth - The maximum depth of the tree
* min_samples_split - The minimum number of samples required to split a node.
* min_samples_leaf - The minimum number of samples required to be at a leaf node.

Credits
* https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
* www.plotly.com
* https://www.theanalysisfactor.com/assessing-the-fit-of-regression-models/
* https://www.statisticshowto.com/rmse/
* https://dss.princeton.edu/online_help/analysis/interpreting_regression.htm#coefficients
* All the house information came from www.zillow.com
* https://towardsdatascience.com/knn-using-scikit-learn-c6bed765be75
* https://medium.com/datadriveninvestor/k-nearest-neighbors-knn-7b4bd0128da7
* https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn
* https://www.kaggle.com/crowemi/iris-classification-k-nearest-neighbors
* https://www.datasciencecentral.com/profiles/blogs/classification-and-regression-trees
* https://towardsdatascience.com/decision-tree-classification-de64fc4d5aac
* https://towardsdatascience.com/how-to-visualize-a-decision-tree-in-5-steps-19781b28ffe2
