# HW1 - sklearn ml - development

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
%matplotlib inline

Reading in the classification housing dataset.

In [None]:
# Load the classification dataset. The path is relative, so the notebook
# must be run from a working directory that contains ./data.
housing_df = pd.read_csv("./data/kc_house_data_classification.csv")

In [None]:
# Summary of the raw frame: columns, non-null counts and dtypes.
housing_df.info()

In [None]:
# Preview the first ten rows.
housing_df.head(10)

## Task 3 - EDA

### Using Sweetviz

In [None]:
import sweetviz

In [None]:
# report = sweetviz.analyze(housing_df)

In [None]:
# report.show_html("output/sweetviz_hw1report.html")

### Using Pandas Profiling

In [None]:
# profile = ProfileReport(housing_df, title="Pandas Profiling Report")

In [None]:
# profile.to_file("output/pandas_profiling_report.html")

In [None]:
# profile

There is a small difference between the two generated reports: Sweetviz suggested that the variable `floors` is categorical, whereas Pandas Profiling treated it as numeric. For the purposes of this homework I will not treat `floors` as a categorical variable in my models.

## Task 4 - Categorical vs. Numeric

Here are the data preprocessing steps I took before developing the models later in this notebook.

First, let's see what the datatypes of our variables are.

In [None]:
# dtype of every column as loaded from the CSV.
housing_df.dtypes

Since all of the variables are read in as numeric, we have to decide explicitly which variables belong in our numeric and categorical lists.

Next, I will convert the following variables `view`, `waterfront`, and `condition` into categorical data using the following code.

In [None]:
# Re-type the integer-coded columns that are really categories.
for cat_col in ("view", "waterfront", "condition"):
    housing_df[cat_col] = housing_df[cat_col].astype("category")

Here is the resulting output for the numeric and categorical variables:

In [None]:
# Split the column names by dtype: 'category' columns vs. every numeric column.
categorical_cols = list(housing_df.select_dtypes(include='category').columns)
numeric_cols = list(housing_df.select_dtypes(include='number').columns)

In [None]:
# Columns that will be one-hot encoded.
categorical_cols

In [None]:
# Numeric columns as selected by dtype (still includes every numeric column at this point).
numeric_cols

In [None]:
# Remove the target from the numeric feature list so it is never passed to the
# preprocessor. Filtering by name (rather than slicing off the last element)
# keeps this cell idempotent: re-running it cannot strip an extra feature.
# NOTE(review): assumes the target is the column at position 18, matching the
# X / y split in this notebook -- confirm against the dataset.
target_col = housing_df.columns[18]
numeric_cols = [col for col in numeric_cols if col != target_col]
numeric_cols

In [None]:
# Positional split: the first 18 columns are the predictors, column 18 is the label.
X, y = housing_df.iloc[:, :18], housing_df.iloc[:, 18]

In [None]:
# Re-check the frame summary after the category casts.
housing_df.info()

In [None]:
# Summary of the feature matrix.
X.info()

In [None]:
# Summary of the target series.
y.info()

## Task 4 - Logistic Regression models

### Pipeline for preprocessing

In [None]:
# Numeric features are standardised; categorical features are one-hot encoded
# with handle_unknown='ignore' so unseen categories do not raise at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Each entry routes a list of columns through its transformer.
column_steps = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# First trial pipeline: L2-penalised logistic regression.
# The saga solver shuffles the data, so random_state is fixed to keep the fit
# reproducible under Restart & Run All.
clf_model = LogisticRegression(penalty='l2', C=1, solver='saga', max_iter=500,
                               random_state=73)

# Append the classifier to the preprocessing step so that scaling/encoding and
# the model are fit together as one estimator.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model)])

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=73
)

# `clf` is the full Pipeline, so fitting it also fits the preprocessing steps.
clf.fit(X_train, y_train)

# Accuracy on each split.
for split_label, split_X, split_y in (("Training", X_train, y_train),
                                      ("Test", X_test, y_test)):
    print(f"{split_label} score: {clf.score(split_X, split_y):.3f}")

In [None]:
# Summary of the training target.
y_train.info()

#### Model 0

In [None]:
# Null model: uses the "most_frequent" strategy as the baseline to beat.
dummy_clf = DummyClassifier(strategy="most_frequent")

In [None]:
# NOTE(review): the baseline is fit (and later scored) on the full X / y,
# whereas the other models in this notebook use the train/test split --
# confirm whether this is intended.
dummy_clf.fit(X, y)

In [None]:
# Baseline predictions for every row of X.
dummy_clf.predict(X)

In [None]:
# Baseline accuracy on the full dataset.
dummy_clf.score(X, y)

#### Model 1

In [None]:
# Model 1 estimator: logistic regression with an L2 (ridge-style) penalty, C=1.0.
# random_state is fixed because the saga solver shuffles the data; without it
# the fitted coefficients can differ from run to run.
clf_model_ridge = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=2000,
                                     random_state=73)

In [None]:
# Model 1 pipeline: shared preprocessing followed by the L2 classifier.
model1_steps = [
    ('preprocessor', preprocessor),
    ('classifier', clf_model_ridge),
]
clf_model1 = Pipeline(steps=model1_steps)

In [None]:
# Train Model 1 on the training split only.
clf_model1.fit(X_train, y_train)

# Report accuracy on both splits.
for split_label, split_X, split_y in (("Training", X_train, y_train),
                                      ("Test", X_test, y_test)):
    print(f"{split_label} score: {clf_model1.score(split_X, split_y):.3f}")

##### Confusion Matrix - M1, Train

In [None]:
# Model 1 on the training split: raw counts first, then rows normalised by
# the true class ('true').
class_names = clf_model1.named_steps['classifier'].classes_
titles_options = [
    ("Confusion matrix for train, without normalization", None),
    ("Normalized confusion matrix for train", 'true'),
]

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        clf_model1,
        X_train,
        y_train,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    # Echo the underlying numbers next to the figure.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

##### Confusion Matrix - M1, Test

In [None]:
# Model 1 on the held-out test split: raw counts, then rows normalised by
# the true class. Titles name the test split so the figures cannot be
# mistaken for the training-set matrices.
titles_options = [("Confusion matrix for test, without normalization", None),
                  ("Normalized confusion matrix for test", 'true')]

class_names = clf_model1['classifier'].classes_

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(clf_model1, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

##### Coefficient Plot

In [None]:
# Shape of the fitted coefficient array.
# NOTE(review): this inspects `clf` (the first trial pipeline), not `clf_model1` -- confirm intended.
clf.named_steps['classifier'].coef_.shape

In [None]:
# Second pipeline step as a (name, estimator) pair.
clf_model1.steps[1]

In [None]:
def coef_plot(model, title=None):
    """Plot the fitted coefficients of a pipeline's final estimator.

    One subplot is drawn per row of ``coef_``, so a binary model yields a
    single panel (no empty second axes) and a multi-class model gets one
    panel per coefficient row.

    Parameters
    ----------
    model : pipeline-like object
        Must expose ``steps`` as a list of ``(name, estimator)`` pairs; the
        last estimator must have a 2-D ``coef_`` attribute.
    title : str, optional
        Title placed above the first panel. Omitted when falsy.

    Returns
    -------
    None
    """
    coefs = model.steps[-1][1].coef_
    n_rows = coefs.shape[0]

    # squeeze=False always yields a 2-D array of axes, so indexing is the
    # same whether there is one coefficient row or several.
    _, axes = plt.subplots(n_rows, sharex=True, squeeze=False)
    axes = axes.ravel()

    if title:
        axes[0].set_title(title)

    for ax, row in zip(axes, coefs):
        ax.plot(row)
        ax.set_ylabel("coefficient")

    axes[-1].set_xlabel("feature index")

In [None]:
# Coefficient profile of Model 1.
coef_plot(clf_model1, 'Model 1 - L2, C=1.0')

#### Model 2

In [None]:
# Model 2: logistic regression with an L1 (lasso-style) penalty, C = 1.0.
# random_state is fixed because the saga solver shuffles the data; without it
# the fit is not reproducible run to run.
clf_model2_lasso = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=2000,
                                      random_state=73)

# Appending classifier to preprocessing pipeline.
clf_model2 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model2_lasso)])

# Fitting model on training data
clf_model2.fit(X_train, y_train)

# Accuracy on both splits
print(f"Training score: {clf_model2.score(X_train, y_train):.3f}")
print(f"Test score: {clf_model2.score(X_test, y_test):.3f}")

##### CM - M2, Train

In [None]:
# Model 2 on the training split: raw counts first, then rows normalised by
# the true class ('true').
class_names = clf_model2.named_steps['classifier'].classes_
titles_options = [
    ("Confusion matrix for train, without normalization", None),
    ("Normalized confusion matrix for train", 'true'),
]

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        clf_model2,
        X_train,
        y_train,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    # Echo the underlying numbers next to the figure.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

##### CM - M2, Test

In [None]:
# Model 2 on the held-out test split: raw counts, then rows normalised by
# the true class. Titles name the test split so the figures cannot be
# mistaken for the training-set matrices.
titles_options = [("Confusion matrix for test, without normalization", None),
                  ("Normalized confusion matrix for test", 'true')]

class_names = clf_model2['classifier'].classes_

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(clf_model2, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

##### Coefficient Plot

In [None]:
# Coefficient profile of Model 2.
coef_plot(clf_model2, 'Model 2 - L1, C=1.0')

Explanation and comparison of Models 1 and 2:

#### Model 3

In [None]:
# Model 3: logistic regression with an L1 (lasso-style) penalty, C = 0.01.
# C is the inverse regularisation strength, so this is a much stronger penalty
# than Model 2. random_state is fixed because the saga solver shuffles the data.
clf_model3_lasso = LogisticRegression(penalty='l1', C=0.01, solver='saga', max_iter=2000,
                                      random_state=73)

# Appending classifier to preprocessing pipeline.
clf_model3 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_model3_lasso)])

# Fitting model on training data
clf_model3.fit(X_train, y_train)

# Accuracy on both splits
print(f"Training score: {clf_model3.score(X_train, y_train):.3f}")
print(f"Test score: {clf_model3.score(X_test, y_test):.3f}")

##### CM - M3, Train

In [None]:
# Model 3 on the training split: raw counts first, then rows normalised by
# the true class ('true').
class_names = clf_model3.named_steps['classifier'].classes_
titles_options = [
    ("Confusion matrix for train, without normalization", None),
    ("Normalized confusion matrix for train", 'true'),
]

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        clf_model3,
        X_train,
        y_train,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    # Echo the underlying numbers next to the figure.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

##### CM - M3, Test

In [None]:
# Model 3 on the held-out test split: raw counts, then rows normalised by
# the true class. Titles name the test split so the figures cannot be
# mistaken for the training-set matrices.
titles_options = [("Confusion matrix for test, without normalization", None),
                  ("Normalized confusion matrix for test", 'true')]

class_names = clf_model3['classifier'].classes_

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(clf_model3, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

##### Coefficient Plot

In [None]:
# Coefficient profile of Model 3.
coef_plot(clf_model3, 'Model 3 - L1, C=0.01')

ANSWER AND EXPLANATION: Does this enforce more or less regularization? Create the same outputs and compare the performance to the first two models. Discuss why the plot looks so different from the previous plots.

#### Model 4 THIS WILL TAKE A LONG TIME TO RUN!!!!!!!

In [None]:
# Model 4: cross-validated search over C for each penalty type, keeping the
# pipeline with the best mean cross-validation accuracy as `clf_model4`.
# (Previously the loop overwrote `clf_model4`, so later cells always used the
# last penalty tried regardless of how it scored.)
l1_ratios = list(np.linspace(0, 1, 5))

model4_candidates = {}   # penalty -> fitted pipeline
model4_cv_scores = {}    # penalty -> best mean CV accuracy

for penalty in ['l1', 'l2', 'elasticnet']:

    # l1_ratios only applies to the elastic-net penalty; passing it for the
    # other penalties has no effect and triggers a warning.
    cv_kwargs = {'l1_ratios': l1_ratios} if penalty == 'elasticnet' else {}

    # random_state fixes the saga solver's data shuffling for reproducibility.
    clf_model4_cv = LogisticRegressionCV(penalty=penalty, Cs=[0.1, 1, 10],
                                         solver='saga', max_iter=2000,
                                         random_state=73, **cv_kwargs)

    # Appending classifier to preprocessing pipeline.
    candidate = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', clf_model4_cv)])

    # Fitting model on training data
    candidate.fit(X_train, y_train)

    # scores_ maps each class to an array whose first axis is the CV fold;
    # average over folds, then take the best hyper-parameter setting.
    fold_scores = candidate['classifier'].scores_
    best_mean = max(np.mean(scores, axis=0).max() for scores in fold_scores.values())

    model4_candidates[penalty] = candidate
    model4_cv_scores[penalty] = best_mean
    print(f"Best mean CV score for {penalty}: {best_mean:.3f} "
          f"(C={candidate['classifier'].C_})")

# Keep the penalty with the highest cross-validated accuracy.
best_penalty = max(model4_cv_scores, key=model4_cv_scores.get)
clf_model4 = model4_candidates[best_penalty]
print(f"Selected penalty: {best_penalty}")

# Output statement
print(f"Training score: {clf_model4.score(X_train, y_train):.3f}")
print(f"Test score: {clf_model4.score(X_test, y_test):.3f}")

##### CM - M4, Train

In [None]:
# Model 4 on the training split: raw counts first, then rows normalised by
# the true class ('true').
class_names = clf_model4.named_steps['classifier'].classes_
titles_options = [
    ("Confusion matrix for train, without normalization", None),
    ("Normalized confusion matrix for train", 'true'),
]

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        clf_model4,
        X_train,
        y_train,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    # Echo the underlying numbers next to the figure.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

##### CM - M4, Test

In [None]:
# Model 4 on the held-out test split: raw counts, then rows normalised by
# the true class. Titles name the test split so the figures cannot be
# mistaken for the training-set matrices.
titles_options = [("Confusion matrix for test, without normalization", None),
                  ("Normalized confusion matrix for test", 'true')]

class_names = clf_model4['classifier'].classes_

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(clf_model4, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

EXPLANATION AND ANSWER: Does regularization help for this problem? The confusion matrices are needed to answer this.

### Task 5 - Decision Tree

In [None]:
# Decision tree fitted directly on the raw feature columns (no preprocessing
# pipeline). min_samples_split=20 stops splitting nodes with fewer than 20 rows.
# random_state is fixed because the tree permutes features at each split, so
# an unseeded fit can differ between runs.
tree_task5 = DecisionTreeClassifier(min_samples_split=20, random_state=73)

# Fit the model using our features and target variables
tree_task5.fit(X_train, y_train)

# Accuracy on both splits
print(f"Training score: {tree_task5.score(X_train, y_train):.3f}")
print(f"Test score: {tree_task5.score(X_test, y_test):.3f}")

In [None]:
# Predicted class labels for the test split; show the first ten
tree_testclasses = tree_task5.predict(X_test)
print(tree_testclasses[:10])

# Class probabilities for the test split - display the first three rows and first two columns
tree_testprobs = tree_task5.predict_proba(X_test)
tree_testprobs[:3, :2]

##### CM - DecisionTree, Train

In [None]:
# Decision tree on the training split: raw counts, then rows normalised by
# the true class. (This section is headed as the training confusion matrix;
# the previous cell body drew a heatmap of test-set probabilities instead.)
titles_options = [("Confusion matrix for train, without normalization", None),
                  ("Normalized confusion matrix for train", 'true')]

# tree_task5 is a bare estimator, so classes_ is read from it directly.
class_names = tree_task5.classes_

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(tree_task5, X_train, y_train,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

##### CM - DecisionTree, Test

In [None]:
# Decision tree on the held-out test split: raw counts, then rows normalised
# by the true class. Titles name the test split.
titles_options = [("Confusion matrix for test, without normalization", None),
                  ("Normalized confusion matrix for test", 'true')]

# tree_task5 is a bare DecisionTreeClassifier, not a Pipeline, so it cannot be
# indexed by step name; classes_ lives on the estimator itself.
class_names = tree_task5.classes_

for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(tree_task5, X_test, y_test,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

    print(title)
    print(disp.confusion_matrix)

plt.show()

In [None]:
# Random forest classifier (seeded); oob_score=True also computes an
# out-of-bag accuracy estimate during fitting.
clf_RF_model_final = RandomForestClassifier(oob_score=True, random_state=0)

# Append classifier to preprocessing pipeline.
clf_RF_final = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_RF_model_final)])

# Fit model on training data
clf_RF_final.fit(X_train, y_train)

# Report training, out-of-bag and test accuracy so the forest can be compared
# with the other models on the same footing.
print(f"Training score: {clf_RF_final.score(X_train, y_train):.3f}")
print(f"OOB score: {clf_RF_final['classifier'].oob_score_:.3f}")
print(f"Test score: {clf_RF_final.score(X_test, y_test):.3f}")

# Make predictions on the test data
clf_RF_final_predictions = clf_RF_final.predict(X_test)
print(clf_RF_final_predictions[:10])  # Print out a few predictions just to see what they look like

ANSWER AND EXPLANATION: Discuss the performance relative to your logistic regression models.

In [None]:
#if 'price_gt_1M' in y:
  # print("Column 'Name' is present in the DataFrame")
#else:
 #  print("Column 'Name' is not present in the DataFrame") 

### Task 6 - Error Exploration

In [None]:
# Task 6 model: L1-penalised logistic regression with C = 1.0 (the same
# hyper-parameters as Model 2). random_state is fixed because the saga solver
# shuffles the data; without it the errors being explored could change between runs.
clf_task6_lasso = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=2000,
                                     random_state=73)

# Append classifier to preprocessing pipeline.
clf_task6 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf_task6_lasso)])

# Fit model on training data
clf_task6.fit(X_train, y_train)

print(f"Training score: {clf_task6.score(X_train, y_train):.3f}")
print(f"Test score: {clf_task6.score(X_test, y_test):.3f}")


# Make predictions on the test data
clf_task6_predictions = clf_task6.predict(X_test)
print(clf_task6_predictions[:10])  # Print out a few predictions just to see what they look like

In [None]:
# Side-by-side accuracy summary: all training scores first, then all test scores.
scored_models = [
    ("M1", clf_model1),
    ("M2", clf_model2),
    ("M3", clf_model3),
    ("M4", clf_model4),
    ("T5", tree_task5),
]

for split_label, split_X, split_y in (("Training", X_train, y_train),
                                      ("Test", X_test, y_test)):
    for tag, fitted_model in scored_models:
        print(f"{tag} - {split_label} score: {fitted_model.score(split_X, split_y):.3f}")