# Project 7

- [Report](https://docs.google.com/document/d/1d6JuigRGQrC9244Y_fzWR2EBKznfnKVnQH_Bf-OEOwI/edit?usp=sharing)
- [Slides](https://docs.google.com/presentation/d/1qbXJJV9wEzjcOUMc-ESLRDamedZsxWrcTiVqrR-AlJ8/edit?usp=sharing)
- [Dataset](https://www.kaggle.com/datasets/jonathanpilafas/2024-march-madness-statistical-analysis)

## Setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random
from sklearn.tree import export_graphviz
from IPython.display import SVG
from graphviz import Source
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

## Exploring/Cleaning Data

### Collect Utah and Colorado housing data

In [None]:
df = pd.read_csv('realtor-data.csv')
df = df[df['state'].isin(['Utah', 'Colorado'])]
df = df.dropna()
df = df.drop(df[df.status == 'sold'].index)
df = df.reset_index(drop=True)
df

### Ensure that the two states have similar data to each other and remove outliers

In [None]:
display(df.describe())
display(df[df.state == 'Utah'].describe())
display(df[df.state == 'Colorado'].describe())

In [None]:
outliers = df[df.price > df.price.mean() + 2 * df.price.std()]
outliers

In [None]:
df.drop(outliers.index)

In [None]:
display(df.describe())
display(df[df.state == 'Utah'].describe())
display(df[df.state == 'Colorado'].describe())

In [None]:
sns.displot(data=df, x='price', hue='state', kind='kde', common_norm=False, height=6, aspect=1.5)
plt.axvline(df[df.state == 'Colorado'].price.mean(), color='green', linestyle='dashed', linewidth=2, label='Colorado Mean')
plt.axvline(df[df.state == 'Colorado'].price.median(), color='blue', linestyle='dashed', linewidth=2, label='Colorado Mode')
plt.axvline(df[df.state == 'Utah'].price.mean(), color='red', linestyle='dashed', linewidth=2, label='Utah Mean')
plt.axvline(df[df.state == 'Utah'].price.median(), color='orange', linestyle='dashed', linewidth=2, label='Utah Mode')
plt.legend()
plt.title('Distribution of House Prices')
plt.xlabel('Price ($)')

In [None]:
sns.displot(data=df, x='price', hue='state', kind='kde', common_norm=False, height=6, aspect=1.5)
plt.axvline(df[df.state == 'Colorado'].price.mean(), color='green', linestyle='dashed', linewidth=2, label='Colorado Mean')
plt.axvline(df[df.state == 'Colorado'].price.median(), color='blue', linestyle='dashed', linewidth=2, label='Colorado Mode')
plt.axvline(df[df.state == 'Utah'].price.mean(), color='red', linestyle='dashed', linewidth=2, label='Utah Mean')
plt.axvline(df[df.state == 'Utah'].price.median(), color='orange', linestyle='dashed', linewidth=2, label='Utah Mode')
plt.legend()
plt.xlim((0, 2000000))
plt.title('Distribution of House Prices')
plt.xlabel('Price ($)')

### Prepare the data for prediction

In [None]:
df['price_range'] = pd.qcut(df['price'], 4)
df['price_range_encoded'] = pd.qcut(df['price'], 4, labels=False)
df

In [None]:
df['utah'] = df.apply(lambda row: 1 if row.state == 'Utah' else 0, axis=1)
df

In [None]:
df['prev_sold_date'] = pd.to_datetime(df['prev_sold_date'])
df['days_since_last_sale'] = (pd.Timestamp.today() - df['prev_sold_date']).dt.days
df

## Decision Trees

### Depth of 3

#### Training on whole dataset

In [168]:
# create a list of feature columns
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah', 'days_since_last_sale']

# define X and y
X = df[feature_cols]
y = df.price_range_encoded

# fit a classification tree on all data
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                                    feature_names=feature_cols,
                                    class_names=['0', '1', '2', '3'], filled = True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

y_pred = treeclf.predict(X)

p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})

digraph Tree {
node [shape=box, style="filled", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="house_size <= 2938.5\ngini = 0.75\nsamples = 15192\nvalue = [3915.0, 3725.0, 3754.0, 3798.0]\nclass = 0", fillcolor="#fffefd"] ;
1 [label="house_size <= 1613.5\ngini = 0.696\nsamples = 9842\nvalue = [3823.0, 3271.0, 1823.0, 925.0]\nclass = 0", fillcolor="#fdf4ee"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="acre_lot <= 0.745\ngini = 0.49\nsamples = 3386\nvalue = [2302, 681, 239, 164]\nclass = 0", fillcolor="#efb388"] ;
1 -> 2 ;
3 [label="gini = 0.46\nsamples = 3039\nvalue = [2137, 612, 188, 102]\nclass = 0", fillcolor="#efb083"] ;
2 -> 3 ;
4 [label="gini = 0.681\nsamples = 347\nvalue = [165, 69, 51, 62]\nclass = 0", fillcolor="#f6d3bb"] ;
2 -> 4 ;
5 [label="house_size <= 2145.5\ngini = 0.709\nsamples = 6456\nvalue = [1521, 2590, 1584, 761]\nclass = 1", fillcolor="#d9fad6"] ;
1 -> 5 ;
6 [label="gini = 0.68\nsamples = 3022\nvalue = [



'precision = [0.67985824 0.4011772  0.57587399 0.80560612]'

'recall = [0.58799489 0.69530201 0.39930741 0.58267509]'

'f-score = [0.63059855 0.50879088 0.4716061  0.67624141]'

Unnamed: 0,feature,importance
0,bed,0.0
1,bath,0.0
2,acre_lot,0.058994
3,house_size,0.941006
4,utah,0.0
5,days_since_last_sale,0.0


#### Monte Carlo cross-validation

In [None]:
# create a list of feature columns
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah', 'days_since_last_sale']

# define X and y
X = df[feature_cols]
y = df.price_range_encoded

scores = {'p': [], 'r': [], 'f': []}
for _ in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

    # fit a classification tree on all data
    treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
    treeclf.fit(X_train, y_train)

    y_pred = treeclf.predict(X_test)

    p,r,f,s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1, 2, 3])
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

avgP = 0
for pre in scores['p']:
    avgP += pre
avgP /= len(scores['p'])

avgR = 0
for re in scores['r']:
    avgR += re
avgR /= len(scores['r'])

avgF = 0
for f in scores['f']:
    avgF += f
avgF /= len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

### Depth of 5

#### Training on whole dataset

In [169]:
# create a list of feature columns
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah', 'days_since_last_sale']

# define X and y
X = df[feature_cols]
y = df.price_range_encoded

# fit a classification tree on all data
treeclf = DecisionTreeClassifier(max_depth=5, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                                    feature_names=feature_cols,
                                    class_names=['0', '1', '2', '3'], filled = True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

y_pred = treeclf.predict(X)

p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})

digraph Tree {
node [shape=box, style="filled", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="house_size <= 2938.5\ngini = 0.75\nsamples = 15192\nvalue = [3915.0, 3725.0, 3754.0, 3798.0]\nclass = 0", fillcolor="#fffefd"] ;
1 [label="house_size <= 1613.5\ngini = 0.696\nsamples = 9842\nvalue = [3823.0, 3271.0, 1823.0, 925.0]\nclass = 0", fillcolor="#fdf4ee"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="acre_lot <= 0.745\ngini = 0.49\nsamples = 3386\nvalue = [2302, 681, 239, 164]\nclass = 0", fillcolor="#efb388"] ;
1 -> 2 ;
3 [label="house_size <= 1272.5\ngini = 0.46\nsamples = 3039\nvalue = [2137, 612, 188, 102]\nclass = 0", fillcolor="#efb083"] ;
2 -> 3 ;
4 [label="utah <= 0.5\ngini = 0.377\nsamples = 1542\nvalue = [1193, 225, 77, 47]\nclass = 0", fillcolor="#eca26d"] ;
3 -> 4 ;
5 [label="gini = 0.427\nsamples = 1166\nvalue = [855, 204, 69, 38]\nclass = 0", fillcolor="#edaa79"] ;
4 -> 5 ;
6 [label="gini = 0.188\nsamples = 376



'precision = [0.64152411 0.44990099 0.5563151  0.76369863]'

'recall = [0.63218391 0.60993289 0.45524774 0.64586625]'

'f-score = [0.63681976 0.51783476 0.50073249 0.69985735]'

Unnamed: 0,feature,importance
0,bed,0.0
1,bath,0.052985
2,acre_lot,0.094323
3,house_size,0.844411
4,utah,0.006631
5,days_since_last_sale,0.00165


#### Monte Carlo cross-validation

In [167]:
# create a list of feature columns
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah', 'days_since_last_sale']

# define X and y
X = df[feature_cols]
y = df.price_range_encoded

scores = {'p': [], 'r': [], 'f': []}
for _ in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

    # fit a classification tree on all data
    treeclf = DecisionTreeClassifier(max_depth=5, random_state=1)
    treeclf.fit(X_train, y_train)

    y_pred = treeclf.predict(X_test)

    p,r,f,s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1, 2, 3])
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

avgP = 0
for pre in scores['p']:
    avgP += pre
avgP /= len(scores['p'])

avgR = 0
for re in scores['r']:
    avgR += re
avgR /= len(scores['r'])

avgF = 0
for f in scores['f']:
    avgF += f
avgF /= len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

'precision = [0.61506497 0.44388534 0.54552542 0.7707855 ]'

'recall = [0.68553899 0.54109085 0.45792931 0.62252932]'

'f-score = [0.64407598 0.48396581 0.49660558 0.68796472]'

### Testing the most important feature only

In [176]:
# create a list of feature columns
feature_cols = ['house_size']

# define X and y
X = df[feature_cols]
y = df.price_range_encoded

# fit a classification tree on all data
treeclf = DecisionTreeClassifier(max_depth=5, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                                    feature_names=feature_cols,
                                    class_names=['0', '1', '2', '3'], filled = True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

y_pred = treeclf.predict(X)

p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

digraph Tree {
node [shape=box, style="filled", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="house_size <= 2938.5\ngini = 0.75\nsamples = 15192\nvalue = [3915.0, 3725.0, 3754.0, 3798.0]\nclass = 0", fillcolor="#fffefd"] ;
1 [label="house_size <= 1613.5\ngini = 0.696\nsamples = 9842\nvalue = [3823.0, 3271.0, 1823.0, 925.0]\nclass = 0", fillcolor="#fdf4ee"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="house_size <= 1272.5\ngini = 0.49\nsamples = 3386\nvalue = [2302, 681, 239, 164]\nclass = 0", fillcolor="#efb388"] ;
1 -> 2 ;
3 [label="house_size <= 704.5\ngini = 0.418\nsamples = 1712\nvalue = [1275, 254, 105, 78]\nclass = 0", fillcolor="#eda774"] ;
2 -> 3 ;
4 [label="house_size <= 569.5\ngini = 0.161\nsamples = 127\nvalue = [116, 8, 3, 0]\nclass = 0", fillcolor="#e78d4b"] ;
3 -> 4 ;
5 [label="gini = 0.046\nsamples = 42\nvalue = [41, 1, 0, 0]\nclass = 0", fillcolor="#e6843e"] ;
4 -> 5 ;
6 [label="gini = 0.213\nsamples = 85\nva



'precision = [0.61978928 0.40738709 0.52530541 0.81219904]'

'recall = [0.69118774 0.59812081 0.4009057  0.53291206]'

'f-score = [0.65354426 0.48466391 0.45475147 0.64356121]'

In [175]:
# create a list of feature columns
feature_cols = ['house_size']

# define X and y
X = df[feature_cols]
y = df.price_range_encoded

# fit a classification tree on all data
treeclf = DecisionTreeClassifier(max_depth=4, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                                    feature_names=feature_cols,
                                    class_names=['0', '1', '2', '3'], filled = True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

y_pred = treeclf.predict(X)

p,r,f,s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

digraph Tree {
node [shape=box, style="filled", color="black", fontname="helvetica"] ;
edge [fontname="helvetica"] ;
0 [label="house_size <= 2938.5\ngini = 0.75\nsamples = 15192\nvalue = [3915.0, 3725.0, 3754.0, 3798.0]\nclass = 0", fillcolor="#fffefd"] ;
1 [label="house_size <= 1613.5\ngini = 0.696\nsamples = 9842\nvalue = [3823.0, 3271.0, 1823.0, 925.0]\nclass = 0", fillcolor="#fdf4ee"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="house_size <= 1272.5\ngini = 0.49\nsamples = 3386\nvalue = [2302, 681, 239, 164]\nclass = 0", fillcolor="#efb388"] ;
1 -> 2 ;
3 [label="house_size <= 704.5\ngini = 0.418\nsamples = 1712\nvalue = [1275, 254, 105, 78]\nclass = 0", fillcolor="#eda774"] ;
2 -> 3 ;
4 [label="gini = 0.161\nsamples = 127\nvalue = [116, 8, 3, 0]\nclass = 0", fillcolor="#e78d4b"] ;
3 -> 4 ;
5 [label="gini = 0.435\nsamples = 1585\nvalue = [1159.0, 246.0, 102.0, 78.0]\nclass = 0", fillcolor="#eda978"] ;
3 -> 5 ;
6 [label="house_size <= 1450.5\ngini = 0.55\



'precision = [0.61978928 0.40704894 0.52311436 0.81237364]'

'recall = [0.69118774 0.59838926 0.4009057  0.52896261]'

'f-score = [0.65354426 0.48451255 0.45392852 0.64072716]'

## Neural Networks