# Project 7

- [Report](https://docs.google.com/document/d/1pSs_wpHYjo1OZ9z9QwRaCo_tz4DWXqKcrAWFAfhqMJg/edit?usp=sharing)
- [Slides](https://docs.google.com/presentation/d/1DMBQFvKbaLcGgou52KNbLoPdnhCPeLiOwgPv8jg1U-E/edit?usp=sharing)
- [Dataset](https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset)

## Setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

## Exploring/Cleaning Data

### Collect Utah and Colorado housing data

In [None]:
# Load the realtor listings, keep only the two states under study, and discard
# every row that has a missing value in any column.
df = (
    pd.read_csv('realtor-data.csv')
    .loc[lambda listings: listings['state'].isin(['Utah', 'Colorado'])]
    .dropna()
)

# Report the most recent previous-sale date among the remaining rows.
latest_sale = pd.to_datetime(df["prev_sold_date"]).max().date()
print(f'This dataset includes houses sold as recently as {latest_sale}')

# Disabled step: removing listings whose status is 'sold'.
# df = df.drop(df[df.status == 'sold'].index)

# Renumber the rows 0..n-1 now that filtering has left gaps in the index.
df = df.reset_index(drop=True)
df

In [None]:
# Oldest previous-sale date present in the filtered data.
earliest_sale = pd.to_datetime(df["prev_sold_date"]).min().date()
print(f'This dataset includes houses sold as long ago as {earliest_sale}')

### Ensure that the two states have similar data to each other and remove outliers

In [None]:
# Summary statistics for the combined data, then for each state separately,
# so the two states can be compared side by side.
display(df.describe())
for state_name in ('Utah', 'Colorado'):
    display(df[df.state == state_name].describe())

In [None]:
# A listing counts as an outlier when its price lies more than two standard
# deviations above the mean price (only the high side is checked).
price_cutoff = df.price.mean() + 2 * df.price.std()
outliers = df[df.price > price_cutoff]
outliers

In [None]:
# Remove the high-price outliers and renumber the rows. drop=True discards the
# old index instead of inserting it as an extra 'index' column, which would
# otherwise show up in the summaries and in every later display of df.
df = df.drop(outliers.index).reset_index(drop=True)
df

In [None]:
# Same summaries as before, now with the outliers removed: overall first,
# then one table per state.
display(df.describe())
for state_name in ('Utah', 'Colorado'):
    display(df[df.state == state_name].describe())

In [None]:
# Kernel-density estimate of price per state; common_norm=False normalises each
# state's curve on its own so the two shapes are directly comparable.
sns.displot(data=df, x='price', hue='state', kind='kde', common_norm=False, height=6, aspect=1.5)

# Price series per state, reused for the vertical marker lines.
colorado_prices = df[df.state == 'Colorado'].price
utah_prices = df[df.state == 'Utah'].price

# Mark mean and median of each state. The median lines are labelled "Median"
# (they were previously mislabelled "Mode").
plt.axvline(colorado_prices.mean(), color='green', linestyle='dashed', linewidth=2, label='Colorado Mean')
plt.axvline(colorado_prices.median(), color='blue', linestyle='dashed', linewidth=2, label='Colorado Median')
plt.axvline(utah_prices.mean(), color='red', linestyle='dashed', linewidth=2, label='Utah Mean')
plt.axvline(utah_prices.median(), color='orange', linestyle='dashed', linewidth=2, label='Utah Median')
plt.legend()
plt.title('Distribution of House Prices')
plt.xlabel('Price ($)')

In [None]:
# Same density plot as the previous cell, restricted to prices up to $2,000,000.
sns.displot(data=df, x='price', hue='state', kind='kde', common_norm=False, height=6, aspect=1.5)

# One dashed vertical marker per (state, statistic) pair, drawn in this order.
price_markers = (
    ('Colorado', pd.Series.mean, 'green', 'Colorado Mean'),
    ('Colorado', pd.Series.median, 'blue', 'Colorado Median'),
    ('Utah', pd.Series.mean, 'red', 'Utah Mean'),
    ('Utah', pd.Series.median, 'orange', 'Utah Median'),
)
for state_name, statistic, line_color, line_label in price_markers:
    state_prices = df[df.state == state_name].price
    plt.axvline(statistic(state_prices), color=line_color, linestyle='dashed', linewidth=2, label=line_label)

plt.legend()
plt.xlim((0, 2000000))
plt.title('Distribution of House Prices')
plt.xlabel('Price ($)')

### Prepare the data for prediction

In [None]:
# Split price into four equal-frequency bins. 'price_range' keeps the interval
# itself; 'price_range_encoded' keeps the integer bin number used as the
# classification target.
price_column = df['price']
df['price_range'] = pd.qcut(price_column, q=4)
df['price_range_encoded'] = pd.qcut(price_column, q=4, labels=False)
df

In [None]:
# List the distinct price intervals that the binning produced.
df['price_range'].unique()

In [None]:
# Binary indicator column: 1 for Utah listings, 0 for everything else.
# A vectorised comparison replaces the row-by-row apply; it produces the same
# values, is much faster on large frames, and also works on an empty frame.
df['utah'] = (df['state'] == 'Utah').astype(int)
df

In [None]:
# df['prev_sold_date'] = pd.to_datetime(df['prev_sold_date'])
# df['days_since_last_sale'] = (pd.Timestamp.today() - df['prev_sold_date']).dt.days
# df

## Decision Trees

### Depth of 3

#### Training on whole dataset

In [None]:
# Predictor columns ('days_since_last_sale' is not included).
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah']

# X holds the predictors, y the encoded price bin.
X = df[feature_cols]
y = df.price_range_encoded

# Fit a depth-3 classification tree on the whole dataset (no hold-out split).
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=feature_cols,
                           class_names=['0', '1', '2', '3'], filled=True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

# Predict from the same plain array the tree was fitted on. Passing the
# DataFrame here made scikit-learn warn that the model was fitted without
# feature names.
y_pred = treeclf.predict(X.values)

# Per-class scores on the training data itself (an optimistic estimate).
p, r, f, s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

# Relative importance the fitted tree assigns to each predictor.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

#### Monte Carlo cross-validation

In [None]:
# Predictor columns ('days_since_last_sale' is not included) and target.
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah']
X = df[feature_cols]
y = df.price_range_encoded

# Per-class precision / recall / F-score collected over 30 random 75/25 splits.
scores = {'p': [], 'r': [], 'f': []}
for _ in range(30):
    # No random_state: every iteration draws a different split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Fit on the training part only, then score on the held-out part.
    treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
    treeclf.fit(X_train, y_train)
    y_pred = treeclf.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1, 2, 3])
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays across all splits.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

### Depth of 5

#### Training on whole dataset

In [None]:
# Predictor columns ('days_since_last_sale' is not included).
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah']

# X holds the predictors, y the encoded price bin.
X = df[feature_cols]
y = df.price_range_encoded

# Fit a depth-5 classification tree on the whole dataset (no hold-out split).
treeclf = DecisionTreeClassifier(max_depth=5, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=feature_cols,
                           class_names=['0', '1', '2', '3'], filled=True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

# Predict from the same plain array the tree was fitted on. Passing the
# DataFrame here made scikit-learn warn that the model was fitted without
# feature names.
y_pred = treeclf.predict(X.values)

# Per-class scores on the training data itself (an optimistic estimate).
p, r, f, s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

# Relative importance the fitted tree assigns to each predictor.
pd.DataFrame({'feature': feature_cols, 'importance': treeclf.feature_importances_})

#### Monte Carlo cross-validation

In [None]:
# Predictor columns ('days_since_last_sale' is not included) and target.
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah']
X = df[feature_cols]
y = df.price_range_encoded

# Per-class precision / recall / F-score collected over 30 random 75/25 splits.
scores = {'p': [], 'r': [], 'f': []}
for _ in range(30):
    # No random_state: every iteration draws a different split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Fit on the training part only, then score on the held-out part.
    treeclf = DecisionTreeClassifier(max_depth=5, random_state=1)
    treeclf.fit(X_train, y_train)
    y_pred = treeclf.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1, 2, 3])
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays across all splits.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

### Testing the most important feature only

In [None]:
# Use house_size as the only predictor.
feature_cols = ['house_size']

# X holds the single predictor, y the encoded price bin.
X = df[feature_cols]
y = df.price_range_encoded

# Fit a depth-5 classification tree on the whole dataset (no hold-out split).
treeclf = DecisionTreeClassifier(max_depth=5, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=feature_cols,
                           class_names=['0', '1', '2', '3'], filled=True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

# Predict from the same plain array the tree was fitted on. Passing the
# DataFrame here made scikit-learn warn that the model was fitted without
# feature names.
y_pred = treeclf.predict(X.values)

# Per-class scores on the training data itself (an optimistic estimate).
p, r, f, s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

In [None]:
# Use house_size as the only predictor.
feature_cols = ['house_size']

# X holds the single predictor, y the encoded price bin.
X = df[feature_cols]
y = df.price_range_encoded

# Fit a depth-4 classification tree on the whole dataset (no hold-out split).
treeclf = DecisionTreeClassifier(max_depth=4, random_state=1)
treeclf.fit(X.values, y)

dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=feature_cols,
                           class_names=['0', '1', '2', '3'], filled=True)

# output dot code to copy/paste into https://dreampuf.github.io/GraphvizOnline
print(dot)

# Predict from the same plain array the tree was fitted on. Passing the
# DataFrame here made scikit-learn warn that the model was fitted without
# feature names.
y_pred = treeclf.predict(X.values)

# Per-class scores on the training data itself (an optimistic estimate).
p, r, f, s = precision_recall_fscore_support(y, y_pred, labels=[0, 1, 2, 3])
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

### Finding the best depth

In [None]:
# Predictors and target are identical for every depth, so build them once
# instead of on every pass of the depth loop.
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah']
X = df[feature_cols]
y = df.price_range_encoded

# For each tree depth: the averaged F-score of every class plus their mean.
fScores = {
    'depth': [],
    '0': [],
    '1': [],
    '2': [],
    '3': [],
    'avg': []
}
for i in range(2, 21):
    # Per-class F-scores from 30 random 75/25 splits at this depth.
    depth_f_scores = []
    for _ in range(30):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

        # Fit on the training part only, then score on the held-out part.
        treeclf = DecisionTreeClassifier(max_depth=i, random_state=1)
        treeclf.fit(X_train, y_train)
        y_pred = treeclf.predict(X_test)

        p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1, 2, 3])
        depth_f_scores.append(f)

    # Element-wise mean across the splits: one averaged F-score per class.
    avgF = sum(depth_f_scores) / len(depth_f_scores)

    fScores['depth'].append(i)
    for class_idx, class_score in enumerate(avgF):
        fScores[str(class_idx)].append(class_score)
    fScores['avg'].append(sum(avgF) / len(avgF))

display(fScores)

In [None]:
# Reshape the per-depth scores from one column per class into long format
# (one row per depth/class pair) so they can be plotted with a hue.
score_columns = ['0', '1', '2', '3', 'avg']
scoresDF = pd.DataFrame(fScores).melt(
    id_vars='depth',
    value_vars=score_columns,
    var_name='class',
    value_name='fScore',
)
scoresDF

In [None]:
# Only the rows holding the class-averaged F-score matter for picking a depth.
avgRows = scoresDF[scoresDF['class'] == 'avg']

maxAvgScore = avgRows['fScore'].max()
display(maxAvgScore)

# Depth whose averaged F-score equals the maximum (first match on a tie).
# Selecting within avgRows avoids applying a full-length mask to an already
# filtered frame, which pandas warns about.
bestDepth = avgRows.loc[avgRows['fScore'] == maxAvgScore, 'depth'].tolist()[0]
display(bestDepth)

# Look up the per-class scores at the depth that was actually found, rather
# than at a hard-coded depth of 7.
index = fScores['depth'].index(bestDepth)
bestDepthScores = {
    key: value[index]
    for key, value in fScores.items()
    if key not in ('depth', 'avg')
}
bestDepthScores

In [None]:
# One line per class (and one for the average) showing F-score against depth,
# with a dashed vertical marker at the best depth found above.
ax = sns.lineplot(data=scoresDF, x='depth', y='fScore', hue='class')
ax.axvline(bestDepth, color='purple', linestyle='dashed', linewidth=2, label='Best Depth')
ax.set_title('Depth of Decision Tree vs Average F-Score of Decision Tree')
ax.set_xlabel('Depth')
ax.set_ylabel('Average F-Score')

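It looks like a depth of 7 is generally the best. (The exact optimum can shift slightly between runs because the train/test splits are drawn at random.)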

## Neural Networks

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [None]:
display(df)
# describe must be called: the bare attribute only shows the bound method's
# repr instead of the summary statistics.
df.describe()

In [None]:
# Predictor columns for the neural network ('days_since_last_sale' is not included).
# Adding 'zip_code', 'brokered_by' and 'status' was also tried; it was not
# better, and most predictions were worse.
feature_cols = ['bed', 'bath', 'acre_lot', 'house_size', 'utah']

# Target is the encoded price bin; inputs are the selected predictor columns.
y = df['price_range_encoded']
X = df[feature_cols]

# Random split using train_test_split's default proportions (no fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Standardise the predictors. The scaler learns its statistics from the
# training rows only and then applies that same transformation to both parts,
# so nothing about the test rows leaks into the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train

In [None]:
# First network: three hidden layers with 7, 9 and 11 neurons, trained on the
# scaled training data. The large max_iter is presumably there so the solver
# is not stopped by the iteration cap -- TODO confirm.
# Earlier single-hidden-layer attempt, left disabled:
# mlp = MLPClassifier(hidden_layer_sizes=(3),max_iter=500)
mlp = MLPClassifier(hidden_layer_sizes=(7, 9, 11),max_iter=50000)
mlp.fit(X_train,y_train)

In [None]:
# Score the fitted network on the held-out rows, one value per price bin.
predictions = mlp.predict(X_test)
p, r, f, s = precision_recall_fscore_support(y_test, predictions, labels=[0, 1, 2, 3])
for template, per_class in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(per_class))

In [None]:
# Describe the shape of the fitted network: how many weight matrices it has,
# the dimensions of each, and how many intercepts each layer carries.
print('This dataset has {} input nodes and {} output node(s)'.format(len(X.columns), len(y.unique())))
print('There are {} 2D arrays of coefficients, one for each layer'.format(len(mlp.coefs_)))
# Header for the per-layer lines below (previously printed a stray "{}").
print('The layers have the following number of coefficients:')
for layer_idx, layer_coefs in enumerate(mlp.coefs_):
    m = len(layer_coefs)       # nodes feeding into this set of weights
    n = len(layer_coefs[0])    # nodes in the layer being fed
    print('  {}: {}x{} ({} nodes feeding into a layer of {} nodes)'.format(layer_idx, m, n, m, n))
# Print the actual coefficients
print(mlp.coefs_)

print()
print('There are {} 1D arrays of intercepts, one for each layer'.format(len(mlp.intercepts_)))
print('Each layer has {} intercepts, one for each node'.format([len(layer_intercepts) for layer_intercepts in mlp.intercepts_]))

In [None]:
import networkx as nx
import colorsys

def show_ann(mlp):
    """Draw a fitted multi-layer perceptron as a layered directed graph.

    Every neuron becomes a node named ``Layer<l>_<i>``. Layer 0 (the inputs)
    is placed on the top row and each following layer one row lower. Every
    weight in ``mlp.coefs_`` becomes an edge whose width is twice the weight's
    magnitude and whose colour encodes its sign: hue 0 (red) for negative
    weights, hue 0.65 (blue) otherwise.

    All layers, including the output layer, are handled by one loop over the
    weight matrices, so a network whose ``coefs_`` holds a single matrix
    (no hidden layer) is drawn as well instead of raising ``IndexError``.

    Args:
        mlp: fitted estimator exposing ``coefs_``, a list of 2-D weight
            matrices where ``coefs_[l][k][i]`` is the weight from neuron
            ``k`` of layer ``l`` to neuron ``i`` of layer ``l + 1``.

    Returns:
        None. The graph is drawn with ``nx.draw`` onto the current
        matplotlib figure.
    """
    # One weight matrix per pair of consecutive layers.
    layers_n = len(mlp.coefs_) + 1

    G = nx.DiGraph()
    pos = {}

    # Input neurons sit on the top row.
    for i in range(len(mlp.coefs_[0])):
        pos['Layer0_{}'.format(i)] = (i, layers_n - 1)

    # Every weight matrix links one layer to the next; hidden and output
    # layers are treated the same way.
    for prev_layer, layer_weights in enumerate(mlp.coefs_):
        cur_layer = prev_layer + 1
        prev_size = len(layer_weights)
        cur_size = len(layer_weights[0])
        for i in range(cur_size):
            cur_name = 'Layer{}_{}'.format(cur_layer, i)
            pos[cur_name] = (i, layers_n - 1 - cur_layer)
            for k in range(prev_size):
                G.add_edge('Layer{}_{}'.format(prev_layer, k), cur_name,
                           weight=layer_weights[k][i])

    edges = G.edges()
    # Saturation and value are fixed at 1, so only the sign picks the colour.
    colors = [colorsys.hsv_to_rgb(0 if G[u][v]['weight'] < 0 else 0.65, 1, 1)
              for u, v in edges]
    weights = [abs(G[u][v]['weight']) * 2 for u, v in edges]

    nx.draw(G, pos, node_color='y', node_size=450, width=weights, edge_color=colors)
    
# Draw the network currently held in `mlp`.
show_ann(mlp)

In [None]:
# Second network: four hidden layers with 9, 5, 7 and 10 neurons, trained on
# the same scaled training data. This replaces the previous `mlp`.
# Earlier single-hidden-layer attempt, left disabled:
# mlp = MLPClassifier(hidden_layer_sizes=(4),max_iter=500)
mlp = MLPClassifier(hidden_layer_sizes=(9, 5, 7, 10),max_iter=50000)
mlp.fit(X_train,y_train)

In [None]:
# Score the fitted network on the held-out rows, one value per price bin.
predictions = mlp.predict(X_test)
p, r, f, s = precision_recall_fscore_support(y_test, predictions, labels=[0, 1, 2, 3])
for template, per_class in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(per_class))

In [None]:
# Draw the network currently held in `mlp`.
show_ann(mlp)

In [None]:
# Third network: six hidden layers with 5, 11, 9, 7, 6 and 10 neurons, trained
# on the same scaled training data. This replaces the previous `mlp`.
# Earlier single-hidden-layer attempt, left disabled:
# mlp = MLPClassifier(hidden_layer_sizes=(6),max_iter=500)
mlp = MLPClassifier(hidden_layer_sizes=(5, 11, 9, 7, 6, 10),max_iter=50000)
mlp.fit(X_train,y_train)

In [None]:
# Score the fitted network on the held-out rows, one value per price bin.
predictions = mlp.predict(X_test)
p, r, f, s = precision_recall_fscore_support(y_test, predictions, labels=[0, 1, 2, 3])
for template, per_class in (('precision = {}', p), ('recall = {}', r), ('f-score = {}', f)):
    display(template.format(per_class))

In [None]:
# Draw the network currently held in `mlp`.
show_ann(mlp)