Data prep

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

rawdf = pd.read_csv('../data/train.csv')

rawdf['Alley'] = rawdf['Alley'].fillna('None')
rawdf['MasVnrType'] = rawdf['MasVnrType'].fillna('None')
rawdf['BsmtQual'] = rawdf['BsmtQual'].fillna('None')
rawdf['BsmtCond'] = rawdf['BsmtCond'].fillna('None')
rawdf['BsmtExposure'] = rawdf['BsmtExposure'].fillna('None')
rawdf['BsmtFinType1'] = rawdf['BsmtFinType1'].fillna('None')
rawdf['BsmtFinType2'] = rawdf['BsmtFinType2'].fillna('None')
rawdf['Electrical'] = rawdf['Electrical'].fillna('None')
rawdf['FireplaceQu'] = rawdf['FireplaceQu'].fillna('None')
rawdf['GarageType'] = rawdf['GarageType'].fillna('None')
rawdf['GarageFinish'] = rawdf['GarageFinish'].fillna('None')
rawdf['GarageQual'] = rawdf['GarageQual'].fillna('None')
rawdf['GarageCond'] = rawdf['GarageCond'].fillna('None')
rawdf['PoolQC'] = rawdf['PoolQC'].fillna('None')
rawdf['Fence'] = rawdf['Fence'].fillna('None')
rawdf['MiscFeature'] = rawdf['MiscFeature'].fillna('None')

y_df = rawdf['SalePrice']

numcols = rawdf.dtypes[rawdf.dtypes != 'object'].keys()
textcols = rawdf.dtypes[rawdf.dtypes == 'object'].keys()

numdf = pd.DataFrame(rawdf[numcols])
numdf.drop('SalePrice', axis=1, inplace=True)
numdf.drop('Id', axis=1, inplace=True)

normalized_df=(numdf-numdf.mean())/numdf.std()
normalized_categories = pd.get_dummies(rawdf[textcols], dtype=float)

X_df = pd.concat([normalized_df, normalized_categories], axis=1)

X = np.array(X_df)
y = np.array(y_df)

# randomly split the data into training, cross validation and test sets
random_state = 42

X_train, X_blind, y_train, y_blind = train_test_split(
    X, y, test_size=0.4, random_state=random_state)

X_test, X_cv, y_test, y_cv = train_test_split(
    X_blind, y_blind, test_size=0.5, random_state=random_state)

In [7]:
rawdf.isna().any()

Id               False
MSSubClass       False
MSZoning         False
LotFrontage       True
LotArea          False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 81, dtype: bool

In [6]:
import matplotlib.pyplot as plt
import math

# Get the list of feature column names
feature_cols = X.columns
n = len(feature_cols)

# Set the number of rows and columns for the subplots
num_rows = math.ceil(n / 4)
num_cols = round(math.sqrt(n))

# Create the subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 18))

# Flatten the axes array
axes = axes.flatten()

# Iterate over the feature columns and plot each feature against the target variable
for i, feature in enumerate(feature_cols):
    ax = axes[i]
    ax.scatter(X[feature], y, alpha=0.5, c=X['OverallQual'], cmap='viridis')
    ax.set_xlabel(feature)

    # Remove the y-axis labels to prevent clutter
    ax.set_yticklabels([])
    # ax.set_ylabel('SalePrice')

# Remove any extra subplots
if len(feature_cols) < num_rows * num_cols:
    for j in range(len(feature_cols), num_rows * num_cols):
        fig.delaxes(axes[j])

fig.suptitle('Feature vs SalePrice', fontsize=20, y=0.99)

# Adjust the spacing between subplots
fig.tight_layout()

# Show the plot
plt.show()


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
from features import plot_feature_vs_target
import matplotlib.colors as mcolors

cmap = mcolors.LinearSegmentedColormap.from_list('hmm', ["red", "blue"])
# fig, axes = plot_feature_vs_target(plt, X, y, color_feature=X['OverallQual'], title='My Chart', )
fig, axes = plot_feature_vs_target(plt, X, y, color_feature=X['OverallQual'], cmap=cmap )
# fig, axes = plot_feature_vs_target(plt, X, y, color_feature=X['OverallQual'] )
# fig, axes = plot_feature_vs_target(plt, X, y , color_feature=y)
plt.show()

NN model

In [None]:
epochs = 20

model = tf.keras.Sequential([
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=epochs, verbose=2, validation_data=(X_cv, y_cv))
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)


In [None]:
import matplotlib.pyplot as plt

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(epochs)

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()