# Notebook sniff from Chap 3 Exploring and Denoising Your Data Set

In [None]:
import pandas as pd

# Load the rental listings and report the frame's dimensions.
rent_csv = "data/rent.csv"
df = pd.read_csv(rent_csv)
print(df.shape)  # (rows, columns)

In [None]:
# Print pandas' summary of the loaded frame before selecting columns.
df.info()

In [None]:
# Keep only the numeric columns used for modelling, plus the target price.
numeric_cols = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']
df_num = df[numeric_cols]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the numeric frame as a test set; no random_state is
# passed, so the split differs from run to run.
initial_split = train_test_split(df_num, test_size=0.15)
df_train, df_test = initial_split

In [None]:
# Persist both halves of the split under data/.
for frame, csv_path in ((df_train, "data/rent-train.csv"),
                        (df_test, "data/rent-test.csv")):
    frame.to_csv(csv_path)

In [None]:
# Separate the predictor columns from the target column.
feature_cols = ['bedrooms','bathrooms','latitude','longitude']
X_train = df_train[feature_cols]
y_train = df_train['price']

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest; n_jobs=-1 asks scikit-learn to use all available cores.
rf_params = {'n_estimators': 100, 'n_jobs': -1}
rf = RandomForestRegressor(**rf_params)

In [None]:
# Train the forest on the training features and prices.
rf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the fitted forest on the rows it was trained on.
pred_train = rf.predict(X_train)
err_train = mean_absolute_error(y_train, pred_train)
print(f"${err_train:.2f} average error on training set")

# Put that error in context as a percentage of the mean training price.
avg_train = y_train.mean()
perc_err_train = 100 * err_train / avg_train
print(f"{perc_err_train:0.2f}% of average {avg_train:.2f}")

In [None]:
from sklearn.model_selection import cross_val_score

k = 5  # number of folds/chunks
cv_model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
cvscore = cross_val_score(cv_model, X_train, y_train,
                          cv=k, scoring='neg_mean_absolute_error')
# The scorer returns negated MAE; flip the sign to get per-fold errors.
chunks_valid = -cvscore
print(chunks_valid)

In [None]:
# Average the per-fold errors into one validation figure.
avg_err_valid = chunks_valid.mean()
validation_msg = f"{avg_err_valid:.2f} average error on validation set"
print(validation_msg)

In [None]:
# Run the same k-fold cross-validation three times. Every pass builds a new
# forest without a random_state, so the three printed averages show how much
# the validation error moves between otherwise identical runs.
# cvscore, chunks_valid and avg_err_valid keep the values from the last pass.
for trial in range(3):
    cvscore = cross_val_score(
                RandomForestRegressor(n_estimators=100, n_jobs=-1), # which model to use
                X_train, y_train, # what training data to split up
                cv=k, # number of folds/chunks
                scoring='neg_mean_absolute_error') # what error metric
    chunks_valid = -cvscore  # reverse neg of neg_mean_absolute_error
    avg_err_valid = chunks_valid.mean()
    print(f"{avg_err_valid:.2f} average error on validation set")

In [None]:
# NOTE(review): `plt` is not imported in any cell visible before this one --
# presumably supplied by the notebook environment; confirm.
fold_numbers = range(1,k+1)
plt.plot(fold_numbers, chunks_valid, color='blue')
# horizontal reference line at the average fold error
average_line = [avg_err_valid, avg_err_valid]
plt.plot([0,k], average_line, color='red')
plt.show()

In [None]:
# Named hex colours used by the plots; insertion order is unchanged.
bookcolors = dict(
    crimson='#a50026',
    red='#d73027',
    redorange='#f46d43',
    orange='#fdae61',
    yellow='#fee090',
    sky='#e0f3f8',
    babyblue='#abd9e9',
    lightblue='#74add1',
    blue='#4575b4',
    purple='#313695',
)

In [None]:
# Undo the sign flip of neg_mean_absolute_error, then summarise the fold
# errors by their mean and standard deviation.
chunks_valid = -cvscore
avg_err_valid, std_err_valid = chunks_valid.mean(), chunks_valid.std()
print(f"${int(avg_err_valid)} average error +/-${int(std_err_valid)}")

In [None]:
# Histogram of bedroom counts across all apartments.
plt.hist(df_num.bedrooms, color=bookcolors['blue'])
plt.xlabel('Num Bedrooms')
plt.ylabel('Num Apts')
plt.show()

In [None]:
# Keep only records (training and testing alike) priced strictly between
# $1,000 and $10,000.
price_in_range = (df_num.price>1_000) & (df_num.price<10_000)
df_clean = df_num[price_in_range]

In [None]:
# Price distribution after the range filter, in 45 bins.
plt.hist(df_clean.price, bins=45, color=bookcolors['blue'])
plt.xlabel('Price')
plt.ylabel('Num Apts at that price')
plt.show()

In [None]:
import numpy as np
# find middle 98% of original prices: the 1st percentile is the lower bound
# and the 99th percentile the upper bound (np.percentile returns them in the
# order requested)
lower, upper = np.percentile(df.price, [1,99])
# pull every price outside [lower, upper] back to the nearest bound
clipped = np.clip(df.price, lower, upper)
plt.xlabel('Prices in middle 98% range')
plt.ylabel('Num Apts at that price')
plt.hist(clipped, bins=45, color=bookcolors['blue'])
plt.show()

In [None]:
# Rows where either coordinate is exactly zero are treated as missing a location.
has_zero_coord = (df_clean.longitude==0) | (df_clean.latitude==0)
df_missing = df_clean[has_zero_coord]

In [None]:
# Drop records with a missing (zero) coordinate. A row is missing a location
# when EITHER coordinate is zero, so the rows to keep are those where BOTH are
# non-zero: `&`, not `|`. With `|`, a row with just one zero coordinate
# would be kept.
df_clean = df_clean[(df_clean.longitude!=0) & (df_clean.latitude!=0)]

In [None]:
# Keep only apartments strictly inside the latitude/longitude bounding box.
lat_in_box = (df_clean['latitude']>40.55) & (df_clean['latitude']<40.94)
lon_in_box = (df_clean['longitude']>-74.1) & (df_clean['longitude']<-73.67)
df_clean = df_clean[lat_in_box & lon_in_box]

In [None]:
# Re-split using the cleaned frame, again holding out 15% for testing.
clean_split = train_test_split(df_clean, test_size=0.15)
df_train, df_test = clean_split

In [None]:
# Predictors and target taken from the cleaned training split.
clean_feature_cols = ['bedrooms','bathrooms','latitude','longitude']
X_train = df_train[clean_feature_cols]
y_train = df_train['price']

In [None]:
k=5  # number of folds
denoised_model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
cvscore = cross_val_score(denoised_model, X_train, y_train,
                          cv=k, scoring='neg_mean_absolute_error')
# The scorer is negated MAE; negate again to get positive per-fold errors.
chunks_valid = -cvscore
errors = list(chunks_valid.astype('int'))  # truncated to ints for display
avg_err_valid_denoised, std_err_valid_denoised = chunks_valid.mean(), chunks_valid.std()
print(f"{errors} avg {avg_err_valid_denoised:.2f} +/- {std_err_valid_denoised:.2f}")

In [None]:
# Fit three separate forests on the same training data and print each
# out-of-bag score to show the variation between fits.
for i in range(3):
    rf = RandomForestRegressor(
        n_estimators=100, n_jobs=-1, oob_score=True
    ).fit(X_train, y_train)
    print(rf.oob_score_)

In [None]:
# Print the OOB score for three different 85% training subsets of the CLEAN
# data. Each pass must train on the subset it just drew; otherwise the cell
# repeats the previous experiment on an unchanged training set.
# The subset is held in its own names so that X_train / y_train (paired with
# the df_test hold-out from the earlier split) are left untouched for the
# cells that follow.
for i in range(3):
    df_train_resplit, _ = train_test_split(df_clean, test_size=0.15)
    X_train_resplit = df_train_resplit[['bedrooms','bathrooms','latitude','longitude']]
    y_train_resplit = df_train_resplit['price']
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    rf.fit(X_train_resplit, y_train_resplit)
    print(rf.oob_score_)

In [None]:
# Same experiment on the original, unfiltered frame (df, not df_num or
# df_clean) for comparison with the cleaned data.
noisy_cols = ['bedrooms','bathrooms','latitude','longitude']
for i in range(3):
    df_train_noisy, _ = train_test_split(df, test_size=0.15)
    X_train_noisy, y_train_noisy = df_train_noisy[noisy_cols], df_train_noisy['price']
    rf = RandomForestRegressor(
        n_estimators=100, n_jobs=-1, oob_score=True
    ).fit(X_train_noisy, y_train_noisy)
    print(rf.oob_score_)

In [None]:
import numpy as np
X_test = df_test[['bedrooms','bathrooms','latitude','longitude']]
y_test = df_test['price']

rf = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(X_train, y_train)

# For the first ten test apartments, ask each tree in the forest for its own
# prediction; the mean is the estimate and the standard deviation shows how
# much the trees disagree.
for i in range(10):
    apt_row = [X_test.iloc[i]]
    preds = [tree.predict(apt_row) for tree in rf.estimators_]
    a, s = np.mean(preds), np.std(preds)
    y = y_test.iloc[i]
    print(f"Apt {i}: true price ${y:5.0f}, predict ${a:5.0f} +/-${s:4.0f}")

In [None]:
from sklearn.linear_model import Lasso

X_test = df_test[['bedrooms','bathrooms','latitude','longitude']]
y_test = df_test['price']
# Linear baseline: fit a Lasso model and print its score on the training data.
lm = Lasso(alpha=0.5)
lm.fit(X_train, y_train)
lasso_train_score = lm.score(X_train, y_train)
print("training score", lasso_train_score)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting baseline with 500 boosting stages, scored on its own
# training data.
gbr = GradientBoostingRegressor(n_estimators = 500).fit(X_train, y_train)
gbr_train_score = gbr.score(X_train, y_train)
print("training score", gbr_train_score)

In [None]:
def stable_oob_score(X_train, y_train, trials = 7):
    """Return the mean and standard deviation of repeated OOB scores.

    One 100-tree forest is created with ``oob_score=True`` and refit on
    ``(X_train, y_train)`` ``trials`` times; the ``oob_score_`` recorded after
    each fit is collected. No ``random_state`` is set, so the fits are expected
    to differ.

    Returns a ``(mean, std)`` tuple computed with numpy.
    """
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)

    def refit_and_score():
        forest.fit(X_train, y_train)
        return forest.oob_score_

    oob_scores = [refit_and_score() for _ in range(trials)]
    return np.mean(oob_scores), np.std(oob_scores)

oob_summary = stable_oob_score(X_train, y_train)
m, s = oob_summary
print(f"OOB score {m:.5f} +/-{s:.5f}")

In [None]:
# NOTE(review): wildcard import -- the only rfpimp name this cell visibly uses
# is oob_importances; other names (possibly `plt`) may come from it as well,
# so confirm before narrowing it.
from rfpimp import *
# `rf` is the module-level forest fitted in an earlier cell; the forest built
# inside stable_oob_score is local to that function.
I = oob_importances(rf, X_train, y_train)
# Horizontal bar chart of the returned importances, without a legend.
I.plot(kind='barh', legend=False)
plt.show()

In [None]:
# Rebuild the cleaned data from the original frame, this time keeping the
# free-text 'features' column; .copy() so later column edits do not touch df.
df_aug = df[['bedrooms','bathrooms','latitude','longitude',
             'features','price']].copy()
# Keep prices strictly between $1,000 and $10,000.
df_aug = df_aug[(df_aug.price>1_000) & (df_aug.price<10_000)]
# Drop records with a missing (zero) coordinate: keep a row only when BOTH
# coordinates are non-zero. With `|`, a row with one zero coordinate would
# pass this filter.
df_aug = df_aug[(df_aug.longitude!=0) & (df_aug.latitude!=0)]
# Keep only apartments strictly inside the latitude/longitude bounding box.
df_aug = df_aug[(df_aug['latitude']>40.55) & (df_aug['latitude']<40.94) &
                (df_aug['longitude']>-74.1) & (df_aug['longitude']<-73.67)]

In [None]:
# Rewrite the features column: blanks for missing values, then lower case so
# later substring matches are case-insensitive.
df_aug['features'] = df_aug['features'].fillna('').str.lower()

In [None]:
# Derive one boolean column per amenity from the feature text; each pattern is
# passed to str.contains unchanged.
amenity_patterns = (
    ('doorman', "doorman"),
    ('parking', "parking|garage"),
    ('laundry', "laundry"),
)
for amenity, pattern in amenity_patterns:
    df_aug[amenity] = df_aug['features'].str.contains(pattern)
del df_aug['features'] # the raw text is no longer needed

In [None]:
# Split the augmented frame and separate predictors from the price target.
df_train_aug, df_test_aug = train_test_split(df_aug, test_size=0.15)
X_train = df_train_aug.drop('price', axis=1)
y_train = df_train_aug['price']
X_test = df_test_aug.drop('price', axis=1)
y_test = df_test_aug['price']

aug_oob = stable_oob_score(X_train, y_train)
m, s = aug_oob
print(f"OOB score {m:.5f} +/-{s:.5f}")

In [None]:
# Fit a forest on the augmented features, plot the importances and save the
# figure before showing it.
rf = RandomForestRegressor(
    n_estimators=100, n_jobs=-1, oob_score=True
).fit(X_train, y_train)
I = oob_importances(rf, X_train, y_train)
I.plot(kind='barh', legend=False)
plt.savefig("/tmp/feature-importances.svg")
plt.show()

In [None]:
# Measure the OOB score with the 'parking' column left out; X_train itself is
# not modified.
X_without_parking = X_train.drop('parking', axis=1)
m, s = stable_oob_score(X_without_parking, y_train)
print(f"OOB score {m:.5f} +/-{s:.5f}")

In [None]:
# Add a bedrooms-per-bathroom ratio to both feature frames so they stay in
# sync; the +1 keeps the denominator non-zero when bathrooms is 0.
for feature_frame in (X_train, X_test):
    feature_frame["beds_to_baths"] = feature_frame["bedrooms"]/(feature_frame["bathrooms"]+1)
ratio_oob = stable_oob_score(X_train, y_train)
m, s = ratio_oob
print(f"OOB score {m:.5f} +/-{s:.5f}")

In [None]:
# Refit the forest and plot the importances again for the current columns.
rf = RandomForestRegressor(
    n_estimators=100, n_jobs=-1, oob_score=True
).fit(X_train, y_train)
I = oob_importances(rf, X_train, y_train)
I.plot(kind='barh', legend=False)
plt.show()

In [None]:
# Remove the ratio column from both feature frames again.
X_train, X_test = (
    feature_frame.drop('beds_to_baths', axis=1)
    for feature_frame in (X_train, X_test)
)

In [None]:
# Fit the final forest and report its out-of-bag R^2.
rf = RandomForestRegressor(
    n_estimators=100, n_jobs=-1, oob_score=True
).fit(X_train, y_train)
print(f"OOB R^2 {rf.oob_score_:.5f}")

# 5-fold cross-validated mean absolute error on the same training data, using
# a separate unfitted forest.
final_cv_model = RandomForestRegressor(n_estimators=100, n_jobs=-1)
cvscore = cross_val_score(final_cv_model, X_train, y_train,
                          cv=5, scoring='neg_mean_absolute_error')
chunks_valid = -cvscore  # the scorer is negated MAE
err_cv = chunks_valid.mean()
print(f"${err_cv:.2f} average cross-validation error")

In [None]:
# Score the fitted forest on the held-out test set: R^2 first, then MAE.
print(f"Test R^2 {rf.score(X_test, y_test):.5f}")
pred_test = rf.predict(X_test)
err_test = mean_absolute_error(y_test, pred_test)
print(f"${err_test:.2f} average error on test set")

In [None]:
# Five train/test trials on the ORIGINAL (unfiltered) frame, for comparison.
# Each trial overwrites the module-level X_train, y_train, X_test, y_test
# and rf.
errors = []
trial_features = ['bedrooms','bathrooms','latitude','longitude']
for trial in range(5):
    df_train_noisy, df_test_noisy = train_test_split(df, test_size=0.15)
    X_train, y_train = df_train_noisy[trial_features], df_train_noisy['price']
    X_test, y_test = df_test_noisy[trial_features], df_test_noisy['price']
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(X_train, y_train)
    err_test = mean_absolute_error(y_test, rf.predict(X_test))
    errors.append(err_test)
    print(f"${err_test:.2f} average error on test set, test R^2 is {rf.score(X_test, y_test):.4f}")
# Summarise the per-trial errors as one integer dollar figure.
avg_test_error = int(np.mean(errors))
print(f"Average of average errors is ${avg_test_error}")