In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split



In [9]:
# Location of the Melbourne housing CSV, kept in a variable for readability.
melbourne_file_path = 'Downloads/melb_data.csv'
# Load the CSV into a DataFrame named melbourne_data.
melbourne_data = pd.read_csv(melbourne_file_path)
# dropna returns a NEW DataFrame without the rows that contain missing values;
# melbourne_data itself is left unchanged. (Passing inplace=True would instead
# modify melbourne_data directly and return None.)
melbourne_data_clean = melbourne_data.dropna(axis="index")
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` with the given ``max_leaf_nodes`` and
    ``random_state=0`` is fitted on ``train_X`` / ``train_y``; its predictions
    for ``val_X`` are then scored against ``val_y`` with
    ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
# Summary statistics and column labels of the loaded data
print(melbourne_data.describe())
print(melbourne_data.columns)

# Prediction target, called y by convention: the Price column.
# NOTE: y and X are taken from melbourne_data, not from melbourne_data_clean.
y = melbourne_data['Price']
print(y)

# Columns used as model inputs (spelled exactly as the labels looked up below)
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
# Feature matrix, called capital X by convention
X = melbourne_data[melbourne_features]

# Summary statistics of the selected features
print(X.describe())
# First five rows of the selected features
print(X.head())

# Split both features and target into training and validation sets.
# The split is driven by a random number generator; passing a fixed
# random_state yields the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
# Define model. A fixed random_state makes the fitted tree -- and therefore the
# MAE printed below -- the same on every run; without it scikit-learn's random
# feature permutation at each split can produce a different tree each time.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit model on the training split only
melbourne_model.fit(train_X, train_y)

# Predict prices for the validation split and report the mean absolute error
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

# Compare MAE with differing values of max_leaf_nodes
max_leaf_nodes_values = [5, 50, 500, 5000]  # Specify the values you want to test
# Fit each candidate tree exactly once and keep its validation MAE, keyed by
# leaf count, so the report loop and the best-size lookup share one result.
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in max_leaf_nodes_values}
for max_leaf_nodes, my_mae in scores.items():
    # %d truncates the MAE to a whole number for display only
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

# Leaf count with the lowest validation MAE
best_tree_size = min(scores, key=scores.get)
print(scores)


              Rooms         Price      Distance      Postcode      Bedroom2  \
count  13580.000000  1.358000e+04  13580.000000  13580.000000  13580.000000   
mean       2.937997  1.075684e+06     10.137776   3105.301915      2.914728   
std        0.955748  6.393107e+05      5.868725     90.676964      0.965921   
min        1.000000  8.500000e+04      0.000000   3000.000000      0.000000   
25%        2.000000  6.500000e+05      6.100000   3044.000000      2.000000   
50%        3.000000  9.030000e+05      9.200000   3084.000000      3.000000   
75%        3.000000  1.330000e+06     13.000000   3148.000000      3.000000   
max       10.000000  9.000000e+06     48.100000   3977.000000     20.000000   

           Bathroom           Car       Landsize  BuildingArea    YearBuilt  \
count  13580.000000  13518.000000   13580.000000   7130.000000  8205.000000   
mean       1.534242      1.610075     558.416127    151.967650  1964.684217   
std        0.691712      0.962634    3990.669241   

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
import altair as alt

# Reload the CSV and drop rows with missing values so the plots below work
# from the same melbourne_data_clean frame regardless of earlier cells.
melbourne_file_path = 'Downloads/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data_clean = melbourne_data.dropna(axis=0)

# Visualize data distribution using histograms (one panel per numeric column)
melbourne_data_clean.hist(bins=20, figsize=(15, 10))
plt.suptitle('Histograms of Numerical Columns in Melbourne Data', fontsize=16)
plt.show()

# Scatter plot to visualize the relationship between two numerical variables
plt.scatter(melbourne_data_clean['Rooms'], melbourne_data_clean['Price'])
plt.title('Scatter Plot: Rooms vs Price')
plt.xlabel('Rooms')
plt.ylabel('Price')
plt.show()

# Pair plot for selected features using Seaborn
sns.pairplot(melbourne_data_clean[['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']])
# y slightly above 1 places the title above the grid of panels
plt.suptitle('Pair Plot of Selected Features', y=1.02, fontsize=16)
plt.show()

# Scatter plot using Plotly
fig = px.scatter(melbourne_data_clean, x='Rooms', y='Price', title='Scatter Plot: Rooms vs Price')
fig.show()

# Scatter plot using Bokeh
source = ColumnDataSource(melbourne_data_clean)
p = figure(title='Scatter Plot: Rooms vs Price', x_axis_label='Rooms', y_axis_label='Price')
# scatter() draws circle markers by default. It replaces figure.circle(), whose
# screen-sized marker form is deprecated as of Bokeh 3.4 (NOTE(review): confirm
# against the installed Bokeh version).
p.scatter('Rooms', 'Price', source=source)
show(p)

# # Scatter plot using Altair
# alt.Chart(melbourne_data_clean).mark_circle().encode(
#     x='Rooms',
#     y='Price',
#     tooltip=['Rooms', 'Price']
# ).properties(title='Scatter Plot: Rooms vs Price').interactive()

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split (fit returns the estimator itself),
# with a fixed random_state so the result is repeatable.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
# Score the forest's validation predictions with mean absolute error
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=melb_preds))

180860.37877504269
