In [1]:
import pandas as pd
import pydataset as ds
from sklearn.linear_model import LinearRegression

### Instructions

Use the `pydataset` library to get a dataset called "diamonds". We will be using this dataset for all of the models that we build in this exercise. Get the dataset, see how many records are in it, and look at the first few records.

In [2]:
# Load the "diamonds" dataset from pydataset and report its row count.
diamonds = ds.data('diamonds')
diamonds.shape[0]

53940

In [3]:
# Preview the first five records.
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


#### Single Feature Model
---

In [4]:
# Single-feature model: predict price from carat alone.
# `fit` returns the estimator itself, so construction and fitting can be chained.

model_carat = LinearRegression().fit(diamonds[["carat"]], diamonds["price"])
model_carat

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [5]:
# How much of a diamond's price does carat size explain?
# `score` reports the model's R^2 on the same data it was fitted on.

model_carat.score(diamonds[["carat"]], diamonds["price"])

0.8493305264354857

In [6]:
# Approximately how much price increase is a single carat worth according to this model?
# The fitted coefficient is the predicted price change per one-carat increase.

carat_coefficients = model_carat.coef_
print(carat_coefficients)

[7756.42561797]


#### Multi-Feature Model
---

In [7]:
# Build a model that predicts price based on the measurement (x, y, z), depth, and table of 
# each diamond.

model_carat = LinearRegression()
model_carat.fit(X = diamonds[["x", "y", "z", "depth", "table"]], y = diamonds.price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# How much of the price can be explianed by these factors in a linear way.

model_carat.score(X = diamonds[["x", "y", "z", "depth", "table"]], y = diamonds.price)

0.7845908795144827

In [9]:
# What was the average amount of error the model made? (Keep in mind error can be positive or negative, 
# and we can use absolute value to take this into account.)

diamonds["prediction"] = model_carat.predict(X = diamonds[["x", "y", "z", "depth", "table"]])
diamonds["model_error"] = abs(diamonds.price - diamonds.prediction)
diamonds.model_error.mean()

1360.4012459512896

In [10]:
# Which cuts had the highest and lowest error?
# Average price and absolute model error per cut, ordered from smallest to largest error.

cut_summary = diamonds.groupby("cut")[["price", "model_error"]].mean()
cut_summary.sort_values(by="model_error")

Unnamed: 0_level_0,price,model_error
cut,Unnamed: 1_level_1,Unnamed: 2_level_1
Ideal,3457.54197,1221.940379
Good,3928.864452,1339.932635
Very Good,3981.759891,1379.269822
Premium,4584.257704,1519.003478
Fair,4358.757764,1776.013442


#### Data Subset Models
---

In [11]:
# Fit one linear model per cut (Ideal and Premium), each predicting price from
# carat, depth, and table.
# A fresh copy of the data is loaded first so columns added by earlier cells are gone.

diamonds = ds.data('diamonds')

dia_ideal = diamonds.loc[diamonds["cut"] == "Ideal"]
print(f"Ideal:{len(dia_ideal)}")

dia_prem = diamonds.loc[diamonds["cut"] == "Premium"]
print(f"Premium:{len(dia_prem)}")

cut_features = ["carat", "depth", "table"]

# `fit` returns the estimator, so each model is built and trained in one expression.
model_ideal = LinearRegression().fit(dia_ideal[cut_features], dia_ideal["price"])
model_prem = LinearRegression().fit(dia_prem[cut_features], dia_prem["price"])
model_prem

Ideal:21551
Premium:13791


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Which model performs better, and by how much?
# Print each model's R^2 on its own subset: Ideal first, then Premium.

for cut_model, cut_rows in ((model_ideal, dia_ideal), (model_prem, dia_prem)):
    print(cut_model.score(cut_rows[["carat", "depth", "table"]], cut_rows["price"]))

0.8673505996417938
0.8556807667008979


#### Feature Engineering
---

In [13]:
# Reload diamonds and append a "volume" column: the product of the x, y, and z
# measurements of each diamond.

diamonds = ds.data('diamonds').assign(
    volume=lambda frame: frame["x"] * frame["y"] * frame["z"]
)
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,38.20203
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,34.505856
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,38.076885
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,46.72458
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,51.91725


In [14]:
# Build two models against price: one on the three raw measurements together,
# and one on the newly computed single volume measurement.

model_parts = LinearRegression().fit(diamonds[["x", "y", "z"]], diamonds["price"])
model_vol = LinearRegression().fit(diamonds[["volume"]], diamonds["price"])
model_vol

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
# Is there a difference in the accuracy of the models? To what would this be attributed?
# Compare R^2 for the three-measurement model and the volume-only model.

parts_r2 = model_parts.score(diamonds[["x", "y", "z"]], diamonds["price"])
volume_r2 = model_vol.score(diamonds[["volume"]], diamonds["price"])
print(parts_r2)
print(volume_r2)

0.7825425402126274
0.8142978513633492
