In [1]:
from sklearn import tree
import pandas as pd

In [2]:
file = 'diamonds.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data = df.drop(['Unnamed: 0', 'x', 'y', 'z', 'table', 'depth'], axis=1)
data.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.23,Ideal,E,SI2,326
1,0.21,Premium,E,SI1,326
2,0.23,Good,E,VS1,327
3,0.29,Premium,I,VS2,334
4,0.31,Good,J,SI2,335


In [4]:
data['cut'] = data['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
data['color'] = data['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
data['clarity'] = data['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
data.dropna(inplace=True)
target = data['price']
data = data.drop(['price'], axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,carat,cut,color,clarity
0,0.23,5,6,2.0
1,0.21,4,6,3.0
2,0.23,2,6,5.0
3,0.29,4,2,4.0
4,0.31,2,1,2.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200, max_depth=6)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

  from numpy.core.umath_tests import inner1d


0.963451755315996

In [7]:
importances = rf.feature_importances_
importances

array([9.22410920e-01, 5.52627219e-05, 2.33586019e-02, 5.41752155e-02])

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.9224109198915161, 'carat'),
 (0.05417521552640705, 'clarity'),
 (0.023358601860211724, 'color'),
 (5.52627218648729e-05, 'cut')]

In [17]:
rf.predict(X_test)

array([  553.01511243,  6176.81984228, 11422.2183104 , ...,
        1772.39204645,  9227.00527469,  9150.94232072])

In [18]:
test_df = pd.DataFrame(
    {"carat": ["0.41", "0.3", "0.51", "0.6", "0.97", "1.5"],
     "cut": ["Ideal", "Ideal", "Good", "Very Good", "Good", "Ideal"],
     "color": ["E", "F", "D", "D", "D", "G"],
     "clarity": ["I", "SI2", "I", "VS1", "VS1", "SI2"],
     "price": ["400", "560", "1020", "2510", "5010", "10060"]
     }
)
test_df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.34,Ideal,E,I,400
1,0.3,Ideal,F,SI2,560
2,0.51,Good,D,I,1020
3,0.6,Very Good,D,VS1,2510
4,0.97,Good,D,VS1,5010


In [19]:
test_df['cut'] = test_df['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
test_df['color'] = test_df['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
test_df['clarity'] = test_df['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
actuals = test_df['price']
test_df = test_df.drop(['price'], axis=1)
test_df.head()

Unnamed: 0,carat,cut,color,clarity
0,0.34,5,6,1
1,0.3,5,5,2
2,0.51,2,7,1
3,0.6,3,7,5
4,0.97,2,7,5


In [20]:
predictions = rf.predict(test_df)
pd.DataFrame({"Prediction": predictions, "Actual": actuals}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,690.228149,400
1,690.228149,560
2,1358.714021,1020
3,1772.392046,2510
4,5268.099945,5010
5,9185.998762,10060


In [21]:
rf.score(test_df, actuals)

0.9773926305699353