In [1]:
from sklearn import tree
import pandas as pd

In [2]:
file = 'diamonds.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data = df.drop(['Unnamed: 0', 'x', 'y', 'z', 'table', 'depth'], axis=1)
data.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.23,Ideal,E,SI2,326
1,0.21,Premium,E,SI1,326
2,0.23,Good,E,VS1,327
3,0.29,Premium,I,VS2,334
4,0.31,Good,J,SI2,335


In [4]:
data['cut'] = data['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
data['color'] = data['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
data['clarity'] = data['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
data.dropna(inplace=True)
target = data['price']
data = data.drop(['price'], axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,carat,cut,color,clarity
0,0.23,5,6,2.0
1,0.21,4,6,3.0
2,0.23,2,6,5.0
3,0.29,4,2,4.0
4,0.31,2,1,2.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200, max_depth=6)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

  from numpy.core.umath_tests import inner1d


0.9637142744956316

In [7]:
importances = rf.feature_importances_
importances

array([9.22142877e-01, 5.15513322e-05, 2.34215251e-02, 5.43840464e-02])

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.9221428771563717, 'carat'),
 (0.05438404644312232, 'clarity'),
 (0.02342152506826777, 'color'),
 (5.155133223772357e-05, 'cut')]

In [9]:
rf.predict(X_test)

array([  551.73856611,  6209.11248675, 11508.53105047, ...,
        1771.68419344,  9227.12043158,  9072.05795588])

In [10]:
test_df = pd.DataFrame(
    {"carat": ["0.94", "0.32", "0.58", "0.74", "1.31", "0.9"],
     "cut": ["Very Good", "Very Good", "Ideal", "Good", "Good", "Very Good"],
     "color": ["F", "H", "J", "E", "H", "I"],
     "clarity": ["SI1", "SI1", "SI1", "SI1", "SI1", "I"],
     "price": ["900", "1100", "2600", "5200", "11000", "4400"]
     }
)
test_df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.41,Good,J,SI1,900
1,0.32,Very Good,H,SI1,1100
2,0.58,Ideal,J,SI1,2600
3,0.74,Good,E,SI1,5200
4,1.31,Good,H,SI1,11000


In [11]:
test_df['cut'] = test_df['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
test_df['color'] = test_df['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
test_df['clarity'] = test_df['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
actuals = test_df['price']
test_df = test_df.drop(['price'], axis=1)
test_df.head()

Unnamed: 0,carat,cut,color,clarity
0,0.41,2,1,3
1,0.32,3,3,3
2,0.58,5,1,3
3,0.74,2,6,3
4,1.31,2,3,3


In [12]:
predictions = rf.predict(test_df)
pd.DataFrame({"Prediction": predictions, "Actual": actuals}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,791.713516,900
1,551.738566,1100
2,1277.985602,2600
3,2572.527683,5200
4,6209.112487,11000
5,3111.883291,4400


In [13]:
rf.score(test_df, actuals)

0.5226685068888839