In [1]:
from sklearn import tree
import pandas as pd

In [2]:
file = 'diamonds.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data = df.drop(['Unnamed: 0', 'x', 'y', 'z', 'table', 'depth'], axis=1)
data.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.23,Ideal,E,SI2,326
1,0.21,Premium,E,SI1,326
2,0.23,Good,E,VS1,327
3,0.29,Premium,I,VS2,334
4,0.31,Good,J,SI2,335


In [4]:
data['cut'] = data['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
data['color'] = data['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
data['clarity'] = data['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
data.dropna(inplace=True)
target = data['price']
data = data.drop(['price'], axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,carat,cut,color,clarity
0,0.23,5,6,2.0
1,0.21,4,6,3.0
2,0.23,2,6,5.0
3,0.29,4,2,4.0
4,0.31,2,1,2.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200, max_depth=6)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

  from numpy.core.umath_tests import inner1d


0.9634506882993247

In [7]:
importances = rf.feature_importances_
importances

array([9.22329652e-01, 6.97416030e-05, 2.33458065e-02, 5.42547997e-02])

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.9223296521214387, 'carat'),
 (0.05425479973169254, 'clarity'),
 (0.023345806543907846, 'color'),
 (6.974160296075502e-05, 'cut')]

In [9]:
rf.predict(X_test)

array([  552.51751142,  6200.53426719, 11246.48670376, ...,
        1776.32888708,  9225.14995833,  9124.09214606])

In [10]:
test_df = pd.DataFrame(
    {"carat": ["0.94", "0.25", "0.7", "2.17", "1.51", "1.1"],
     "cut": ["Very Good", "Very Good", "Very Good", "Ideal", "Very Good", "Ideal"],
     "color": ["F", "I", "G", "E", "D", "H"],
     "clarity": ["I", "SI1", "SI1", "SI2", "SI2", "SI2"],
     "price": ["3465", "460", "2205", "23920", "11860", "6010"]
     }
)
test_df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.94,Very Good,F,I,3465
1,0.25,Very Good,I,SI1,460
2,0.7,Very Good,G,SI1,2205
3,2.17,Ideal,E,SI2,23920
4,1.51,Very Good,D,SI2,11860


In [11]:
test_df['cut'] = test_df['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
test_df['color'] = test_df['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
test_df['clarity'] = test_df['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
actuals = test_df['price']
test_df = test_df.drop(['price'], axis=1)
test_df.head()

Unnamed: 0,carat,cut,color,clarity
0,0.94,3,5,1
1,0.25,3,2,3
2,0.7,3,4,3
3,2.17,5,6,2
4,1.51,3,7,2


In [12]:
predictions = rf.predict(test_df)
output = pd.DataFrame({"Prediction": predictions, "Actual": actuals, "Carat": test_df['carat'], "Cut":test_df['cut'],\
                      "Color": test_df['color'], "Clarity": test_df['clarity']}).reset_index(drop=True)
output

Unnamed: 0,Prediction,Actual,Carat,Cut,Color,Clarity
0,3620.422541,3465,0.94,3,5,1
1,552.517511,460,0.25,3,2,3
2,2315.093017,2205,0.7,3,4,3
3,15554.889361,23920,2.17,5,6,2
4,9222.425483,11860,1.51,3,7,2
5,4322.957339,6010,1.1,5,3,2


In [13]:
rf.score(test_df, actuals)

0.7917510893378099

In [14]:
output.to_csv("Outputs/RF_Shane.csv", index=False, header=True)