In [1]:
from sklearn import tree
import pandas as pd

In [2]:
file = 'diamonds.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
data = df.drop(['Unnamed: 0', 'x', 'y', 'z', 'table', 'depth'], axis=1)
data.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.23,Ideal,E,SI2,326
1,0.21,Premium,E,SI1,326
2,0.23,Good,E,VS1,327
3,0.29,Premium,I,VS2,334
4,0.31,Good,J,SI2,335


In [4]:
data['cut'] = data['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
data['color'] = data['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
data['clarity'] = data['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
data.dropna(inplace=True)
target = data['price']
data = data.drop(['price'], axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,carat,cut,color,clarity
0,0.23,5,6,2.0
1,0.21,4,6,3.0
2,0.23,2,6,5.0
3,0.29,4,2,4.0
4,0.31,2,1,2.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200, max_depth=6)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

  from numpy.core.umath_tests import inner1d


0.9635466701655847

In [7]:
importances = rf.feature_importances_
importances

array([9.22163439e-01, 4.68908256e-05, 2.33926108e-02, 5.43970592e-02])

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.922163439107344, 'carat'),
 (0.05439705924971424, 'clarity'),
 (0.023392610817373333, 'color'),
 (4.689082556790026e-05, 'cut')]

In [9]:
rf.predict(X_test)

array([  552.72058589,  6200.40010504, 11422.34945951, ...,
        1774.59311138,  9239.93182391,  9148.10709285])

In [10]:
test_df = pd.DataFrame(
    {"carat": ["0.32", "0.4", "0.6", "0.73", "1.04", "1.51"],
     "cut": ["Ideal", "Very Good", "Ideal", "Ideal", "Ideal", "Ideal"],
     "color": ["J", "D", "H", "G", "H", "G"],
     "clarity": ["SI2", "VS1", "VS2", "VS2", "VVS2", "VVS2"],
     "price": ["394", "918", "1705", "2367", "6105", "13348"]
     }
)
test_df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.32,Ideal,J,SI2,394
1,0.4,Very Good,D,VS1,918
2,0.6,Ideal,H,VS2,1705
3,0.73,Ideal,G,VS2,2367
4,1.04,Ideal,H,VVS2,6105


In [11]:
test_df['cut'] = test_df['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
test_df['color'] = test_df['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
test_df['clarity'] = test_df['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
actuals = test_df['price']
test_df = test_df.drop(['price'], axis=1)
test_df.head()

Unnamed: 0,carat,cut,color,clarity
0,0.32,5,1,2
1,0.4,3,7,5
2,0.6,5,3,4
3,0.73,5,4,4
4,1.04,5,3,6


In [12]:
predictions = rf.predict(test_df)
output = pd.DataFrame({"Prediction": predictions, "Actual": actuals, "Carat": test_df['carat'], "Cut":test_df['cut'],\
                      "Color": test_df['color'], "Clarity": test_df['clarity']}).reset_index(drop=True)
output

Unnamed: 0,Prediction,Actual,Carat,Cut,Color,Clarity
0,552.720586,394,0.32,5,1,2
1,1018.928719,918,0.4,3,7,5
2,1373.925629,1705,0.6,5,3,4
3,3114.760164,2367,0.73,5,4,4
4,6017.961662,6105,1.04,5,3,6
5,14244.238278,13348,1.51,5,4,6


In [13]:
rf.score(test_df, actuals)

0.9875960818629058

In [14]:
output.to_csv("Outputs/RF_BlueNile.csv", index=False, header=True)