In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pickle

In [21]:
# Load the unscaled training set and discard the 'Unnamed: 0' and
# 'zip_code_A>' columns in a single pass.
Train = pd.read_csv("Train_Set_Non_Scaled.csv").drop(
    columns=['Unnamed: 0', 'zip_code_A>']
)

# Every column except the trailing six is downcast to int8.
# NOTE(review): presumably these leading columns are 0/1 dummies -- confirm
# that none of them holds values outside the int8 range.
int64_cols = Train.shape[1] - 6
cols_convert = Train.columns[:int64_cols]
dtype_dict = dict.fromkeys(cols_convert, np.int8)
Train[cols_convert] = Train[cols_convert].astype(dtype_dict)

# Separate the binned target from the predictors; both label columns are
# removed from the feature matrix.
label_cols = ["label", "binned_label"]
y_binned_Train = Train['binned_label']
X_Train = Train.drop(columns=label_cols)

In [4]:
# Load the hold-out set and discard the 'Unnamed: 0' column
# (presumably the index saved alongside the CSV -- confirm).
Dummy = pd.read_csv("Final_Dummy.csv").drop(columns=['Unnamed: 0'])

# Fill missing values BEFORE deriving X_Test so that the feature matrix handed
# to the model is NaN-free. Previously the fill ran after X_Test had already
# been extracted, so X_Test kept the original NaNs.
Dummy = Dummy.fillna(0)

# Everything except the raw price label is a predictor.
label = ["label"]
X_Test = Dummy.drop(columns=label)

In [5]:
# Distinct price-bin labels present in the training target, in order of first appearance.
Ranges = pd.unique(y_binned_Train)

In [6]:
# (inclusive lower bound, exclusive upper bound, bin name), in ascending order.
price_bins = [
    (1000, 90000, "1k - 89k"),
    (90000, 188000, "90k - 187k"),
    (188000, 374000, "188k - 373k"),
    (374000, 700000, "374k - 699k"),
    (700000, 1425000, "700k - 1424k"),
]

# Map each raw price in the hold-out set to its bin name. Anything that matches
# none of the ranges above falls through to the top bin.
# NOTE(review): the fall-through also catches prices below 1000 and NaN (all
# comparisons with NaN are False) -- confirm that this is intended.
prices = [
    next(
        (name for low, high, name in price_bins if low <= p < high),
        "1425k - 875000k",
    )
    for p in Dummy["label"]
]

In [7]:
# The binned hold-out labels; this is the same list object as `prices`, not a copy.
y_binned_Test = prices

In [34]:
# Fit a 50-tree random forest on the training features against the binned
# price label; random_state is fixed so repeated fits give the same model.
RF = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
RF.fit(X_Train, y_binned_Train)

Imp = RF.feature_importances_
feature_names = X_Train.columns

# Rank the features from most to least important and print them as a
# numbered list: "<rank>. <feature> (<importance>)".
sorted_idx = Imp.argsort()[::-1]
for rank, idx in enumerate(sorted_idx, start=1):
    print(f"{rank}. {feature_names[idx]} ({Imp[idx]:f})")

1. house_size (0.231403)
2. acre_lot (0.179547)
3. bath (0.073927)
4. bed (0.057491)
5. state_Pennsylvania (0.015109)
6. city_New York City (0.013135)
7. city_New York (0.012110)
8. state_New York (0.010621)
9. sold_yes (0.007296)
10. state_New Jersey (0.007195)
11. sold_no (0.007156)
12. state_Massachusetts (0.006899)
13. city_Boston (0.005613)
14. zip_code_01 (0.005517)
15. city_Brooklyn (0.005202)
16. zip_code_10 (0.004204)
17. zip_code_03 (0.004029)
18. city_Pittsburgh (0.004021)
19. zip_code_02 (0.003893)
20. state_Connecticut (0.003725)
21. zip_code_05 (0.003628)
22. city_Bronx (0.003435)
23. zip_code_04 (0.003128)
24. zip_code_08 (0.003113)
25. zip_code_06 (0.003099)
26. zip_code_07 (0.002991)
27. zip_code_20 (0.002943)
28. zip_code_21 (0.002941)
29. status_for_sale (0.002862)
30. zip_code_11 (0.002848)
31. zip_code_32 (0.002826)
32. city_Syracuse (0.002742)
33. zip_code_17 (0.002635)
34. zip_code_09 (0.002605)
35. zip_code_22 (0.002548)
36. zip_code_14 (0.002544)
37. zip_code_1

In [35]:
# Predict a price bin for every hold-out row, then print the per-class
# precision / recall / F1 table. zero_division=0 makes undefined metrics
# show up as 0.
y_pred = RF.predict(X_Test)
report = classification_report(y_binned_Test, y_pred, zero_division=0)
print("Metrics: \n", report)

Metrics: 
                  precision    recall  f1-score   support

1425k - 875000k       1.00      1.00      1.00         1
       1k - 89k       0.00      0.00      0.00         1
    374k - 699k       0.00      0.00      0.00         1
   700k - 1424k       0.50      1.00      0.67         1

       accuracy                           0.50         4
      macro avg       0.38      0.50      0.42         4
   weighted avg       0.38      0.50      0.42         4



In [10]:
# Display the predicted bin labels for visual comparison with the true bins.
y_pred

array(['1425k - 875000k', '1425k - 875000k', '374k - 699k',
       '1425k - 875000k', '1425k - 875000k', '1425k - 875000k',
       '1425k - 875000k', '1425k - 875000k', '1425k - 875000k',
       '1425k - 875000k', '1425k - 875000k', '1425k - 875000k'],
      dtype=object)

In [11]:
# Display the true bin labels derived from Dummy["label"].
prices

['700k - 1424k',
 '188k - 373k',
 '90k - 187k',
 '374k - 699k',
 '374k - 699k',
 '188k - 373k',
 '1k - 89k',
 '1k - 89k',
 '1425k - 875000k',
 '374k - 699k',
 '1425k - 875000k',
 '700k - 1424k']

In [26]:
# Serialize the fitted RF model to 'RandomForest.pkl' in the working directory.
with open('RandomForest.pkl', mode='wb') as pkl_file:
    pickle.dump(RF, file=pkl_file)