In [1]:
import numpy as np, pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
galaxies = pd.read_csv(
    "../data/sdss_clean.csv",
    index_col=False,
)
# Load the galaxy table from the project's data directory. index_col=False
# tells pandas not to use the first CSV column as the row index, so rows keep
# the default integer labels.

In [3]:
model = RandomForestRegressor(
    n_estimators=40,
    max_depth=10,
    min_samples_split=20,
    n_jobs=6,
    random_state=20200310,
)
# Random forest regressor with fixed hyperparameters; random_state is pinned so
# repeated runs build the same forest.
# NOTE(review): the values are presumably the outcome of a tuning step that is
# not shown in this notebook — confirm where they came from.

In [4]:
y = galaxies.loc[:, "redshift"]
X = galaxies.drop(["ra", "dec", "redshift"], axis="columns")
# Target vector: the "redshift" column. Feature matrix: every other column
# except the coordinates ("ra", "dec") and the target itself.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=20200310,
)
# Split features and target into train and test portions with a fixed seed so
# the split is reproducible. No test_size is passed, so scikit-learn's default
# split proportion applies.

In [6]:
model.fit(X=X_train, y=y_train);
# Train the forest on the training split. The trailing semicolon suppresses
# the estimator repr that the cell would otherwise display.

In [7]:
predictions = model.predict(X=X_test)
# Predicted redshift for every row of the held-out test features.

In [18]:
coords = galaxies.loc[X_test.index, ["ra", "dec"]]
predicted = pd.Series(predictions, index=X_test.index).to_frame()
measured = y.loc[X_test.index].to_frame()
# Three single-purpose frames that all carry the test-set row labels:
#   coords    - the "ra"/"dec" columns looked up in the full galaxy table
#   predicted - the model output, in an unnamed column labelled 0
#   measured  - the true target values, in a column labelled "redshift"

In [19]:
assert coords.index.equals(predicted.index) and predicted.index.equals(measured.index), \
    "coords, predicted and measured must share the same row labels in the same order"
coords.index[:5], predicted.index[:5], measured.index[:5]
# The assertion compares the three indices in full (same labels, same order),
# so a mismatch anywhere stops the notebook. The displayed tuple only previews
# the first five labels of each index for a quick visual check.

(Int64Index([94726, 95323, 47033, 78833, 58784], dtype='int64'),
 Int64Index([94726, 95323, 47033, 78833, 58784], dtype='int64'),
 Int64Index([94726, 95323, 47033, 78833, 58784], dtype='int64'))

In [21]:
predictions_frame = coords.join(
    predicted.join(measured).rename(
        columns={0: "predicted", "redshift": "measured"},
    )
)
# Two index-aligned joins: first the predicted and measured redshifts are put
# side by side (the unnamed column 0 and "redshift" get descriptive labels),
# then that pair is attached to the ra/dec coordinates.

In [22]:
predictions_frame
# Display the combined frame to confirm the join produced the ra, dec,
# predicted and measured columns on the test-set row labels.

Unnamed: 0,ra,dec,predicted,measured
94726,130.786044,53.649087,0.628464,0.679837
95323,116.132952,39.871280,0.608557,0.474598
47033,244.272887,46.892903,0.707678,0.773484
78833,215.904144,65.132218,0.091154,0.090997
58784,256.838457,59.699704,0.060734,0.065224
...,...,...,...,...
60210,157.971169,63.769640,0.118052,0.104300
490,209.926041,0.487777,0.697768,0.715263
2673,134.095708,-0.163924,0.575788,0.566326
82274,195.546076,66.051901,0.308508,0.322330


In [23]:
predictions_frame.to_csv(
    "../data/predictions.csv",
    index=False,
)
# Write the combined frame to CSV. index=False omits the row labels, so the
# file holds only the ra, dec, predicted and measured columns.