# Random Forest

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

Other models:
- Trees (sklearn.tree)
- Support Vector machine (sklearn.svm)
- etc...
See https://scikit-learn.org/stable/modules/classes.html

## Preparing Data

In [3]:
df = pd.read_csv('data/GalaxyZoo1_DR_table2.csv')
df.head()

Unnamed: 0,OBJID,RA,DEC,NVOTE,P_EL,P_CW,P_ACW,P_EDGE,P_DK,P_MG,P_CS,P_EL_DEBIASED,P_CS_DEBIASED,SPIRAL,ELLIPTICAL,UNCERTAIN
0,587727178986356823,00:00:00.41,-10:22:25.7,59,0.61,0.034,0.0,0.153,0.153,0.051,0.186,0.61,0.186,0,0,1
1,587727227300741210,00:00:00.74,-09:13:20.2,18,0.611,0.0,0.167,0.222,0.0,0.0,0.389,0.203,0.797,1,0,0
2,587727225153257596,00:00:01.03,-10:56:48.0,68,0.735,0.029,0.0,0.147,0.074,0.015,0.176,0.432,0.428,0,0,1
3,587730774962536596,00:00:01.38,+15:30:35.3,52,0.885,0.019,0.0,0.058,0.019,0.019,0.077,0.885,0.077,0,1,0
4,587731186203885750,00:00:01.55,-00:05:33.3,59,0.712,0.0,0.0,0.22,0.068,0.0,0.22,0.64,0.29,0,0,1


In [4]:
data = df.drop(['OBJID','RA','DEC'], axis=1)

In [7]:
X = data.drop(['SPIRAL','ELLIPTICAL','UNCERTAIN'], axis=1).values  # We get rid of the labels
y = data[['SPIRAL','ELLIPTICAL','UNCERTAIN']].values # We select the labels only

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# normalize the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Training Model

In [10]:
classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)

In [12]:
%%time
classifier.fit(X_train, y_train)

CPU times: user 21.7 s, sys: 152 ms, total: 21.9 s
Wall time: 21.9 s


In [13]:
y_pred = classifier.predict(X_test)
y_pred

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0]])

## Evaluate Model

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [17]:
confusion_matrix(y_test.flatten(), y_pred.flatten())

array([[254268,  12910],
       [ 14235, 119354]])

In [19]:
accuracy_score(y_test.flatten(), y_pred.flatten())

0.9322673773040195

In [24]:
y_pred_p = classifier.predict_proba(X_test)