In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [43]:
# Establishes dataframe without name column.
df_raw = pd.read_csv("parkinsons.data")
df_raw.pop("name")

# Defines input and output data.
y = df_raw.iloc[:,16]
X = df_raw.iloc[:,:16]

# Creates random forest model from base data.
model = RandomForestClassifier()
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

# Prints accuracy score from cross evaluation.
n_scores = cross_val_score(model, X, y, scoring = "accuracy", cv = cv, n_jobs = -1, error_score = "raise")
print("Accuracy: %.3f (%.3f)" % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.875 (0.080)


In [44]:
# Defines dictionary of all possible hyperparameters.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
random_grid = {"n_estimators": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
               "max_features": ["auto", "sqrt"],
               "max_depth": max_depth,
               "min_samples_split": [2, 5, 10],
               "min_samples_leaf": [1, 2, 4],
               "bootstrap": [True, False]}

# Creates random forest model from dictionary.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose = 2, 
                               n_jobs = -1)

# Prints the best possible settings for the data.
rf_random.fit(X, y)
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 600,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': True}

In [53]:
# Creates random forest model from base data.
model = RandomForestClassifier(n_estimators = 600,
                              min_samples_split = 10,
                              min_samples_leaf = 4,
                              max_features = "sqrt",
                              max_depth = 110,
                              bootstrap = True)
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

# Prints accuracy score from cross evaluation.
n_scores = cross_val_score(model, X, y, scoring = "accuracy", cv = cv, n_jobs = -1, error_score = "raise")
print("Accuracy: %.3f (%.3f)" % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.877 (0.077)
