In [1]:
# A random forest is a meta estimator that fits a number of decision tree classifiers on 
# various sub-samples of the dataset and uses averaging to improve the predictive accuracy 
# and control over-fitting. The sub-sample size is controlled with the max_samples parameter 
# if bootstrap=True (default), otherwise the whole dataset is used to build each tree.

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini', 
# max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
# max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, 
# oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, 
# class_weight=None, ccp_alpha=0.0, max_samples=None)
import pandas as pd
import numpy as np
# Render every float inside a printed NumPy array with exactly three decimals.
np.set_printoptions(formatter={"float": "{0:0.3f}".format})

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Load the fruit data set; the four numeric columns selected below are the
# model inputs and `fruit_label` is the class to predict.
fruit_df = pd.read_csv("fruit_data_with_colors.csv")

X = fruit_df[['mass', 'width', 'height', 'color_score']]
y = fruit_df["fruit_label"]

# Train and score the forest on 10 separate stratified train/test splits.
# train_test_split is called without random_state, so every iteration (and
# every re-run of this cell) draws a different split; only the forest itself
# is seeded (random_state=0). The printed accuracies therefore vary per run.
avgAccuracy = []
for i in range(10):
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, stratify=y)
    # n_estimators = 100 <--- # of trees
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, criterion='gini', random_state=0)
    clf.fit(X_train, y_train)

    acc = clf.score(X_test, y_test)*100
    avgAccuracy.append(acc)
    print("accuracy for test ...... ", acc)

print("* Average accuracy *: ", sum(avgAccuracy)/len(avgAccuracy))

# ranked based on the average impurity decrease across all the decision trees in the forest
# NOTE: `clf` is the forest fitted in the LAST loop iteration only, so these
# importances describe that single split, not an average over the 10 runs.
feature_importances = clf.feature_importances_

# Create a DataFrame to display the feature importances.
# Feature names are taken from X itself so they always line up with the
# columns the model was trained on (no dependency on a separately defined list).
feature_importance_df = pd.DataFrame({'Feature': list(X.columns), 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("\n", feature_importance_df)

accuracy for test ......  100.0
accuracy for test ......  93.33333333333333
accuracy for test ......  93.33333333333333
accuracy for test ......  100.0
accuracy for test ......  100.0
accuracy for test ......  93.33333333333333
accuracy for test ......  100.0
accuracy for test ......  93.33333333333333
accuracy for test ......  93.33333333333333
accuracy for test ......  86.66666666666667
* Average accuracy *:  95.33333333333333

        Feature  Importance
3  color_score    0.395460
2       height    0.266311
1        width    0.169205
0         mass    0.169023
