In [10]:
#modules for data manipulation
import pandas as pd

In [11]:
#The example input describes 10 attributes spread over 30 data columns:
#10 mean columns, 10 standard error (SE) columns, and 10 worst-value columns
attribute_num = 10 #number of attributes

#read the CSV and discard the id column in one chained expression
df = pd.read_csv('data.csv', delimiter = ",", low_memory=False).drop(columns='id')

#with id removed, column 0 holds the diagnosis and the next attribute_num columns hold the mean attribute data
y = df.iloc[:, :1] #kept as a single-column DataFrame
X = df.iloc[:, 1 : attribute_num + 1]

attribute_list = X.columns.tolist()
print('Here are the attributes:', attribute_list)

Here are the attributes: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']


In [12]:
#modules for generating the trees
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics

In [13]:
num_trees = 1000 #number of random decision trees

#hold out 30% of the samples for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#max_features="sqrt" is what the former "auto" alias meant for classifiers; "auto" was
#deprecated and later removed from scikit-learn, so the explicit value keeps this cell runnable.
#y_train is a single-column DataFrame, so it is flattened to the 1-D array that fit() expects.
clf_RF = RandomForestClassifier(
    n_estimators=num_trees,
    max_features="sqrt",
    random_state=42,
    bootstrap=True,
).fit(X_train, y_train.values.ravel())

In [14]:
#accuracy of the all-attribute forest on the training split and on the held-out test split
#accuracy_score is documented as (y_true, y_pred), so the true labels are passed first
train_accuracy = metrics.accuracy_score(y_train, clf_RF.predict(X_train))
test_accuracy = metrics.accuracy_score(y_test, clf_RF.predict(X_test))

print(f"Random forest from all attributes - Train: {round(train_accuracy * 100, 2)}%")
print(f"Random forest from all attributes - Test: {round(test_accuracy * 100, 2)}%")

Random forest from all attributes - Train: 100.0%
Random forest from all attributes - Test: 93.57%


### Developing a random forest using only the most important attributes

In [15]:
attribute_cut_off = 0.15 #level of acceptable feature importance (must be a number between 0 and 1 - a value of 0 would include all attributes)

importances = clf_RF.feature_importances_
columns = X.columns

#report every attribute's importance and keep the ones that reach the cut-off
#the comparison is inclusive (>=) so that a cut-off of 0 really does keep every attribute,
#including any whose importance is exactly 0
best_attributes_list = []
for column, importance in zip(columns, importances):
    print(f"The importance of attribute '{column}' is {round(importance * 100, 2)}%.")
    if importance >= attribute_cut_off:
        best_attributes_list.append(column)

The importance of attribute 'radius_mean' is 9.83%.
The importance of attribute 'texture_mean' is 6.02%.
The importance of attribute 'perimeter_mean' is 15.09%.
The importance of attribute 'area_mean' is 10.27%.
The importance of attribute 'smoothness_mean' is 2.6%.
The importance of attribute 'compactness_mean' is 5.15%.
The importance of attribute 'concavity_mean' is 18.17%.
The importance of attribute 'concave points_mean' is 29.22%.
The importance of attribute 'symmetry_mean' is 1.91%.
The importance of attribute 'fractal_dimension_mean' is 1.74%.


In [16]:
#express the cut-off as a percentage, then list the attributes that were selected
cut_off_percent = round(attribute_cut_off * 100, 2)
print(f"The attributes with {cut_off_percent}% or higher importance in predicting malignant samples were:")
print(best_attributes_list)

The attributes with 15.0% or higher importance in predicting malignant samples were:
['perimeter_mean', 'concavity_mean', 'concave points_mean']


In [17]:
#fail early with a clear message instead of letting the forest fit on a frame with no columns
if not best_attributes_list:
    raise ValueError("No attribute met attribute_cut_off; lower the cut-off before training on the best attributes.")

best_X = X[best_attributes_list] #dataframe of attributes that met attribute_cut_off

#same test_size and random_state as the all-attribute split, applied to the reduced attribute set
best_X_train, best_X_test, best_y_train, best_y_test = train_test_split(best_X, y, test_size=0.3, random_state=0)

#max_features="sqrt" is what the former "auto" alias meant for classifiers; "auto" was
#deprecated and later removed from scikit-learn, so the explicit value keeps this cell runnable
best_clf_RF = RandomForestClassifier(
    n_estimators=num_trees,
    max_features="sqrt",
    random_state=42,
    bootstrap=True,
).fit(best_X_train, best_y_train.values.ravel())

In [18]:
#accuracy of the reduced-attribute forest on its training split and on its held-out test split
#accuracy_score is documented as (y_true, y_pred), so the true labels are passed first
best_train_accuracy = metrics.accuracy_score(best_y_train, best_clf_RF.predict(best_X_train))
best_test_accuracy = metrics.accuracy_score(best_y_test, best_clf_RF.predict(best_X_test))

print(f"Random forest from the most important attributes - Train: {round(best_train_accuracy * 100, 2)}%")
print(f"Random forest from the most important attributes - Test: {round(best_test_accuracy * 100, 2)}%")

Random forest from the most important attributes - Train: 100.0%
Random forest from the most important attributes - Test: 90.64%
