In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [5]:
features_df = pd.read_csv("cleaned_features_data.csv")
features_df.head(5)

Unnamed: 0,Polyp_id,Patient,Gender,Age,Anemia,Constipation,Location,Polyp Diameter (mm)
0,Poylp1,patient1,M,53.0,0.0,1.0,descending,3.0
1,Poylp2,patient1,M,53.0,0.0,1.0,descending,3.0
2,Poylp3,patient1,M,53.0,0.0,1.0,descending,5.0
3,Poylp4,patient2,M,49.0,0.0,1.0,ascending,5.0
4,Poylp5,patient2,M,49.0,0.0,1.0,descending,7.0


In [7]:
labels_df = pd.read_csv("labels.csv")
labels_df.tail(5)

Unnamed: 0,Type
394,neoplastic
395,neoplastic
396,neoplastic
397,neoplastic
398,neoplastic


In [8]:
features_df.rename(columns = {'Polyp Diameter (mm)':'Diameter'}, inplace = True)
features_df.head(5)

Unnamed: 0,Polyp_id,Patient,Gender,Age,Anemia,Constipation,Location,Diameter
0,Poylp1,patient1,M,53.0,0.0,1.0,descending,3.0
1,Poylp2,patient1,M,53.0,0.0,1.0,descending,3.0
2,Poylp3,patient1,M,53.0,0.0,1.0,descending,5.0
3,Poylp4,patient2,M,49.0,0.0,1.0,ascending,5.0
4,Poylp5,patient2,M,49.0,0.0,1.0,descending,7.0


In [16]:
target_names = list(labels_df.Type)
target = []
for i in target_names:
    if i == 'nonneoplastic':
        target.append(0)
    else:
        target.append(1)

In [26]:
multi_target = []
for i in target_names:
    if i == 'nonneoplastic':
        multi_target.append(0)
    elif i == 'neoplastic':
        multi_target.append(1)
    elif i == "normal":
        multi_target.append(2)
    else:
        multi_target.append(3)

In [28]:
len(multi_target)

399

In [18]:
len(target)

399

In [19]:
labels_df["Target"] = target

In [29]:
labels_df["Multi_Target"] = multi_target

In [30]:
labels_df

Unnamed: 0,Type,Target,Multi_Target
0,nonneoplastic,0,0
1,nonneoplastic,0,0
2,nonneoplastic,0,0
3,nonneoplastic,0,0
4,neoplastic,1,1
...,...,...,...
394,neoplastic,1,1
395,neoplastic,1,1
396,neoplastic,1,1
397,neoplastic,1,1


In [24]:
labels_df.to_csv("binary_labels.csv", index=False)

In [31]:
labels_df.to_csv("multi_labels.csv", index=False)

In [32]:
model_features = features_df[["Gender", "Location", "Age", "Anemia", "Constipation", "Diameter"]]
model_features

Unnamed: 0,Gender,Location,Age,Anemia,Constipation,Diameter
0,M,descending,53.0,0.0,1.0,3.0
1,M,descending,53.0,0.0,1.0,3.0
2,M,descending,53.0,0.0,1.0,5.0
3,M,ascending,49.0,0.0,1.0,5.0
4,M,descending,49.0,0.0,1.0,7.0
...,...,...,...,...,...,...
394,M,descending,0.0,0.0,1.0,0.0
395,M,descending,62.0,0.0,1.0,3.0
396,M,descending,62.0,0.0,1.0,3.0
397,M,hepatik fleksura,62.0,0.0,1.0,2.0


In [49]:
dummies = pd.get_dummies(model_features[["Location", "Gender"]], drop_first=True)
dummies

Unnamed: 0,Location_ascending,Location_cecum,Location_colon,Location_descending,Location_hepaticflexure,Location_hepatik fleksura,Location_rectosigmoid,Location_rectum,Location_rectum.1,Location_sigmoid,Location_sigmoid.1,Location_splenicflexure,Location_splenik fleksura,Location_transverse,Location_valf,Gender_M
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
395,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
396,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
397,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [53]:
dummies["Age"] = model_features.Age
dummies["Diameter"] = model_features.Diameter
dummies["Anemia"] = model_features.Anemia
dummies["Constipation"] = model_features.Constipation
dummies

Unnamed: 0,Location_ascending,Location_cecum,Location_colon,Location_descending,Location_hepaticflexure,Location_hepatik fleksura,Location_rectosigmoid,Location_rectum,Location_rectum.1,Location_sigmoid,Location_sigmoid.1,Location_splenicflexure,Location_splenik fleksura,Location_transverse,Location_valf,Gender_M,Age,Diameter,Anemia,Constipation
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,53.0,3.0,0.0,1.0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,53.0,3.0,0.0,1.0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,53.0,5.0,0.0,1.0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,49.0,5.0,0.0,1.0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,49.0,7.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,1.0
395,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,62.0,3.0,0.0,1.0
396,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,62.0,3.0,0.0,1.0
397,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,62.0,2.0,0.0,1.0


In [66]:
X = dummies
y = labels_df["Target"]

In [76]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200)

clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [77]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [78]:
clf.fit(X_train, y_train);

In [79]:
y_preds = clf.predict(X_test)
y_preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [80]:
y_test

243    0
188    0
207    1
15     1
302    0
      ..
77     0
380    0
124    0
278    1
6      0
Name: Target, Length: 80, dtype: int64

In [81]:
clf.score(X_train, y_train)

0.9717868338557993

In [82]:
clf.score(X_test, y_test)

0.6875

In [83]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.44      0.30      0.36        23
           1       0.75      0.84      0.79        57

    accuracy                           0.69        80
   macro avg       0.59      0.57      0.58        80
weighted avg       0.66      0.69      0.67        80



In [84]:
confusion_matrix(y_test, y_preds)

array([[ 7, 16],
       [ 9, 48]], dtype=int64)

In [85]:
np.random.seed(42)
for i in range(100, 1000, 100):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 100 estimators...
Model accuracy on test set: 67.50%

Trying model with 200 estimators...
Model accuracy on test set: 71.25%

Trying model with 300 estimators...
Model accuracy on test set: 70.00%

Trying model with 400 estimators...
Model accuracy on test set: 68.75%

Trying model with 500 estimators...
Model accuracy on test set: 71.25%

Trying model with 600 estimators...
Model accuracy on test set: 68.75%

Trying model with 700 estimators...
Model accuracy on test set: 70.00%

Trying model with 800 estimators...
Model accuracy on test set: 68.75%

Trying model with 900 estimators...
Model accuracy on test set: 67.50%

