In [30]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import os
from sklearn.metrics import accuracy_score
import pickle


In [3]:
# Load the dataset from the CSV file that sits next to this notebook.
data = pd.read_csv(
    './Prostate_Cancer.csv',
)

In [5]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,1,M,23,12,151,954,0.143,0.278,0.242,0.079
1,2,B,9,13,133,1326,0.143,0.079,0.181,0.057
2,3,M,21,27,130,1203,0.125,0.16,0.207,0.06
3,4,M,14,16,78,386,0.07,0.284,0.26,0.097
4,5,M,9,19,135,1297,0.141,0.133,0.181,0.059


In [33]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of `data`.
# NOTE(review): the recorded output below carries execution count 33 and already shows
# diagnosis_result as a numeric column with several feature columns missing, so it was
# produced after the later encode/drop cells ran. Expect a different table when the
# notebook is re-run top to bottom -- confirm and consider moving this cell.
data.describe()


Unnamed: 0,diagnosis_result,radius,area,smoothness,compactness,symmetry
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.38,16.85,702.88,0.10273,0.1267,0.19317
std,0.487832,4.879094,319.710895,0.014642,0.061144,0.030785
min,0.0,9.0,202.0,0.07,0.038,0.135
25%,0.0,12.0,476.75,0.0935,0.0805,0.172
50%,0.0,17.0,644.0,0.102,0.1185,0.19
75%,1.0,21.0,917.0,0.112,0.157,0.209
max,1.0,25.0,1878.0,0.143,0.345,0.304


In [6]:
# The row identifier carries no predictive signal, so remove it before modelling.
data = data.drop(columns=['id'])


In [7]:
# Encode the target labels numerically: 'M' -> 0, 'B' -> 1.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form acts on an intermediate object,
# is deprecated by pandas, and leaves `data` unchanged under Copy-on-Write.
data['diagnosis_result'] = data['diagnosis_result'].replace({'M': 0, 'B': 1})


In [8]:
# Pairwise Pearson correlation between every remaining column (target included).
corr_metrics = data.corr(method='pearson')
# Render the matrix as a colour-graded table; kept as the last expression so it displays.
corr_metrics.style.background_gradient()

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
diagnosis_result,1.0,0.176967,-0.070735,-0.607498,-0.562444,-0.197616,-0.512234,-0.233028,-0.00818
radius,0.176967,1.0,0.100245,-0.238216,-0.250934,-0.127121,-0.19149,-0.039707,-0.02912
texture,-0.070735,0.100245,1.0,-0.113453,-0.113725,0.102321,0.032446,0.077912,0.139157
perimeter,-0.607498,-0.238216,-0.113453,1.0,0.976648,0.269442,0.527542,0.195539,-0.195434
area,-0.562444,-0.250934,-0.113725,0.976648,1.0,0.208438,0.42495,0.110435,-0.274344
smoothness,-0.197616,-0.127121,0.102321,0.269442,0.208438,1.0,0.465723,0.424203,0.36958
compactness,-0.512234,-0.19149,0.032446,0.527542,0.42495,0.465723,1.0,0.681123,0.647953
symmetry,-0.233028,-0.039707,0.077912,0.195539,0.110435,0.424203,0.681123,1.0,0.568608
fractal_dimension,-0.00818,-0.02912,0.139157,-0.195434,-0.274344,0.36958,0.647953,0.568608,1.0


In [9]:
# Remove the three feature columns that are excluded from the model.
data = data.drop(
    columns=['fractal_dimension', 'texture', 'perimeter'],
)


In [10]:
# Labels: the encoded diagnosis column.
y = data['diagnosis_result']
# Features: every remaining column except the label.
X = data.drop(columns=['diagnosis_result'])

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [12]:
# NOTE: this import duplicates the one in the notebook's import cell; it is kept so the
# cell still runs on its own.
from sklearn.ensemble import RandomForestClassifier

# Train a 50-tree random forest on the training split.
# random_state is fixed (same seed as the train/test split) so that the fitted model,
# its predictions and the metrics reported below are reproducible across re-runs;
# without it every run bootstraps and splits differently.
forest = RandomForestClassifier(n_estimators=50, random_state=10)
forest.fit(X_train, y_train)

# Predictions on the held-out rows, reused by the evaluation cells.
pred_forest = forest.predict(X_test)

In [15]:
# Per-class precision / recall / F1 text report for the forest's test predictions.
class_rep_forest = classification_report(
    y_true=y_test,
    y_pred=pred_forest,
)

In [16]:
# print() is used (rather than a bare expression) so the report's line breaks render.
print(
    "Forest Classifier: \n",
    class_rep_forest,
)

Forest Classifier: 
               precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.91      0.91      0.91        11

    accuracy                           0.90        20
   macro avg       0.90      0.90      0.90        20
weighted avg       0.90      0.90      0.90        20



In [17]:
# Show the raw predictions for the held-out rows.
# The previous argument was an empty list, which is not a valid 2-D sample array and
# makes predict() raise; the recorded output holds 20 predictions, matching the 20-row
# test split, so the intended input is X_test.
forest.predict(X_test)

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1],
      dtype=int64)

In [19]:
# Overall fraction of test rows the forest classified correctly.
score = accuracy_score(y_true=y_test, y_pred=pred_forest)

In [23]:
# Display the feature matrix (every column of `data` except diagnosis_result).
X

Unnamed: 0,radius,area,smoothness,compactness,symmetry
0,23,954,0.143,0.278,0.242
1,9,1326,0.143,0.079,0.181
2,21,1203,0.125,0.160,0.207
3,14,386,0.070,0.284,0.260
4,9,1297,0.141,0.133,0.181
...,...,...,...,...,...
95,23,1264,0.091,0.131,0.210
96,22,451,0.105,0.071,0.190
97,19,295,0.102,0.053,0.135
98,21,413,0.090,0.075,0.162


In [25]:
# Display the held-out feature rows produced by train_test_split.
X_test

Unnamed: 0,radius,area,smoothness,compactness,symmetry
19,17,566,0.098,0.081,0.189
14,12,578,0.113,0.229,0.207
43,15,545,0.104,0.144,0.197
37,21,524,0.09,0.038,0.147
66,12,269,0.104,0.078,0.172
3,14,386,0.07,0.284,0.26
79,22,506,0.099,0.095,0.172
41,19,371,0.123,0.122,0.19
38,11,699,0.094,0.051,0.157
68,16,251,0.107,0.141,0.211


In [28]:
# Predict the class of one hand-entered sample (the values match row 0 of X as shown above).
# The forest was fitted on a DataFrame, so the sample is wrapped in a frame that carries
# the same column names in the same order; passing a bare nested list drops the feature
# names, triggers a scikit-learn warning and relies on positional order alone.
sample = pd.DataFrame(
    [[23, 954, 0.143, 0.278, 0.242]],
    columns=X.columns,
)
print(forest.predict(sample))

[0]


In [31]:

# Persist the fitted forest to classifier.pkl.
# The context manager closes the file handle even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(forest, pickle_out)