In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [100]:
df=pd.read_csv("/content/Breast_Cancer.csv")

In [101]:
df.shape

(4024, 16)

In [102]:
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [103]:
df.isnull().sum()

Unnamed: 0,0
Age,0
Race,0
Marital Status,0
T Stage,0
N Stage,0
6th Stage,0
differentiate,0
Grade,0
A Stage,0
Tumor Size,0


In [104]:
df.nunique()

Unnamed: 0,0
Age,40
Race,3
Marital Status,5
T Stage,4
N Stage,3
6th Stage,5
differentiate,4
Grade,4
A Stage,2
Tumor Size,110


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

In [106]:
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [107]:
#doing label encoding
object_class=df.select_dtypes(include='object').columns

encoders={}
le=LabelEncoder()
for i in object_class:
  df[i]=le.fit_transform(df[i])
  encoders[i]=le

In [108]:
# outliers
def outlier_remove(df, col):
    Q1 = df[col].quantile(0.25)  # Calculate Q1 for the specific column
    Q3 = df[col].quantile(0.75)  # Calculate Q3 for the specific column
    IQR = Q3 - Q1

    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)


    median=df[col].median()


    df[col]=df[col].apply(lambda x:median if x<lower_bound or x>upper_bound else x)

    return df

In [109]:
df=outlier_remove(df,"Age")

In [110]:
df=outlier_remove(df,"Tumor Size")

In [111]:
df=outlier_remove(df,"Regional Node Examined")

In [112]:
df=outlier_remove(df,"Reginol Node Positive")

In [113]:
df=outlier_remove(df,"Survival Months")

In [114]:
#checking distribution of class data
df['Status'].value_counts()


Unnamed: 0_level_0,count
Status,Unnamed: 1_level_1
0,3408
1,616


In [115]:
x=df.drop('Status',axis=1)
y=df['Status']

In [116]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [117]:
smote=SMOTE(random_state=42)
x_new,y_new=smote.fit_resample(x_train,y_train)

In [118]:
y_new.value_counts()

Unnamed: 0_level_0,count
Status,Unnamed: 1_level_1
0,2723
1,2723


In [119]:
models={
    "decison tree":DecisionTreeClassifier(),
    "random forest":RandomForestClassifier(),

}


In [120]:
cv_scores={}

#perform 5-fold cross validation for each model
for model_name,model in models.items():
  scores=cross_val_score(model,x_new,y_new,cv=5)
  cv_scores[model_name]=scores
  print(f"{model_name} cross validation accuracy:{np.mean(scores):.2f}")
  print("-"*50 )

decison tree cross validation accuracy:0.87
--------------------------------------------------
random forest cross validation accuracy:0.92
--------------------------------------------------


In [121]:
#initialising models
decison_tree=DecisionTreeClassifier(random_state=42)
random_forest=RandomForestClassifier(random_state=42)

In [122]:
#hyperparameter grids for RandomizedSearchCV

param_grid_dt={
    "criterion":["gini","entropy"],
    "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10],
}
param_grid_rf={
    "n_estimators":[100,200,300],
    "criterion":["gini","entropy"],
     "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10],
}

In [123]:
#hyperparameter tuning for both models
#the below steps can be autyomated by using a for loop or by using a pipeine
#perform randomizedsearchcv for each model

random_search_dt=RandomizedSearchCV(estimator=decison_tree,param_distributions=param_grid_dt,n_iter=10,cv=5,scoring="accuracy",random_state=42)
random_search_dt.fit(x_new,y_new)
random_search_rf=RandomizedSearchCV(estimator=random_forest,param_distributions=param_grid_rf,n_iter=10,cv=5,scoring="accuracy",random_state=42)
random_search_rf.fit(x_new,y_new)

In [124]:
print(random_search_dt.best_params_)
print(random_search_rf.best_params_)

{'min_samples_split': 2, 'max_depth': 20, 'criterion': 'entropy'}
{'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 30, 'criterion': 'gini'}


In [125]:
print(random_search_dt.best_score_)
print(random_search_rf.best_score_)

0.874054136022443
0.9256525218827137


In [126]:
#get the model with best score

best_model=None
best_score=0

if random_search_dt.best_score_>best_score:
  best_score=random_search_dt.best_score_
  best_model=random_search_dt.best_estimator_

if random_search_rf.best_score_>best_score:
  best_score=random_search_rf.best_score_
  best_model=random_search_rf.best_estimator_

In [127]:
print(best_model)
print(best_score)

RandomForestClassifier(max_depth=30, random_state=42)
0.9256525218827137


In [128]:
#save the best model
with open("breastcancer_model.sav","wb") as f:
  pickle.dump(best_model,f)

In [133]:
random_forest.fit(x_new,y_new)

In [134]:
#evaluate on test data
y_test_pred=best_model.predict(x_test)
print("accuracy score\n",accuracy_score(y_test,y_test_pred))
print("confusion matrix\n",confusion_matrix(y_test,y_test_pred))
print("classification report\n",classification_report(y_test,y_test_pred))

accuracy score
 0.8981366459627329
confusion matrix
 [[655  30]
 [ 52  68]]
classification report
               precision    recall  f1-score   support

           0       0.93      0.96      0.94       685
           1       0.69      0.57      0.62       120

    accuracy                           0.90       805
   macro avg       0.81      0.76      0.78       805
weighted avg       0.89      0.90      0.89       805



In [142]:
#building predictive model
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2,4,90)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = random_forest.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person has a chance of survival from breast cancer')
else:
  print('The Person does not have a chance of survival from breast cancer')

[0]
The Person has a chance of survival from breast cancer


