In [39]:

import numpy as np 
import pandas as pd 
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
## Load dataset
# Raw string literal: the Windows backslashes are kept verbatim instead of being
# parsed as (invalid) escape sequences such as "\D" and "\B".
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the cell runs on other machines.
datasets = pd.read_csv(r"D:\DML\BRCA.csv")
datasets.head(5)

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [3]:
# Count the missing entries in every column before any cleaning is done.
datasets.isna().sum()

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64

In [4]:
# Discard every row that has at least one missing value, then confirm that no
# column has missing entries left.
datasets = datasets.dropna(axis="index", how="any")
datasets.isna().sum()

Patient_ID            0
Age                   0
Gender                0
Protein1              0
Protein2              0
Protein3              0
Protein4              0
Tumour_Stage          0
Histology             0
ER status             0
PR status             0
HER2 status           0
Surgery_type          0
Date_of_Surgery       0
Date_of_Last_Visit    0
Patient_Status        0
dtype: int64

In [5]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
datasets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float

In [6]:
# Tally how many patients carry each gender label.
gender_counts = datasets["Gender"].value_counts()
print(gender_counts)

FEMALE    313
MALE        4
Name: Gender, dtype: int64


In [7]:
## Visualization of tumour stage
# Number of patients per tumour stage: the index holds the stage labels and the
# values hold the counts.
tumourStage = datasets.Tumour_Stage.value_counts()
stage = tumourStage.index
values = tumourStage.values

# Donut chart (hole=0.5) of each stage's share of patients.
figure = px.pie(tumourStage, names=stage, values=values, hole=0.5, title="Tumour Stages of Patients")
figure.show()


In [8]:
# Histology breakdown: number of patients per histological type, drawn as a
# donut chart.
patientHistology = datasets.Histology.value_counts()
hist, values = patientHistology.index, patientHistology.values

figure = px.pie(
    patientHistology,
    names=hist,
    values=values,
    hole=0.5,
    title="Histology of Patients",
)
figure.show()

In [9]:
# ER status: how many patients fall under each label.
er_status_counts = datasets["ER status"].value_counts()
print(er_status_counts)

Positive    317
Name: ER status, dtype: int64


In [10]:
# PR status: how many patients fall under each label.
pr_status_counts = datasets["PR status"].value_counts()
print(pr_status_counts)

Positive    317
Name: PR status, dtype: int64


In [11]:
# HER2 status: how many patients fall under each label.
her2_status_counts = datasets["HER2 status"].value_counts()
print(her2_status_counts)

Negative    288
Positive     29
Name: HER2 status, dtype: int64


# Surgery Type

In [12]:
# Number of patients per surgery type: the index holds the labels and the values
# hold the counts.
surgeryType = datasets["Surgery_type"].value_counts()
# Named `surgery_labels` rather than `type` so the builtin `type()` remains
# usable in every cell that runs after this one.
surgery_labels = surgeryType.index
values = surgeryType.values

# Donut chart (hole=0.5) of each surgery type's share of patients.
figure = px.pie(surgeryType, names=surgery_labels, values=values, hole=0.5, title="Type of Surgery of Patients")
figure.show()

# Data Transformation

In [13]:
# Integer code for every label of each categorical feature column.
CATEGORY_CODES = {
    "Tumour_Stage": {"I": 1, "II": 2, "III": 3},
    "Histology": {
        "Infiltrating Ductal Carcinoma": 1,
        "Infiltrating Lobular Carcinoma": 2,
        "Mucinous Carcinoma": 3,
    },
    "ER status": {"Positive": 1},
    "PR status": {"Positive": 1},
    "HER2 status": {"Positive": 1, "Negative": 2},
    "Surgery_type": {
        "Simple Mastectomy": 1,
        "Lumpectomy": 2,
        "Modified Radical Mastectomy": 3,
        "Other": 4,
    },
    "Gender": {"FEMALE": 0, "MALE": 1},
}

for column, codes in CATEGORY_CODES.items():
    # Columns that are already numeric were encoded by a previous run of this
    # cell; mapping the integer codes again would turn every value into NaN.
    if pd.api.types.is_numeric_dtype(datasets[column]):
        continue
    encoded = datasets[column].map(codes)
    # `map` yields NaN for any label missing from the dictionary; fail here with
    # the offending labels instead of passing NaN on to the models.
    if encoded.isna().any():
        unknown = datasets.loc[encoded.isna(), column].unique().tolist()
        raise ValueError(f"Unmapped values in column {column!r}: {unknown}")
    datasets[column] = encoded

datasets.head()


Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,0,0.080353,0.42638,0.54715,0.27368,3,1,1,1,2,3,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,0,-0.42032,0.57807,0.61447,-0.031505,2,3,1,1,2,2,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,0,0.21398,1.3114,-0.32747,-0.23426,3,1,1,1,2,4,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,0,0.34509,-0.21147,-0.19304,0.12427,2,1,1,1,2,3,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,0,0.22155,1.9068,0.52045,-0.31199,2,1,1,1,2,4,06-May-17,27-Jun-19,Dead


# Prediction Model

In [44]:
# Columns fed to the classifiers, in the order the models are trained on.
# NOTE(review): "ER status" and "PR status" hold a single value for every row
# in the value counts printed earlier, so they likely add no signal; confirm
# before relying on them.
feature_columns = [
    "Age",
    "Gender",
    "Protein1",
    "Protein2",
    "Protein3",
    "Protein4",
    "Tumour_Stage",
    "Histology",
    "ER status",
    "PR status",
    "HER2 status",
    "Surgery_type",
]
x = np.array(datasets[feature_columns])
y = np.array(datasets["Patient_Status"])

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)


In [45]:
# Train a support-vector classifier with its default settings, then score it on
# the held-out rows.
model = SVC().fit(Xtrain, ytrain)
accuracy = model.score(Xtest, ytest)
print("Accuracy : ", accuracy)

Accuracy :  0.8476190476190476


In [46]:
# Logistic-regression baseline trained on the same split as the SVC.
# With the default iteration budget the lbfgs solver stopped at its limit
# without converging on these unscaled features, so the budget is raised.
# NOTE(review): standardising the features would be the more robust fix; confirm
# the warning is gone after re-running.
model2 = LogisticRegression(max_iter=1000)
model2.fit(Xtrain, ytrain)
accuracy = model2.score(Xtest, ytest)
print("Accuracy : ", accuracy)

Accuracy :  0.8476190476190476



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [55]:
# One hand-written patient record used to try out the trained SVC (`model`).
# Keys are listed in the same order as the training feature columns, and the
# categorical values use the integer codes from the Data Transformation cell.
# To predict for another patient, edit the values below (or read them with
# `input()` and convert with `float`/`int`).
sample_patient = {
    "Age": 80.0,
    "Gender": 1,  # 0: Female, 1: Male
    "Protein1": 0.420320,
    "Protein2": 0.57807,
    "Protein3": 0.61447,
    "Protein4": -0.031505,
    "Tumour_Stage": 3,  # 1: I, 2: II, 3: III
    "Histology": 3,  # 1: Infiltrating Ductal Carcinoma, 2: Infiltrating Lobular Carcinoma, 3: Mucinous Carcinoma
    "ER status": 1,  # Positive: 1
    "PR status": 1,  # Positive: 1
    # NOTE(review): the record previously passed HER2 status = 4 and
    # Surgery_type = 2. HER2 status is only ever encoded as 1 or 2, so 4 was
    # outside the trained domain; the two values look swapped and are exchanged
    # here. Confirm the intended patient record.
    "HER2 status": 2,  # Positive: 1, Negative: 2
    "Surgery_type": 4,  # 1: Simple Mastectomy, 2: Lumpectomy, 3: Modified Radical Mastectomy, 4: Other
}

# Shape (1, 12): a single row, columns in dictionary (insertion) order.
features = np.array([list(sample_patient.values())])
predictor = model.predict(features)
print("Predictor : ", predictor)

Predictor :  ['Alive']
