In [1]:
import pandas as pd
import numpy as np
import sklearn as sklearn

data = pd.read_csv("Breast_Cancer.csv", dtype={'Status': 'category'})

In [2]:
data.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [10]:
len(data)

4024

In [2]:
df = pd.DataFrame(data)

Linear Regression - Status = Predict using demo 

In [3]:
# OneHot Encoding to convert to float - STATUS
status_encode = 'Status'

status_encoded = pd.get_dummies(df[status_encode], prefix = status_encode)

df = pd.concat([df, status_encoded], axis=1)

df.drop('Status', axis=1, inplace=True)


In [3]:
# OneHot Encoding to convert to float - MARITAL STATUS
maritalstatus_encode = 'Marital Status'

maritalstatus_encoded = pd.get_dummies(df[maritalstatus_encode], prefix = maritalstatus_encode)

df = pd.concat([df, maritalstatus_encoded], axis=1)

df.drop('Marital Status', axis=1, inplace=True)


In [4]:
# OneHot Encoding to convert to float - MARITAL STATUS
race_encode = 'Race'

race_encoded = pd.get_dummies(df[race_encode], prefix = race_encode)

df = pd.concat([df, race_encoded], axis=1)

df.drop('Race', axis=1, inplace=True)

In [19]:
df.head()

Unnamed: 0,Age,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,...,Status_Alive,Status_Dead,Race_Black,Race_Other,Race_White,Marital Status_Divorced,Marital Status_Married,Marital Status_Separated,Marital Status_Single,Marital Status_Widowed
0,68,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,...,1,0,0,0,1,0,1,0,0,0
1,50,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,...,1,0,0,0,1,0,1,0,0,0
2,58,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,...,1,0,0,0,1,1,0,0,0,0
3,58,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,...,1,0,0,0,1,0,1,0,0,0
4,47,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,...,1,0,0,0,1,0,1,0,0,0


In [6]:
df.shape

(4024, 23)

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = pd.concat([df[['Age']], maritalstatus_encoded, race_encoded], axis=1)
Y = status_encoded

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [25]:
print(df.dtypes)

Age                            int64
T Stage                       object
N Stage                       object
6th Stage                     object
differentiate                 object
Grade                         object
A Stage                       object
Tumor Size                     int64
Estrogen Status               object
Progesterone Status           object
Regional Node Examined         int64
Reginol Node Positive          int64
Survival Months                int64
Status                      category
Marital Status_Divorced        uint8
Marital Status_Married         uint8
Marital Status_Separated       uint8
Marital Status_Single          uint8
Marital Status_Widowed         uint8
Race_Black                     uint8
Race_Other                     uint8
Race_White                     uint8
dtype: object


In [8]:
model = LinearRegression()

model.fit(X_train, y_train)

LinearRegression()

In [10]:
from sklearn.metrics import r2_score

# make predictions on the testing data
y_pred = model.predict(X_test)

# calculate the R-squared score
r2 = r2_score(y_test, y_pred)
print('R-squared score:', r2)

Score: 0.01
R-squared score: 0.013331919857071772


From the r^2 score being 0.01, race, marital status, and age are not good determinants for status (Alive vs. Dead). So, predict tumor-size instead.

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = pd.concat([df[['Age']], maritalstatus_encoded, race_encoded], axis=1)
Y = df[['Tumor Size']]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [21]:
model = LinearRegression()

model.fit(X_train, y_train)

LinearRegression()

In [22]:
from sklearn.metrics import r2_score

# make predictions on the testing data
y_pred = model.predict(X_test)

# calculate the R-squared score
r2 = r2_score(y_test, y_pred)
print('R-squared score:', r2)

R-squared score: -0.005002165103690448


R^2 score is wayyyy above 1.0, which means we are overfitting (test size .1), but when test size is changed to .2, score becomes very close to 0. Test size .3 is just negative. So, let's just try throwing the whole dataset and seeing what happens

Validate

One Hot Encoding for Rest of the Columns

In [28]:
tstage_encode = 'T Stage'

tstage_encoded = pd.get_dummies(df[tstage_encode], prefix = tstage_encode)

df = pd.concat([df, tstage_encoded], axis=1)

df.drop('T Stage', axis=1, inplace=True)

KeyError: 'T Stage'

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, df[['Tumor Size']], test_size=0.1, random_state=100)

In [34]:
print(df.dtypes)

Age                            int64
T Stage                       object
N Stage                       object
6th Stage                     object
differentiate                 object
Grade                         object
A Stage                       object
Tumor Size                     int64
Estrogen Status               object
Progesterone Status           object
Regional Node Examined         int64
Reginol Node Positive          int64
Survival Months                int64
Status                      category
Marital Status_Divorced        uint8
Marital Status_Married         uint8
Marital Status_Separated       uint8
Marital Status_Single          uint8
Marital Status_Widowed         uint8
Race_Black                     uint8
Race_Other                     uint8
Race_White                     uint8
dtype: object


In [35]:
model = LinearRegression()

model.fit(X_train, y_train)

  X = check_array(


ValueError: Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'