In [112]:
# Run but do not modify this code
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge


## Data Munging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer

## Measurements
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

if ('0.11' in sns.__version__) or ('0.12' in sns.__version__):
    sns.set_theme()
else:
    sns.set()

In [113]:
import pandas as pd
sns.set_theme()
tumor_df = pd.read_csv("tumor_data.csv")
print(tumor_df.shape)
tumor_df.head()

(323, 18)


Unnamed: 0,DeID,Ethnicity,Biomarker,TumorType,ER,Her2,Grade,GradeT,GradeN,GradeM,Tsizemm,Age,TILs,mits,Age50,grade1,TILs20,Tsize60
0,1,1,1,ductal,1,0,3,2,3,3,70.0,61,13,18,1,1,0,1
1,2,1,1,ductal,1,0,2,3,2,2,999.0,52,11,13,1,0,0,999
2,3,1,1,mucinous or mucinous fx\t,1,0,1,2,2,1,999.0,65,15,2,1,0,0,999
3,4,1,1,micropap or micropap fx\t,1,0,2,3,2,1,999.0,55,20,7,1,0,0,999
4,5,1,1,cribriform\t,1,0,1,1,2,2,90.0,74,7,13,1,0,0,1


In [114]:
# Features: Tumor type, HER2 and ER status, and grade of the tumor
# Target Variable: Tumor size
cleaned_df = tumor_df[(tumor_df != 999).all(axis=1)]
tanzania_df = cleaned_df[cleaned_df['Ethnicity'] == 1]

col_cat = 'Tsizemm'
all_cols5 = ['TumorType', 'ER', 'Her2', 'Grade', 'GradeT', 'GradeN', 'GradeM']
cleaned_df

Unnamed: 0,DeID,Ethnicity,Biomarker,TumorType,ER,Her2,Grade,GradeT,GradeN,GradeM,Tsizemm,Age,TILs,mits,Age50,grade1,TILs20,Tsize60
0,1,1,1,ductal,1,0,3,2,3,3,70.0,61,13,18,1,1,0,1
4,5,1,1,cribriform\t,1,0,1,1,2,2,90.0,74,7,13,1,0,0,1
5,6,1,3,ductal,0,0,3,3,3,3,50.0,72,33,42,1,1,1,0
6,7,1,2,ductal,0,1,3,2,3,3,120.0,51,44,20,1,1,1,1
7,8,1,2,micropap or micropap fx\t,1,1,3,3,3,3,120.0,43,16,25,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,319,3,1,ductal,1,0,2,3,2,1,10.0,71,7,1,1,0,0,0
319,320,3,1,ductal,1,0,3,3,3,2,15.0,55,16,12,1,1,0,0
320,321,3,2,ductal,1,1,3,3,3,2,4.0,81,13,0,1,1,0,0
321,322,3,1,999,1,0,2,3,2,1,11.0,44,9,0,0,0,0,0


In [121]:

#  Encode 'TumorType' column
encoder = OneHotEncoder()

# Assuming black_american_df, encoder, and other variables are defined

# Encode 'TumorType' column
encoded_ttype = encoder.fit_transform(tanzania_df[['TumorType']])

# Combine encoded_ttype with other columns
data = np.concatenate((encoded_ttype.toarray(), tanzania_df[['ER', 'Her2', 'Grade', 'GradeT', 'GradeN', 'GradeM']].values), axis=1)
target = tanzania_df['Tsizemm'].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

# Create polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_scaled)

# Fit Ridge regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust the alpha parameter for regularization
ridge_model.fit(X_poly, target)

# Make predictions
y_pred = ridge_model.predict(X_poly)

# Evaluate the model
mse = mean_squared_error(target, y_pred)
r2 = r2_score(target, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
len(tanzania_df)

Mean Squared Error: 209.61360721374498
R-squared: 0.8001460637472926


27

In [116]:
black_american_df = cleaned_df[cleaned_df['Ethnicity'] == 2]

In [122]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Assuming black_american_df, encoder, and other variables are defined

# Encode 'TumorType' column
encoded_ttype = encoder.fit_transform(black_american_df[['TumorType']])

# Combine encoded_ttype with other columns
data = np.concatenate((encoded_ttype.toarray(), black_american_df[['ER', 'Her2', 'Grade', 'GradeT', 'GradeN', 'GradeM']].values), axis=1)
target = black_american_df['Tsizemm'].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

# Create polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_scaled)

# Fit Ridge regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust the alpha parameter for regularization
ridge_model.fit(X_poly, target)

# Make predictions
y_pred = ridge_model.predict(X_poly)

# Evaluate the model
mse = mean_squared_error(target, y_pred)
r2 = r2_score(target, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
len(black_american_df)

Mean Squared Error: 139.60721104930207
R-squared: 0.5931850249953714


116

In [117]:
white_american_df = cleaned_df[cleaned_df['Ethnicity'] == 3]

In [123]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Assuming black_american_df, encoder, and other variables are defined

# Encode 'TumorType' column
encoded_ttype = encoder.fit_transform(white_american_df[['TumorType']])

# Combine encoded_ttype with other columns
data = np.concatenate((encoded_ttype.toarray(), white_american_df[['ER', 'Her2', 'Grade', 'GradeT', 'GradeN', 'GradeM']].values), axis=1)
target = white_american_df['Tsizemm'].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data)

# Create polynomial features
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_scaled)

# Fit Ridge regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust the alpha parameter for regularization
ridge_model.fit(X_poly, target)

# Make predictions
y_pred = ridge_model.predict(X_poly)

# Evaluate the model
mse = mean_squared_error(target, y_pred)
r2 = r2_score(target, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
len(white_american_df)

Mean Squared Error: 127.09263901536853
R-squared: 0.623044154936


113