# Multiple Logistic Regression

## Library

In [None]:
#import the relevant libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
%matplotlib inline

## Import and Clean Data

In [None]:
df = pd.read_csv('./cancer_data.csv')  # read the dataset

# Check for null entries.
# `.isnull().sum().sum()` counts every missing cell; the earlier
# `.isnull().values.any().sum()` collapsed to a single boolean first and could
# therefore only ever report 0 or 1.
print("Number of null values in the data set are - ", df.isnull().sum().sum())

# Encode the categorical columns as integers.
# NOTE: OS_STATUS is mapped to 1 = living, 0 = deceased, i.e. the opposite of
# the numeric prefix carried by the raw labels.
df = df.replace({
    'OS_STATUS': {'1:DECEASED': 0, '0:LIVING': 1},
    'SEX': {'Female': 1, 'Male': 0},
})

df.info()
# NOTE: not rendered here -- only the last expression of a cell is displayed.
df.head()

# Select specific cancer types (patient counts per subset noted on the right).
df_lung = df.loc[df['CANCER_TYPE'] == 'Non-Small Cell Lung Cancer']  # 350 patients
df_skin = df.loc[df['CANCER_TYPE'] == 'Melanoma']  # 320
df_blad = df.loc[df['CANCER_TYPE'] == 'Bladder Cancer']  # 215
df_rcc = df.loc[df['CANCER_TYPE'] == 'Renal Cell Carcinoma']  # 151

Number of null values in the data set are -  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660 entries, 0 to 1659
Columns: 474 entries, PATIENT_ID to RRAS
dtypes: float64(1), int64(471), object(2)
memory usage: 6.0+ MB


## Logistic regression: TMB

In [None]:
def tmb_only_regression(df, random_state=42):
    """Fit and evaluate a logistic regression predicting OS_STATUS from TMB alone.

    Splits `df` 80/20 into train and test sets, fits a `LogisticRegression`
    on the single `TMB_NONSYNONYMOUS` feature, then prints the test-set
    confusion matrix and the accuracy derived from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `OS_STATUS` (expected to be encoded as 0/1)
        and `TMB_NONSYNONYMOUS`.
    random_state : int or None, default 42
        Seed forwarded to `train_test_split` so repeated runs use the same
        split. Pass `None` to get a different random split on every call.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    y = df.OS_STATUS  # dependent variable: overall-survival status
    x = df.TMB_NONSYNONYMOUS

    # Seeded 80/20 split so the reported numbers are reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )
    # The estimator needs a 2-D feature matrix; a single column becomes (n, 1).
    x_train = x_train.values.reshape(-1, 1)
    x_test = x_test.values.reshape(-1, 1)

    # Implementing Logistic Regression using sklearn
    modelLogistic = LogisticRegression(max_iter=1000)
    modelLogistic.fit(x_train, y_train)
    y_pred = modelLogistic.predict(x_test)

    # Pin the label order so the matrix is always 2x2. Without `labels`, a test
    # split containing a single class yields a 1x1 matrix and the [1, 1]
    # lookup below raises IndexError.
    ConfusionMatrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(ConfusionMatrix)

    # Accuracy from confusion matrix
    TP = ConfusionMatrix[1, 1]  # True positive
    TN = ConfusionMatrix[0, 0]  # True negative
    Total = len(y_test)
    print("Accuracy from confusion matrix is ", (TN+TP)/Total)

In [None]:
# Run the TMB-only model on the full data set and on each cancer-specific
# subset, printing a header first so each matrix can be matched to its cohort.
for cohort_name, cohort in [
    ("All cancer types", df),
    ("Non-Small Cell Lung Cancer", df_lung),
    ("Melanoma", df_skin),
    ("Bladder Cancer", df_blad),
    ("Renal Cell Carcinoma", df_rcc),
]:
    print(f"--- {cohort_name} ---")
    tmb_only_regression(cohort)

[[135  23]
 [125  49]]
Accuracy from confusion matrix is  0.5542168674698795
[[44  4]
 [20  2]]
Accuracy from confusion matrix is  0.6571428571428571
[[ 0 25]
 [ 0 39]]
Accuracy from confusion matrix is  0.609375
[[ 0 21]
 [ 0 22]]
Accuracy from confusion matrix is  0.5116279069767442
[[ 0 13]
 [ 0 18]]
Accuracy from confusion matrix is  0.5806451612903226


# Feature selection for multivariate prediction
Genes that are clinically enriched for each cancer type (the selection itself was done in R):

Lung: "ZFHX3"  "EPHA7"  "NTRK3"  "EPHA5"  "NF2"    "ABL1"   "MAX"    "FLT3"   "PIK3C3" "PGR" "MRE11A" "EPHA3"  "RET"    "INHBA"  "MET"    "NOTCH1"

Skin: "TET1"   "B2M"    "BTK"    "TERT"   "DNMT3A" "ROS1"   "STAG2"  "ATR"    "PTPRD"  "CARD11" "KMT2D"  "PREX2"  "FAM46C" "NCOA3"  "EPHA7"  "RICTOR" "ATRX"   "GNAQ"   "IGF1R"  "AKT1"  "BIRC3"  "CASP8"  "INSR"   "RPTOR" 

Blad: "RNF43"  "ATM"    "NCOR1"  "PALB2"  "TERT"   "PIK3CA" "TP53"   "ERBB3"  "FGFR2"  "CREBBP"

RCC: "VHL"

In [None]:
# Covariates shared by every cancer-specific model; each list below is these
# three columns followed by the genes selected for that cancer type.
_base_covariates = ["SEX", "AGE_AT_SEQ_REPORT", "TMB_NONSYNONYMOUS"]

va_lung = _base_covariates + [
    "ZFHX3", "EPHA7", "NTRK3", "EPHA5", "NF2", "ABL1", "MAX", "FLT3",
    "PIK3C3", "PGR", "MRE11A", "EPHA3", "RET", "INHBA", "MET", "NOTCH1",
]
va_skin = _base_covariates + [
    "TET1", "B2M", "BTK", "TERT", "DNMT3A", "ROS1", "STAG2", "ATR",
    "PTPRD", "CARD11", "KMT2D", "PREX2", "FAM46C", "NCOA3", "EPHA7", "RICTOR",
    "ATRX", "GNAQ", "IGF1R", "AKT1", "BIRC3", "CASP8", "INSR", "RPTOR",
]
va_blad = _base_covariates + [
    "RNF43", "ATM", "NCOR1", "PALB2", "TERT",
    "PIK3CA", "TP53", "ERBB3", "FGFR2", "CREBBP",
]
va_rcc = _base_covariates + ["VHL"]

# Multivariate logistic regression, cancer-specific - TMB, Sex, Age, clinically enriched genes

In [None]:
# Preview the first five rows of the encoded data set.
df.head(n=5)

Unnamed: 0,PATIENT_ID,OS_STATUS,SEX,CANCER_TYPE,AGE_AT_SEQ_REPORT,TMB_NONSYNONYMOUS,PIK3CA,BARD1,MAP3K13,NOTCH4,...,PPARG,MSI2,DUSP4,RRAS2,ERF,HLA-B,RECQL,SESN1,NTHL1,RRAS
0,P-0000057,0,1,Breast Cancer,41,5.545777,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,P-0000062,0,0,Esophagogastric Cancer,80,6.654932,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,P-0000063,1,0,Bladder Cancer,62,15.528174,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,P-0000071,1,0,Bladder Cancer,66,9.982398,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,P-0000082,1,0,Non-Small Cell Lung Cancer,61,13.309864,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def multivar_regression(df, var_arr, random_state=42):
    """Fit and evaluate a logistic regression predicting OS_STATUS from several columns.

    Splits `df` 80/20 into train and test sets, fits a `LogisticRegression`
    on the columns named in `var_arr`, then prints the test-set confusion
    matrix and the accuracy derived from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `OS_STATUS` (expected to be encoded as 0/1) and every
        column listed in `var_arr`.
    var_arr : list of str
        Names of the predictor columns.
    random_state : int or None, default 42
        Seed forwarded to `train_test_split` so repeated runs use the same
        split. Pass `None` to get a different random split on every call.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    y = df.OS_STATUS  # dependent variable: overall-survival status
    x = df[var_arr]

    # Seeded 80/20 split so the reported numbers are reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # Implementing Logistic Regression using sklearn
    modelLogistic = LogisticRegression(max_iter=1000)
    modelLogistic.fit(x_train, y_train)
    y_pred = modelLogistic.predict(x_test)

    # Pin the label order so the matrix is always 2x2. Without `labels`, a test
    # split containing a single class yields a 1x1 matrix and the [1, 1]
    # lookup below raises IndexError.
    ConfusionMatrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(ConfusionMatrix)

    # Accuracy from confusion matrix
    TP = ConfusionMatrix[1, 1]  # True positive
    TN = ConfusionMatrix[0, 0]  # True negative
    Total = len(y_test)
    print("Accuracy from confusion matrix is ", (TN+TP)/Total)

In [None]:
# Run the cancer-specific multivariate model for each subset with its own
# predictor list, printing a header so each matrix can be matched to its cohort.
for cohort_name, cohort, predictors in [
    ("Non-Small Cell Lung Cancer", df_lung, va_lung),
    ("Melanoma", df_skin, va_skin),
    ("Bladder Cancer", df_blad, va_blad),
    ("Renal Cell Carcinoma", df_rcc, va_rcc),
]:
    print(f"--- {cohort_name} ---")
    multivar_regression(cohort, predictors)

[[34  5]
 [23  8]]
Accuracy from confusion matrix is  0.6
[[ 8 19]
 [ 2 35]]
Accuracy from confusion matrix is  0.671875
[[15  3]
 [ 6 19]]
Accuracy from confusion matrix is  0.7906976744186046
[[ 3 13]
 [ 1 14]]
Accuracy from confusion matrix is  0.5483870967741935


# Multivariate logistic regression, pan-cancer: all variables as predictors

In [None]:
### Multiple Logistic Regression, pan-cancer: every remaining column is a predictor

# Define the independent and dependent variables
y = df.OS_STATUS  # dependent variable: overall-survival status
# Drop the target and the two non-numeric identifier columns. 'Unnamed: 0' is
# dropped as well: it appears to be the row index that was saved into the CSV
# (it mirrors the index in the df.head() preview) and is not a real predictor.
# errors='ignore' keeps this working if that column is absent.
x = df.drop(
    columns=['OS_STATUS', 'CANCER_TYPE', 'PATIENT_ID', 'Unnamed: 0'],
    errors='ignore',
)

# Splitting the data (seeded so the reported numbers are reproducible)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(x.shape)

# Implementing Logistic Regression using sklearn
modelLogistic = LogisticRegression(max_iter=1000)
modelLogistic.fit(x_train, y_train)
y_pred = modelLogistic.predict(x_test)

# Creating confusion matrix; labels are pinned so it is always 2x2 and the
# [0, 0] / [1, 1] lookups below cannot go out of bounds.
ConfusionMatrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(ConfusionMatrix)

# Accuracy from confusion matrix
TP = ConfusionMatrix[1, 1]  # True positive
TN = ConfusionMatrix[0, 0]  # True negative
Total = len(y_test)
print("Accuracy from confusion matrix is ", (TN+TP)/Total)

(1660, 471)
[[110  64]
 [ 75  83]]
Accuracy from confusion matrix is  0.5813253012048193
