In [1]:
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Load the tab-separated clinical table; the first column supplies the row
# names. na.strings replaces read.table's default ("NA"), so the only field
# value read as missing is the literal 999.
data <- read.table(
    "data/clinical_information.txt",
    header = TRUE,       # spelled out: `T` is an ordinary, reassignable variable
    row.names = 1,
    sep = "\t",
    na.strings = "999"   # na.strings is documented as a character vector
)
dim(data)
colnames(data)

In [3]:
# Recode the raw columns into the category labels used by the contingency
# tables further down. Each column is overwritten in place; a value that
# matches neither branch of its rule becomes NA.
data <- local({
    # Two-way split of a numeric column at `cut`: "<= cut" versus "> cut".
    split_at <- function(x, cut) {
        case_when(
            x <= cut ~ paste("<=", cut),
            x > cut  ~ paste(">", cut))
    }
    # Keep the one-letter code where present; an empty string becomes "None".
    code_or_none <- function(x, code) {
        case_when(
            x == code ~ code,
            x == ""   ~ "None")
    }
    mutate(
        data,
        age_at_op = split_at(age_at_op, 45),
        bmi = split_at(bmi, 26),
        age_at_period_begin = split_at(age_at_period_begin, 15),
        comments_A = code_or_none(comments_A, "A"),
        # symp_T is tested as a logical and its missing values mean "None";
        # presumably read.table parsed the bare letter T as TRUE and blank
        # fields as NA -- TODO confirm against the raw file.
        symp_T = case_when(
            is.na(symp_T)  ~ "None",
            symp_T == TRUE ~ "T"),
        symp_R = code_or_none(symp_R, "R"),
        symp_S = code_or_none(symp_S, "S"),
        symp_nipple_discharge = code_or_none(symp_nipple_discharge, "D"),
        symp_M = code_or_none(symp_M, "M"),
        # NOTE(review): codes below 3 are labelled "<= 25" and codes of 3 or
        # more "> 25"; what the codes stand for is not visible here -- confirm.
        KI67_category = case_when(
            KI67_category >= 3 ~ "> 25",
            KI67_category < 3  ~ "<= 25")
    )
})

In [4]:
# Frequency of each recorded tnm_n_total value (missing values are not
# counted by table() by default).
table(data[["tnm_n_total"]])


    N0 N0(i+)     N1     N2     N3 
  2393     29    112     13      9 

In [5]:
# Binary outcome: "case" when tnm_n_total is N1, N2 or N3, "control" for any
# other recorded value (N0 and N0(i+) in the tabulation above).
data$group <- "control"
data$group[data$tnm_n_total %in% c("N1", "N2", "N3")] <- "case"
# A row with no recorded tnm_n_total is left as NA instead of being counted
# as a control, so it drops out of every table()/fisher.test() call below.
data$group[is.na(data$tnm_n_total)] <- NA
# "case" is the first level, so it is the first row of every 2 x 2 table.
data$group <- factor(data$group, levels = c("case", "control"))
table(data$group)


   case control 
    134    2422 

In [6]:
# Three-level outcome restricted to the N1/N2/N3 rows. factor() turns every
# value that is not one of the listed levels (including missing ones) into NA,
# so no placeholder string or per-level assignment is needed.
data$group2 <- factor(data$tnm_n_total, levels = c("N1", "N2", "N3"))
table(data$group2)


 N1  N2  N3 
112  13   9 

In [7]:
##### ##### ##### ##### ##### ##### ##### ##### #####
##### ##### #####  Analysis begin!  ##### ##### #####
##### ##### ##### ##### ##### ##### ##### ##### #####

In [8]:
# age_at_op (already binned at 45 by the recoding cell): fix the level order
# so "<= 45" is the first table column, then test against case/control.
data[["age_at_op"]] <- factor(data[["age_at_op"]], levels = c("<= 45", "> 45"))
table(data$group, data$age_at_op)
fisher.test(table(data$group, data$age_at_op))

         
          <= 45 > 45
  case       52   82
  control   787 1635


	Fisher's Exact Test for Count Data

data:  table(data$group, data$age_at_op)
p-value = 0.1315
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.9024609 1.9085276
sample estimates:
odds ratio 
  1.317246 


In [9]:
# bmi (already binned at 26 by the recoding cell): fix the level order so
# "<= 26" is the first table column, then test against case/control.
data[["bmi"]] <- factor(data[["bmi"]], levels = c("<= 26", "> 26"))
table(data$group, data$bmi)
fisher.test(table(data$group, data$bmi))

         
          <= 26 > 26
  case      100   11
  control  1713  392


	Fisher's Exact Test for Count Data

data:  table(data$group, data$bmi)
p-value = 0.0222
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.100157 4.341949
sample estimates:
odds ratio 
  2.079736 


In [10]:
# BMI, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (group2 is
# NA elsewhere and table() drops NA by default). The table has three rows, so
# fisher.test() reports a p-value only, without an odds ratio.
table(data$group2, data$bmi)
fisher.test(table(data$group2, data$bmi))

    
     <= 26 > 26
  N1    86    9
  N2     8    0
  N3     6    2


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$bmi)
p-value = 0.3345
alternative hypothesis: two.sided


In [11]:
# age_at_period_begin (already binned at 15 by the recoding cell): fix the
# level order so "<= 15" is the first table column, then test against
# case/control.
data[["age_at_period_begin"]] <- factor(
    data[["age_at_period_begin"]],
    levels = c("<= 15", "> 15")
)
table(data$group, data$age_at_period_begin)
fisher.test(table(data$group, data$age_at_period_begin))

         
          <= 15 > 15
  case       78   19
  control  1217  456


	Fisher's Exact Test for Count Data

data:  table(data$group, data$age_at_period_begin)
p-value = 0.1004
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.9097615 2.7215320
sample estimates:
odds ratio 
  1.537893 


In [12]:
# comments_A ("A" or "None" after recoding): put "A" first so it is the first
# table column, then test against case/control.
data[["comments_A"]] <- factor(data[["comments_A"]], levels = c("A", "None"))
table(data$group, data$comments_A)
fisher.test(table(data$group, data$comments_A))

         
             A None
  case      29  105
  control  902 1520


	Fisher's Exact Test for Count Data

data:  table(data$group, data$comments_A)
p-value = 0.0002041
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.2949526 0.7147350
sample estimates:
odds ratio 
 0.4655459 


In [13]:
# Comments_A, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (rows with
# NA group2 are dropped by table()). Three rows, so fisher.test() reports a
# p-value only.
table(data$group2, data$comments_A)
fisher.test(table(data$group2, data$comments_A))

    
      A None
  N1 26   86
  N2  3   10
  N3  0    9


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$comments_A)
p-value = 0.3267
alternative hypothesis: two.sided


In [14]:
# P_or_NP: empty strings are treated as missing; "P" and "NP" (in that order)
# are the two categories compared against case/control.
data[["P_or_NP"]][data[["P_or_NP"]] %in% ""] <- NA
data[["P_or_NP"]] <- factor(data[["P_or_NP"]], levels = c("P", "NP"))
table(data$group, data$P_or_NP)
fisher.test(table(data$group, data$P_or_NP))

         
             P   NP
  case      74   33
  control 1152  749


	Fisher's Exact Test for Count Data

data:  table(data$group, data$P_or_NP)
p-value = 0.08348
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.9439751 2.2931097
sample estimates:
odds ratio 
  1.457702 


In [15]:
# symp_T ("T" or "None" after recoding): put "T" first so it is the first
# table column, then test against case/control.
data[["symp_T"]] <- factor(data[["symp_T"]], levels = c("T", "None"))
table(data$group, data$symp_T)
fisher.test(table(data$group, data$symp_T))

         
             T None
  case       9  125
  control   92 2330


	Fisher's Exact Test for Count Data

data:  table(data$group, data$symp_T)
p-value = 0.1058
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.7893121 3.7272022
sample estimates:
odds ratio 
  1.822838 


In [16]:
# symp_R ("R" or "None" after recoding): put "R" first so it is the first
# table column, then test against case/control.
data[["symp_R"]] <- factor(data[["symp_R"]], levels = c("R", "None"))
table(data$group, data$symp_R)
fisher.test(table(data$group, data$symp_R))

         
             R None
  case       3  131
  control   32 2390


	Fisher's Exact Test for Count Data

data:  table(data$group, data$symp_R)
p-value = 0.4269
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.3307696 5.5777035
sample estimates:
odds ratio 
   1.70994 


In [17]:
# symp_S ("S" or "None" after recoding): put "S" first so it is the first
# table column, then test against case/control.
data[["symp_S"]] <- factor(data[["symp_S"]], levels = c("S", "None"))
table(data$group, data$symp_S)
fisher.test(table(data$group, data$symp_S))

         
             S None
  case       2  132
  control   30 2392


	Fisher's Exact Test for Count Data

data:  table(data$group, data$symp_S)
p-value = 0.6833
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.1384368 4.8488599
sample estimates:
odds ratio 
  1.207982 


In [18]:
# symp_nipple_discharge ("D" or "None" after recoding): put "D" first so it
# is the first table column, then test against case/control.
data[["symp_nipple_discharge"]] <- factor(
    data[["symp_nipple_discharge"]],
    levels = c("D", "None")
)
table(data$group, data$symp_nipple_discharge)
fisher.test(table(data$group, data$symp_nipple_discharge))

         
             D None
  case      15  119
  control  184 2238


	Fisher's Exact Test for Count Data

data:  table(data$group, data$symp_nipple_discharge)
p-value = 0.1352
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.8145717 2.7001730
sample estimates:
odds ratio 
  1.532844 


In [19]:
# symp_M ("M" or "None" after recoding): put "M" first so it is the first
# table column, then test against case/control.
data[["symp_M"]] <- factor(data[["symp_M"]], levels = c("M", "None"))
table(data$group, data$symp_M)
fisher.test(table(data$group, data$symp_M))

         
             M None
  case      14  120
  control   33 2389


	Fisher's Exact Test for Count Data

data:  table(data$group, data$symp_M)
p-value = 4.677e-08
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  4.054397 16.706573
sample estimates:
odds ratio 
   8.42975 


In [20]:
# Symp_M, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (rows with
# NA group2 are dropped by table()). Three rows, so fisher.test() reports a
# p-value only.
table(data$group2, data$symp_M)
fisher.test(table(data$group2, data$symp_M))

    
       M None
  N1   8  104
  N2   3   10
  N3   3    6


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$symp_M)
p-value = 0.01584
alternative hypothesis: two.sided


In [21]:
# focality: keep the codes 1, 2 and 3 as categories (any other value becomes
# NA), then test against case/control on the 2 x 3 table.
data[["focality"]] <- factor(data[["focality"]], levels = 1:3)
table(data$group, data$focality)
fisher.test(table(data$group, data$focality))

         
             1    2    3
  case     106    7    5
  control 1831  153   60


	Fisher's Exact Test for Count Data

data:  table(data$group, data$focality)
p-value = 0.5593
alternative hypothesis: two.sided


In [22]:
# histology_grade: keep G1, G2, G3 and GX as categories in that order (any
# other value becomes NA), then test against case/control on the 2 x 4 table.
data[["histology_grade"]] <- factor(
    data[["histology_grade"]],
    levels = c("G1", "G2", "G3", "GX")
)
table(data$group, data$histology_grade)
fisher.test(table(data$group, data$histology_grade))

         
           G1  G2  G3  GX
  case     17  62  19  36
  control 183 876 529 834


	Fisher's Exact Test for Count Data

data:  table(data$group, data$histology_grade)
p-value = 0.004021
alternative hypothesis: two.sided


In [23]:
# Histology_grade, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (rows with
# NA group2 are dropped by table()). The table is 3 x 4, so fisher.test()
# reports a p-value only.
table(data$group2, data$histology_grade)
fisher.test(table(data$group2, data$histology_grade))

    
     G1 G2 G3 GX
  N1 16 55 16 25
  N2  0  2  2  9
  N3  1  5  1  2


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$histology_grade)
p-value = 0.02624
alternative hypothesis: two.sided


In [24]:
# nuclear_grade: keep G1, G2, G3 and GX as categories in that order (any
# other value becomes NA), then test against case/control on the 2 x 4 table.
data[["nuclear_grade"]] <- factor(
    data[["nuclear_grade"]],
    levels = c("G1", "G2", "G3", "GX")
)
table(data$group, data$nuclear_grade)
fisher.test(table(data$group, data$nuclear_grade))

         
           G1  G2  G3  GX
  case      7  58  36  33
  control 150 833 910 529


	Fisher's Exact Test for Count Data

data:  table(data$group, data$nuclear_grade)
p-value = 0.05488
alternative hypothesis: two.sided


In [25]:
# er_t: label code 0 as "Negative" and code 1 as "Positive" (any other value
# becomes NA), then test against case/control.
data[["er_t"]] <- factor(
    data[["er_t"]],
    levels = 0:1,
    labels = c("Negative", "Positive")
)
table(data$group, data$er_t)
fisher.test(table(data$group, data$er_t))

         
          Negative Positive
  case          45       89
  control     1077     1345


	Fisher's Exact Test for Count Data

data:  table(data$group, data$er_t)
p-value = 0.01551
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.4271637 0.9227489
sample estimates:
odds ratio 
 0.6315469 


In [26]:
# ER, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (rows with
# NA group2 are dropped by table()). Three rows, so fisher.test() reports a
# p-value only.
table(data$group2, data$er_t)
fisher.test(table(data$group2, data$er_t))

    
     Negative Positive
  N1       38       74
  N2        4        9
  N3        3        6


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$er_t)
p-value = 1
alternative hypothesis: two.sided


In [27]:
# pr_t: label code 0 as "Negative" and code 1 as "Positive" (any other value
# becomes NA), then test against case/control.
data[["pr_t"]] <- factor(
    data[["pr_t"]],
    levels = 0:1,
    labels = c("Negative", "Positive")
)
table(data$group, data$pr_t)
fisher.test(table(data$group, data$pr_t))

         
          Negative Positive
  case          59       75
  control     1310     1112


	Fisher's Exact Test for Count Data

data:  table(data$group, data$pr_t)
p-value = 0.02595
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.4620884 0.9615960
sample estimates:
odds ratio 
 0.6678797 


In [28]:
# PR, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (rows with
# NA group2 are dropped by table()). Three rows, so fisher.test() reports a
# p-value only.
table(data$group2, data$pr_t)
fisher.test(table(data$group2, data$pr_t))

    
     Negative Positive
  N1       51       61
  N2        4        9
  N3        4        5


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$pr_t)
p-value = 0.6028
alternative hypothesis: two.sided


In [29]:
# her2_t: label code 0 as "Negative" and code 1 as "Positive" (any other
# value becomes NA), then test against case/control.
data[["her2_t"]] <- factor(
    data[["her2_t"]],
    levels = 0:1,
    labels = c("Negative", "Positive")
)
table(data$group, data$her2_t)
fisher.test(table(data$group, data$her2_t))

         
          Negative Positive
  case          96       38
  control     1344     1078


	Fisher's Exact Test for Count Data

data:  table(data$group, data$her2_t)
p-value = 0.0002243
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 1.364917 3.059280
sample estimates:
odds ratio 
   2.02575 


In [30]:
# HER2, group2
# Same comparison restricted to rows whose group2 is N1, N2 or N3 (rows with
# NA group2 are dropped by table()). Three rows, so fisher.test() reports a
# p-value only.
table(data$group2, data$her2_t)
fisher.test(table(data$group2, data$her2_t))

    
     Negative Positive
  N1       83       29
  N2        8        5
  N3        5        4


	Fisher's Exact Test for Count Data

data:  table(data$group2, data$her2_t)
p-value = 0.2855
alternative hypothesis: two.sided


In [31]:
# KI67_category ("<= 25" or "> 25" after recoding): fix the level order so
# "<= 25" is the first table column, then test against case/control.
data[["KI67_category"]] <- factor(
    data[["KI67_category"]],
    levels = c("<= 25", "> 25")
)
table(data$group, data$KI67_category)
fisher.test(table(data$group, data$KI67_category))

         
          <= 25 > 25
  case       62   23
  control  1243  509


	Fisher's Exact Test for Count Data

data:  table(data$group, data$KI67_category)
p-value = 0.8066
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.6656605 1.8882105
sample estimates:
odds ratio 
   1.10379 


In [32]:
# EIC: label code 0 as "Negative" and code 1 as "Positive" (any other value
# becomes NA), then test against case/control.
data[["EIC"]] <- factor(
    data[["EIC"]],
    levels = 0:1,
    labels = c("Negative", "Positive")
)
table(data$group, data$EIC)
fisher.test(table(data$group, data$EIC))

         
          Negative Positive
  case          13       71
  control      168     1470


	Fisher's Exact Test for Count Data

data:  table(data$group, data$EIC)
p-value = 0.1422
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.7958459 2.9956604
sample estimates:
odds ratio 
  1.601588 
