Merge branch '2024.04' of github.com:maxplanck-ie/Rintro into 2024.04

maxplanck-ie · Apr 22, 2024 · 80adc3c · 80adc3c
2 parents 960153f + 1111479
commit 80adc3c
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 15 deletions.
diff --git a/qmd/01_FirstSteps.qmd b/qmd/01_FirstSteps.qmd
@@ -75,6 +75,8 @@ One way to communicate this context is with the output from sessionInfo().
 sessionInfo()
 ```
 
+**Query**: Using sessionInfo() determine which R-version you are running.
+
 ****
 
 # Assigning values to objects

diff --git a/qmd/03_GettingData.qmd b/qmd/03_GettingData.qmd
@@ -86,7 +86,7 @@ write.csv(iris, gzfile("output/iris.csv.gz"))
 - Save only the subset of flowers where Species="setosa" to a file setosa.tsv
 
 ```{r, eval=FALSE, echo=FALSE}
-write.csv(iris[iris$Species=="setosa",], file="setosa.csv", row.names=FALSE, quote=FALSE)
+write.csv(iris[iris$Species=="setosa",], file="output/setosa.csv", row.names=FALSE, quote=FALSE)
 ```
 
 
@@ -154,7 +154,6 @@ In Rstudio they can be created using `File > New File > R script`
   - How many different species are in the new data_frame? Save this number as variable `ns` (hint: two useful functions are `unique()` and `length()`)
   - write the new `data_frame d` to a comma-separated file `iris_big_sepal.csv`
   - save the whole environment in a file `analysis.RData` 
-  - delete all variables in the environment
 * Save the script and run it (source)
 * Bonus: Delete the whole environment and reload it from the image file
 

diff --git a/qmd/05_DataModels.qmd b/qmd/05_DataModels.qmd
@@ -167,17 +167,28 @@ anova(fit)
 Determine the residual standard error `sigma` for different fits of varying complexity
 
 ```{r model_comp}
-fit=lm(Petal.Width ~ Petal.Length, data=iris)
-paste(sigma(fit), deparse(formula(fit)))
-
-fit=lm(Petal.Width ~ Petal.Length + Sepal.Length, data=iris)  # function of more than one variable
-paste(sigma(fit), deparse(formula(fit)))
-
-fit=lm(Petal.Width ~ Species, data=iris)                      # function of categorical variables
-paste(sigma(fit), deparse(formula(fit)))
-
-fit=lm(Petal.Width ~ . , data=iris)                           # function of all other variable (numerical and categorical)
-paste(sigma(fit), deparse(formula(fit)))
+# A list of formulae
+formula_list = list(
+  Petal.Width ~ Petal.Length,                 # as before (single variable)
+  Petal.Width ~ Petal.Length + Sepal.Length,  # function of more than one variable
+  Petal.Width ~ Species,                      # function of categorical variables
+  Petal.Width ~ .                             # function of all other variable (numerical and categorical)
+)
+
+sig=c()
+for (f in formula_list) {
+  fit = lm(f, data=iris)
+  sig = c(sig, sigma(fit))
+  print(paste(sigma(fit), format(f)))
+}
+
+# more concise loop using lapply/sapply
+# sig = sapply(lapply(formula_list, lm, data=iris), sigma)
+
+op=par(no.readonly=TRUE) 
+par(mar=c(4,20,2,2))
+barplot(sig ~ format(formula_list), horiz=TRUE, las=2, ylab="", xlab="sigma")
+par(op)     # reset graphical parameters
 ```
 
 ... more complex models tend to have smaller residual standard error (overfitting?)
@@ -208,6 +219,6 @@ Linear models $y_i=\theta_0 + \theta_1  x_i + \epsilon_i$ make certain assumptio
 # Review
 * dependencies between variables can often be modeled
 * linear model lm(): fitting, summary and interpretation
-* correlation coefficients can be misleading
+* linear models with numerical and factorial variables
 * linear models may not be appropriate $\to$ example(anscombe)
 
diff --git a/qmd/07_DataImport.qmd b/qmd/07_DataImport.qmd
@@ -274,7 +274,7 @@ Finally, we could need to save our data as a new file for later use or sharing,
 #| echo: true
 #| eval: false
 
-write_csv(rna_wide_noNAs, file = "rna_wide.csv")
+write_csv(rna_wide_noNAs, file = "output/rna_wide.csv")
 
 ```
 

diff --git a/qmd/09_DataVisualization.qmd b/qmd/09_DataVisualization.qmd
@@ -77,6 +77,8 @@ Let's plot a MA-like plot for `GSM2545336` vs `GSM2545380`, first, we generate t
 ```{r}
 #| echo: true
 
+# log2fc:     M = log2(x/y) = log2(x) - log2(y)
+# norm_mean:  A = 1/2 ( log2(x) + log2(y) )
 ma_data = rna %>% 
           select(gene, GSM2545336, GSM2545380) %>% 
           mutate(
-Original file line number
+Diff line change
@@ Expand Up @@
     sessionInfo()
     ```
+    **Query**: Using sessionInfo() determine which R-version you are running.
     ****
     # Assigning values to objects
@@ Expand Down @@