# Find all transformation GWAS files

## Build list of all images

In [1]:
files <- list.files("/media/",
                    recursive = TRUE,
                    full.names = TRUE)

In [2]:
head(files)

In [3]:
length(files)

## Filter, parse down to transformation GWAS directories

Filter to transformation GWAS data only

In [4]:
files <- files[grepl("GWAS_Transformation", files)]

In [5]:
files <- files[!grepl("desktop.ini", files)]
files <- files[!grepl("xlsx", files)]
files <- files[!grepl("roadband", files)]
files <- files[!grepl("rgb_test", files)]
files <- files[!grepl(".csv", files)]
files <- files[!grepl("duplicate", files)]
files <- files[!grepl("Not right", files)]
files <- files[!grepl("wrong", files)]
files <- files[!grepl("screenshots", files)]
files <- files[!grepl("subset", files)]
files <- files[!grepl("Grids", files)]
files <- files[!grepl("GRIDS", files)]

In [6]:
length(files)

Parse out directory names

In [7]:
dirs <- dirname(files)

In [8]:
dirs <- unique(dirs)

In [9]:
head(dirs)

## Produce `data.frame` with directory paths and metadata for each

Desired columns: Phase, Timepoint, Path

In [10]:
library(stringr)

In [11]:
Phase_Timepoint <- str_split_fixed(dirs, "GWAS_Transformation/", 2)[,2]

In [12]:
Phase <- str_split_fixed(Phase_Timepoint, "/", 2)[,1] 
Timepoint <- str_split_fixed(Phase_Timepoint, "/", 2)[,2] 

In [13]:
df <- data.frame(dirs = dirs,
                 phase = Phase,
                 timepoint = Timepoint)

In [14]:
head(df)

dirs,phase,timepoint
/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTCC/wk3,GTCC,wk3
/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTDD/wk3,GTDD,wk3
/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTEE/wk3,GTEE,wk3
/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTX/wk10,GTX,wk10
/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTX/wk7,GTX,wk7
/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTY/wk10,GTY,wk10


In [15]:
df <- df[order(df$phase, df$timepoint), ]

In [16]:
head(df)

Unnamed: 0,dirs,phase,timepoint
41,/media//gmobot/Elements_10/GWAS_Transformation_samples_for_annotation,,
42,/media//gmobot/Elements_10,,
11,/media//gmobot/easystore/GWAS_Transformation/GTAA/wk10,GTAA,wk10
12,/media//gmobot/easystore/GWAS_Transformation/GTAA/wk3,GTAA,wk3
13,/media//gmobot/easystore/GWAS_Transformation/GTAA/wk7,GTAA,wk7
138,/media//gmobot/Expansio_15/Expansion_15/GWAS_Transformation/GTAAA/wk10,GTAAA,wk10


Clean up timepoint from folder names

In [17]:
df$timepoint <- gsub("/Fluorescent", "", df$timepoint)
df$timepoint <- gsub("Fluorescent/", "", df$timepoint)

In [18]:
timepoints <- levels(factor(df$timepoint))

In [19]:
timepoints

In [20]:
df <- df[!grepl("GX", df$phase), ]

In [21]:
phases <- levels(factor(Phase))

Need to clean up cases where there are files not saved in any timepoint folder

In [22]:
df[which(df$timepoint == ""), ]

Unnamed: 0,dirs,phase,timepoint
41,/media//gmobot/Elements_10/GWAS_Transformation_samples_for_annotation,,
42,/media//gmobot/Elements_10,,
8,/media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTZ,GTZ,


Need to identify cases where a timepoint is missing for any given phase

In [23]:
df[which(df$timepoint == "wk4"), ]

Unnamed: 0,dirs,phase,timepoint
76,/media//gmobot/Elements_13/Elements_13/GWAS_Transformation/GTJJ/wk4,GTJJ,wk4


In [24]:
df[which(df$timepoint != "wk3" &
         df$timepoint != "wk7" &
         df$timepoint != "wk10"), ]

Unnamed: 0,dirs,phase,timepoint
41,/media//gmobot/Elements_10/GWAS_Transformation_samples_for_annotation,,
42,/media//gmobot/Elements_10,,
129,/media//gmobot/Elements/GWAS_Transformation/GTCC/wk13,GTCC,wk13
144,/media//gmobot/Expansio_15/Expansion_15/GWAS_Transformation/GTCCC/wk9,GTCCC,wk9
130,/media//gmobot/Elements/GWAS_Transformation/GTDD/wk12,GTDD,wk12
146,/media//gmobot/Expansio_15/Expansion_15/GWAS_Transformation/GTDDD/wk8,GTDDD,wk8
131,/media//gmobot/Elements/GWAS_Transformation/GTEE/wk13,GTEE,wk13
134,/media//gmobot/Elements/GWAS_Transformation/GTGG/wk8,GTGG,wk8
136,/media//gmobot/Elements/GWAS_Transformation/GTII/wk5,GTII,wk5
76,/media//gmobot/Elements_13/Elements_13/GWAS_Transformation/GTJJ/wk4,GTJJ,wk4


In [25]:
timepoints <- c("wk3", "wk7", "wk10")

In [26]:
for(phase in phases){
    for(timepoint in timepoints){
        this_phase_timepoint <- df[which(df$phase == phase &
                                         df$timepoint == timepoint), ]
        if(nrow(this_phase_timepoint) == 0){
            to_append <- data.frame(dirs = c("Missing"),
                                    phase = c(phase),
                                    timepoint = c(timepoint))
            df <- rbind(df, to_append)
            }
        }
    }

In [27]:
df$phase_timepoint <- paste0(df$phase, "_", df$timepoint)

In [28]:
df$is_duplicated <- duplicated(df$phase_timepoint)

In [29]:
df <- df[order(df$phase, df$timepoint, df$dirs), ]

In [30]:
library(data.table)

In [31]:
fwrite(df, "paths_phases_timepoint_to_inspect_a3.csv")

## Search for missing data

In [32]:
head(df)

Unnamed: 0,dirs,phase,timepoint,phase_timepoint,is_duplicated
42,/media//gmobot/Elements_10,,,_,True
41,/media//gmobot/Elements_10/GWAS_Transformation_samples_for_annotation,,,_,False
189,Missing,,wk10,_wk10,False
187,Missing,,wk3,_wk3,False
188,Missing,,wk7,_wk7,False
11,/media//gmobot/easystore/GWAS_Transformation/GTAA/wk10,GTAA,wk10,GTAA_wk10,False


In [33]:
missing <- df[which(df$dirs == "Missing"), ]

In [34]:
missing_strings_to_search_for <- paste0(missing$phase, "/", missing$timepoint)

In [35]:
missing_strings_to_search_for

In [36]:
all_files <- files <- list.files("/media/",
                    recursive = TRUE,
                    full.names = TRUE)

In [37]:
all_dirs <- unique(dirname(all_files))

We will employ the method found here to search for strings in a vectors that contain any subtring in another vector

In [38]:
pattern = paste(missing_strings_to_search_for, collapse = "|")

In [39]:
dt_results <- data.table(all_dirs, result=grepl(pattern, all_dirs))

In [40]:
print(dt_results[which(result == TRUE), ])

                                                                                all_dirs
  1:             /media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTCC/wk3
  2:             /media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTDD/wk3
  3:             /media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTEE/wk3
  4:             /media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTX/wk10
  5:              /media//gmobot/Easystore_longerm_storage_2/GWAS_Transformation/GTX/wk7
 ---                                                                                    
414:                   /media//gmobot/Seagate Portable Drive/GWAS_Transformation/GTS/wk3
415:                   /media//gmobot/Seagate Portable Drive/GWAS_Transformation/GTS/wk7
416:                   /media//gmobot/Seagate Portable Drive/GWAS_Transformation/GTT/wk3
417: /media//gmobot/Seagate Portable Drive/GWAS_Transformation/GTT/wk3/GTT3_wrong_prefix
418:                 