This code collects every file ending in '.sequence_class_scores.tsv' from the Sei output folder, extracts for each row the sequence class with the most extreme score together with that score, and appends the results from all samples into a single file. Run this step in R, or with an R runtime in Google Colab.

In [None]:
##Sequence class extraction
# Install the two required packages only when they are not already available,
# so that re-running this cell does not download and reinstall them each time.
if (!requireNamespace('openxlsx', quietly = TRUE)) {
  install.packages('openxlsx')
}
if (!requireNamespace('dplyr', quietly = TRUE)) {
  install.packages('dplyr')
}

# Interactive diagnostics: current working directory and the objects that
# already exist in the workspace.
getwd()
ls()

# openxlsx provides write.xlsx(); dplyr provides bind_rows().
library(openxlsx)
library(dplyr)

# Specify the folder path
folder_path <- '/content/drive/MyDrive/BimaProject/Modig1/vcf/exome2/chromatin-profiles-hdf5/'

# Regular expression for the file-name suffix of the score files. The dots are
# escaped and the pattern is anchored at the end, so only the literal suffix
# is matched (an unescaped '.' would match any character).
score_file_suffix <- "\\.sequence_class_scores\\.tsv$"

# Index of the first column that holds a sequence class score; every column
# from here to the last one is treated as a score column.
first_score_col <- 11

# Get a list of files in the folder ending with '.sequence_class_scores.tsv'
files <- list.files(path = folder_path, pattern = score_file_suffix, full.names = TRUE)

# Fail loudly instead of silently writing an empty workbook.
if (length(files) == 0) {
  stop("No '.sequence_class_scores.tsv' files found in ", folder_path, call. = FALSE)
}

# Find the score with the largest magnitude in one row of scores.
#
# row:       the scores of one row (numeric vector or one-row data frame).
# col_names: names of the score columns, in the same order as `row`.
#
# Returns list(value = <score>, column = <name of its column>). The minimum is
# chosen only when its absolute value is strictly greater than the maximum, so
# a tie in magnitude resolves to the positive score. A row without any
# non-missing score gives NA for both elements.
find_extreme <- function(row, col_names) {
  scores <- unlist(row, use.names = FALSE)

  if (all(is.na(scores))) {
    return(list(value = NA_real_, column = NA_character_))
  }

  # which.max()/which.min() skip NA and return the first position on ties
  hi <- which.max(scores)
  lo <- which.min(scores)
  pick <- if (abs(scores[lo]) > scores[hi]) lo else hi

  list(value = scores[pick], column = col_names[pick])
}

# Initialize an empty list to store dataframes
dataframes_list <- list()

# Loop through each file
for (file_path in files) {
  # Extract sample name from the file name
  sample_name <- sub(score_file_suffix, "", basename(file_path))

  # Read the TSV file. read.table() runs header names through make.names()
  # (check.names = TRUE by default), so headers containing spaces or
  # punctuation appear with dots in the reported class names.
  sei_data <- read.table(file_path, sep='\t', header=TRUE)

  if (ncol(sei_data) < first_score_col) {
    stop("Expected at least ", first_score_col, " columns in ", file_path,
         " but found ", ncol(sei_data), call. = FALSE)
  }

  # Select columns from the 11th column onwards (the score columns)
  df <- sei_data[, first_score_col:ncol(sei_data), drop = FALSE]

  # Work on a plain numeric matrix so that each row is an ordinary vector
  score_matrix <- as.matrix(df)
  if (!is.numeric(score_matrix) && !is.logical(score_matrix)) {
    stop("Non-numeric score columns found in ", file_path, call. = FALSE)
  }
  storage.mode(score_matrix) <- "double"

  # Apply the function to each row; seq_len() copes with a file without rows
  extremes <- lapply(
    seq_len(nrow(score_matrix)),
    function(i) find_extreme(score_matrix[i, ], colnames(score_matrix))
  )

  # Build a data frame with ordinary numeric/character columns. List columns
  # would be collapsed to text by openxlsx when the workbook is written.
  df2 <- data.frame(
    "Sequence class scores" = vapply(extremes, function(e) e$value, numeric(1)),
    "Sequence class" = vapply(extremes, function(e) e$column, character(1)),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Extract columns 3-8 from sei_data
  subset_df <- sei_data[, 3:8]

  # Merge subset_df with df2
  merged_df <- cbind(subset_df, df2)

  # Combine the sample name with the data (rep() keeps this valid for 0 rows)
  merged_df$Sample <- rep(sample_name, nrow(merged_df))

  # Append the dataframe to the list
  dataframes_list[[sample_name]] <- merged_df
}

# Append all dataframes into one
final_df <- bind_rows(dataframes_list)

# Specify the output file path
output_file_path <- file.path(folder_path, 'appended_sequence_class_scores_exome.xlsx')

# Save dataframe as an Excel file
write.xlsx(final_df, output_file_path)

# Sanity check: print the row count of every input file and the row count of
# every sample in final_df so that the two listings can be compared.

# Row count of each sequence class score file, read again from disk
for (tsv_path in files) {
  label <- gsub('.sequence_class_scores.tsv', '', basename(tsv_path))
  raw_table <- read.table(tsv_path, sep='\t', header=TRUE)
  cat("Number of rows in", label, ":", nrow(raw_table), "\n")
}

# Row count of each sample as it appears in final_df, in order of appearance
sample_labels <- unique(final_df$Sample)
rows_per_sample <- vapply(
  sample_labels,
  function(label) sum(final_df$Sample == label),
  integer(1)
)

for (idx in seq_along(sample_labels)) {
  cat("Final df: number of rows for", sample_labels[idx], ":", rows_per_sample[[idx]], "\n")
}



Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘Rcpp’


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)




Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




Number of rows in sorted.sample_2A_vs_2C : 255 
Number of rows in sorted.sample_2B_vs_2C : 461 
Number of rows in sorted.sample_3A_vs_3C : 302 
Number of rows in sorted.sample_3B_vs_3C : 1027 
Number of rows in sorted.sample_6B_vs_6C : 3883 
Number of rows in sorted.sample_8B_vs_8C : 462 
Number of rows in sorted.sample_9A_vs_9C : 41 
Number of rows in sorted.sample_9B_vs_9C : 362 
Final df: number of rows for sorted.sample_2A_vs_2C : 255 
Final df: number of rows for sorted.sample_2B_vs_2C : 461 
Final df: number of rows for sorted.sample_3A_vs_3C : 302 
Final df: number of rows for sorted.sample_3B_vs_3C : 1027 
Final df: number of rows for sorted.sample_6B_vs_6C : 3883 
Final df: number of rows for sorted.sample_8B_vs_8C : 462 
Final df: number of rows for sorted.sample_9A_vs_9C : 41 
Final df: number of rows for sorted.sample_9B_vs_9C : 362 


This script merges the appended file from the previous step with your own dataset; adjust the columns used for matching to fit your data. The generated output should contain the functional annotation together with the sequence class and the sequence class score. Run this step in Python (it uses pandas).

In [None]:
########### Updated: Annotation to sei_seq_class_scores (R) 03/06/2024

import pandas as pd

# Text that precedes the sample identifier in the 'Sample' column of the
# appended Sei workbook.
SAMPLE_PREFIX = 'sorted.sample_'

# Columns that identify one variant in each table, listed in matching order.
SEI_KEYS = ['chrom', 'pos', 'ref', 'alt', 'Sample']
ANNOTATION_KEYS = ['Chr', 'Start', 'REF allele', 'ALT allele', 'Sample']


def strip_sample_prefix(samples, prefix=SAMPLE_PREFIX):
    """Remove the literal text `prefix` from every value of a string Series.

    `regex=False` is passed explicitly: the default of `Series.str.replace`
    differs between pandas versions, and when treated as a regular expression
    the '.' in the prefix would match any character.

    Parameters
    ----------
    samples : pandas.Series of str
    prefix : str, text to remove (default: SAMPLE_PREFIX)

    Returns
    -------
    pandas.Series with the text removed.
    """
    return samples.str.replace(prefix, '', regex=False)


def merge_sei_with_annotations(sei_data, file2_data):
    """Inner-join the Sei sequence class table with the annotation table.

    Rows are matched on chromosome, position, reference allele, alternate
    allele and sample. The key columns of the annotation table are dropped
    after the merge because they duplicate the Sei key columns.

    Parameters
    ----------
    sei_data : pandas.DataFrame
        Must contain 'chrom', 'pos', 'ref', 'alt', 'Sequence class scores',
        'Sequence class' and 'Sample'.
    file2_data : pandas.DataFrame
        Must contain 'Variant', 'Chr', 'Start', 'REF allele', 'ALT allele',
        'Gene', 'GeneID', 'Type', 'BioType', 'Freq ALT tumor' and 'Sample'.

    Returns
    -------
    pandas.DataFrame with one row per matched pair of rows.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Extract relevant columns for merging from both files
    sei_data_subset = sei_data[['chrom', 'pos', 'ref', 'alt',
                                'Sequence class scores', 'Sequence class', 'Sample']]
    file2_data_subset = file2_data[['Variant', 'Chr', 'Start', 'REF allele', 'ALT allele',
                                    'Gene', 'GeneID', 'Type', 'BioType',
                                    'Freq ALT tumor', 'Sample']]

    # Merge dataframes based on matching columns
    merged = pd.merge(sei_data_subset, file2_data_subset, how='inner',
                      left_on=SEI_KEYS, right_on=ANNOTATION_KEYS)

    # Drop duplicate columns from the merge ('Sample' is shared, so it is kept)
    return merged.drop(['Chr', 'Start', 'REF allele', 'ALT allele'], axis=1)


# The guard keeps the file access out of the way when the helpers above are
# imported; inside a notebook cell __name__ is "__main__", so the cell runs.
if __name__ == "__main__":
    # Read the data from the excel files
    sei_data = pd.read_excel('/content/drive/MyDrive/BimaProject/Modig1/vcf/exome2/chromatin-profiles-hdf5/appended_sequence_class_scores_exome.xlsx')
    file2_data = pd.read_excel('/content/drive/MyDrive/BimaProject/Modig1/Merged_data_exome_filtered.xlsx')

    # Remove 'sorted.sample_' prefix from the 'Sample' column in sei_data
    sei_data['Sample'] = strip_sample_prefix(sei_data['Sample'])

    # Count of rows per sample after modifying sample names
    count_after_modification = sei_data.groupby('Sample').size().reset_index(name='Count_after_modification')

    # Print the count after modifying sample names
    print("\nCount of rows per sample after modifying sample names:")
    print(count_after_modification)

    merged_data = merge_sei_with_annotations(sei_data, file2_data)

    # An inner merge drops rows without a partner and repeats rows whose key
    # occurs more than once on the other side; report when either happened.
    rows_in, rows_out = len(sei_data), len(merged_data)
    if rows_out != rows_in:
        print(f"WARNING: {rows_in} Sei rows were read but the merge produced {rows_out} rows "
              "(unmatched variants are dropped; duplicated annotation keys repeat rows).")

    # Count of rows per sample in the merged table (the historical name
    # 'Count_before' is kept so the printed table keeps its column header)
    count_before = merged_data.groupby('Sample').size().reset_index(name='Count_before')

    # Save the result to a new excel file
    merged_data.to_excel('/content/drive/MyDrive/BimaProject/Modig1/latest2_annotated_exome_endoseq.xlsx', index=False)

    # Display the per-sample counts of the merged table
    print("Count of rows per sample after merging files values:")
    print(count_before)



Count of rows per sample after modifying sample names:
     Sample  Count_after_modification
0  2A_vs_2C                       255
1  2B_vs_2C                       461
2  3A_vs_3C                       302
3  3B_vs_3C                      1027
4  6B_vs_6C                      3883
5  8B_vs_8C                       462
6  9A_vs_9C                        41
7  9B_vs_9C                       362
Count of rows per sample after merging files values:
     Sample  Count_before
0  2A_vs_2C           255
1  2B_vs_2C           461
2  3A_vs_3C           302
3  3B_vs_3C          1027
4  6B_vs_6C          3883
5  8B_vs_8C           462
6  9A_vs_9C            41
7  9B_vs_9C           362
