## Data Preprocessing

### LC

In [3]:
import pandas as pd

# Step 1: Read the annotation file
annot_file = 'LC/Human.GRCh38.p13.annot.tsv'
annot_df = pd.read_csv(annot_file, sep='\t')

# Step 2: Filter for protein-coding genes
protein_coding_df = annot_df[annot_df['GeneType'] == 'protein-coding'][['GeneID', 'Symbol']]

# Step 3: Read the second TSV file
data_file = 'LC/GSE224615_raw_counts_GRCh38.p13_NCBI.tsv'  # Replace with your actual file name
data_df = pd.read_csv(data_file, sep='\t')

# Step 4: Merge the dataframes on GeneID
merged_df = pd.merge(data_df, protein_coding_df, on='GeneID', how='inner')

# Step 5: Drop GeneID and keep Symbol
final_df = merged_df.drop(columns=['GeneID'])

# Step 6: Reorder columns to have Symbol first
cols = ['Symbol'] + [col for col in final_df.columns if col != 'Symbol']
final_df = final_df[cols]

# Step 7: Save the result to a new TSV file
final_df.to_csv('LC/GSE224615_raw_counts_GRCh38.p13_NCBI_processed.tsv', sep='\t', index=False)

# Optional: Display the first few rows
print(final_df.head())

         Symbol  HC1  HC2  HC3  HC4  HC5  LC1  HC6  HC7  LC2  ...  LC16  LC17  \
0         OR4F5    0    0    0    0    0    0    0    0    0  ...     0     0   
1  LOC112268260    0   32    9   11   14   25   61    8   16  ...    17    17   
2        OR4F29    0    0    0    0    0    0    0    0    0  ...     0     0   
3  LOC105378947    0    2    1    1    0    2   19    2    0  ...     1     0   
4        OR4F16    0   13    3    9    3    5   29    8    5  ...    10    13   

   LC18  LC19  HC12  HC13  LC20  LC21  LC22  LC23  
0     0     0     0     0     0     0     0     0  
1    11     7    26    28    21    20    28    14  
2     0     0     1     0     0     0     0     0  
3     0     0     1     4     1     0     2     2  
4     1     3    12    18    16    15     9     6  

[5 rows x 37 columns]


  annot_df = pd.read_csv(annot_file, sep='\t')


In [4]:
# Step 3: Read the second TSV file
data_file = 'LC/GSE251849_raw_counts_GRCh38.p13_NCBI.tsv'  # Replace with your actual file name
data_df = pd.read_csv(data_file, sep='\t')

# Step 4: Merge the dataframes on GeneID
merged_df = pd.merge(data_df, protein_coding_df, on='GeneID', how='inner')

# Step 5: Drop GeneID and keep Symbol
final_df = merged_df.drop(columns=['GeneID'])

# Step 6: Reorder columns to have Symbol first
cols = ['Symbol'] + [col for col in final_df.columns if col != 'Symbol']
final_df = final_df[cols]

# Step 7: Save the result to a new TSV file
final_df.to_csv('LC/GSE251849_raw_counts_GRCh38.p13_NCBI_processed.tsv', sep='\t', index=False)

# Optional: Display the first few rows
print(final_df.head())

         Symbol  HC1  HC2  HC3  HC4  HC5  HC6  HC7  LC1  LC2  LC3  LC4  LC5  \
0         OR4F5    0    0    0    0    0    0    0    0    0    0    0    0   
1  LOC112268260    2    3    3    6    0    5    3   13    7    5    3    5   
2        OR4F29    0    0    0    0    0    0    0    0    0    0    0    0   
3  LOC105378947    0    1    0    0    1    3    0    1    0    0    0    1   
4        OR4F16    1    0    1    0    0    0    1    1    1    1    1    1   

   LC6  LC7  LC8  LC9  LC10  LC11  
0    0    0    0    0     0     0  
1    9    2    8    5     6     5  
2    0    0    0    0     0     0  
3    1    1    1    1     1     0  
4    0    0    1    1     1     1  


### PsA

In [9]:
import pandas as pd
import mygene

# Step 1: Read the annotation file and filter for protein-coding genes
annot_file = 'LC/Human.GRCh38.p13.annot.tsv'
annot_df = pd.read_csv(annot_file, sep='\t')

# Debug: Check annotation file columns and unique GeneType values
print("Annotation file columns:", annot_df.columns.tolist())
print("Unique GeneType values:", annot_df['GeneType'].unique())
print("Number of protein-coding genes:", len(annot_df[annot_df['GeneType'] == 'protein-coding']))

# Filter for protein-coding genes
protein_coding_df = annot_df[annot_df['GeneType'] == 'protein-coding'][['EnsemblGeneID', 'Symbol']]
# Debug: Check for missing Symbols in annotation
print("Missing Symbols in protein-coding annotation:", protein_coding_df['Symbol'].isna().sum())

# Step 2: Read the CSV file
csv_file = 'PSA/GSE205748_read_counts.csv'
ensembl_df = pd.read_csv(csv_file, sep='\t')
ensembl_df.rename(columns={'ID': 'EnsemblGeneID'}, inplace=True)

# Debug: Check CSV columns and sample EnsemblGeneID
print("CSV columns:", ensembl_df.columns.tolist())
print("Sample EnsemblGeneID:", ensembl_df['EnsemblGeneID'].head().tolist())
print("Total genes in CSV:", len(ensembl_df))

# Step 3: Merge to map EnsemblGeneID to Symbol (use left join to keep all CSV rows)
merged_ensembl_df = pd.merge(ensembl_df, protein_coding_df, on='EnsemblGeneID', how='left')

# Debug: Check merge results
print("Rows after merge:", len(merged_ensembl_df))
print("Missing Symbols after merge:", merged_ensembl_df['Symbol'].isna().sum())
unmatched_ids = merged_ensembl_df[merged_ensembl_df['Symbol'].isna()]['EnsemblGeneID'].unique()
print("Number of unmatched EnsemblGeneID:", len(unmatched_ids))
print("Sample unmatched EnsemblGeneID:", unmatched_ids[:5].tolist())

# Step 4: Handle missing Symbols with mygene
if len(unmatched_ids) > 0:
    print(f"Querying {len(unmatched_ids)} missing Ensembl IDs with mygene...")
    mg = mygene.MyGeneInfo()
    try:
        gene_info = mg.querymany(unmatched_ids, scopes='ensembl.gene', fields='symbol', species='human', as_dataframe=True)
        if not gene_info.empty:
            gene_info = gene_info.reset_index()[['query', 'symbol']].rename(columns={'query': 'EnsemblGeneID', 'symbol': 'Symbol'})
            print("Retrieved symbols from mygene:", len(gene_info))
            # Update missing Symbols
            for _, row in gene_info.iterrows():
                if pd.notna(row['Symbol']):
                    merged_ensembl_df.loc[merged_ensembl_df['EnsemblGeneID'] == row['EnsemblGeneID'], 'Symbol'] = row['Symbol']
        else:
            print("No symbols retrieved from mygene.")
    except Exception as e:
        print(f"Mygene query failed: {e}")

# Debug: Final check for missing Symbols
print("Missing Symbols after mygene:", merged_ensembl_df['Symbol'].isna().sum())

# Step 5: Drop EnsemblGeneID and reorder columns
final_df = merged_ensembl_df.drop(columns=['EnsemblGeneID'])
cols = ['Symbol'] + [col for col in final_df.columns if col != 'Symbol']
final_df = final_df[cols]

# Step 6: Save the updated CSV
output_file = 'PSA/GSE205748_read_count_processed.csv'
final_df.to_csv(output_file, sep=',', index=False)
print(f"CSV with Symbols saved as '{output_file}'")
print(final_df.head())

# Step 7: Save unmatched IDs for inspection
if len(unmatched_ids) > 0:
    pd.DataFrame(unmatched_ids, columns=['EnsemblGeneID']).to_csv('PSA/unmatched_ensembl_ids.csv', index=False)
    print("Unmatched EnsemblGeneID saved to 'PSA/unmatched_ensembl_ids.csv'")

  annot_df = pd.read_csv(annot_file, sep='\t')
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


Annotation file columns: ['GeneID', 'Symbol', 'Description', 'Synonyms', 'GeneType', 'EnsemblGeneID', 'Status', 'ChrAcc', 'ChrStart', 'ChrStop', 'Orientation', 'Length', 'GOFunctionID', 'GOProcessID', 'GOComponentID', 'GOFunction', 'GOProcess', 'GOComponent']
Unique GeneType values: ['pseudo' 'ncRNA' 'protein-coding' nan 'snoRNA' 'snRNA' 'tRNA' 'other'
 'unknown' 'rRNA' 'scRNA']
Number of protein-coding genes: 19416
Missing Symbols in protein-coding annotation: 0
CSV columns: ['EnsemblGeneID', 'HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'HC6', 'HC7', 'HC8', 'HC9', 'PA1', 'PA2', 'PA3', 'PA4', 'PA5', 'PA6', 'PA7', 'PA8', 'PA9', 'PA10', 'PA11', 'PA12', 'PA13', 'PA14', 'PA15', 'PA16', 'PA17', 'PA18']
Sample EnsemblGeneID: ['ENSG00000282222', 'ENSG00000282221', 'ENSG00000111671', 'ENSG00000110514', 'ENSG00000086015']
Total genes in CSV: 58302
Rows after merge: 58313
Missing Symbols after merge: 39136
Number of unmatched EnsemblGeneID: 39136
Sample unmatched EnsemblGeneID: ['ENSG00000282222', 'ENSG00

30 input query terms found dup hits:	[('ENSG00000261600', 2), ('ENSG00000277927', 3), ('ENSG00000233656', 2), ('ENSG00000278932', 3), ('E
1592 input query terms found no hit:	['ENSG00000262558', 'ENSG00000262554', 'ENSG00000250567', 'ENSG00000283689', 'ENSG00000259166', 'ENS


Retrieved symbols from mygene: 39191
Missing Symbols after mygene: 14027
CSV with Symbols saved as 'PSA/GSE205748_read_count_processed.csv'
  Symbol   HC1   HC2   HC3   HC4   HC5   HC6   HC7   HC8   HC9  ...   PA9  \
0    NaN     0     0     0     0     0     0     0     0     0  ...     0   
1    NaN    17    16    10    27    31    27    23    22    21  ...    31   
2  SPSB2   157   228   245   274   253   232   327   319   262  ...   151   
3   MADD  1308  1727  1989  1017  2042  1662  2042  2075  2106  ...  1697   
4  MAST2   822  1229  1352   798  1161  1353  1801  1443  1052  ...   936   

   PA10  PA11  PA12  PA13  PA14  PA15  PA16  PA17  PA18  
0     0     0     0     0     0     0     0     0     0  
1    22    28    15    21    28    16    26    19    18  
2   167   250   205   159   344   205   203   222   138  
3  1724  2058  1700  1550  1948  2188  2136  1885  1697  
4   883   834   667   601   899   750   881   734   832  

[5 rows x 28 columns]
Unmatched EnsemblGeneID sa

In [11]:
import pandas as pd
import mygene

# Step 1: Read and deduplicate the annotation TSV
annot_file = 'LC/Human.GRCh38.p13.annot.tsv'
annot_df = pd.read_csv(annot_file, sep='\t')
# Remove duplicates in EnsemblGeneID
annot_df = annot_df.drop_duplicates(subset=['EnsemblGeneID'], keep='first')
# Filter for protein-coding genes
protein_coding_df = annot_df[annot_df['GeneType'] == 'protein-coding'][['EnsemblGeneID', 'Symbol']]
# Strip version suffixes
protein_coding_df['EnsemblGeneID'] = protein_coding_df['EnsemblGeneID'].str.split('.').str[0]
print("Number of protein-coding genes in TSV:", len(protein_coding_df))
print("Missing Symbols in TSV protein-coding:", protein_coding_df['Symbol'].isna().sum())

# Step 2: Read and deduplicate the idmap XLSX
idmap_file = 'PSA/idmap.xlsx'
idmap_df = pd.read_excel(idmap_file)
# Remove duplicates in query (EnsemblGeneID)
idmap_df = idmap_df.drop_duplicates(subset=['query'], keep='first')
# Rename query to EnsemblGeneID and select relevant columns
idmap_df = idmap_df[['query', 'symbol']].rename(columns={'query': 'EnsemblGeneID', 'symbol': 'Symbol'})
# Strip version suffixes
idmap_df['EnsemblGeneID'] = idmap_df['EnsemblGeneID'].str.split('.').str[0]
print("Number of unique EnsemblGeneID in idmap:", len(idmap_df))
print("Missing Symbols in idmap:", idmap_df['Symbol'].isna().sum())

# Step 3: Read the CSV
csv_file = 'PSA/GSE205748_read_counts.csv'
ensembl_df = pd.read_csv(csv_file, sep='\t')
ensembl_df.rename(columns={'ID': 'EnsemblGeneID'}, inplace=True)
# Strip version suffixes
ensembl_df['EnsemblGeneID'] = ensembl_df['EnsemblGeneID'].str.split('.').str[0]
print("Total genes in CSV:", len(ensembl_df))
print("Sample EnsemblGeneID:", ensembl_df['EnsemblGeneID'].head().tolist())

# Step 4: Merge with TSV annotation (protein-coding only)
merged_df = pd.merge(ensembl_df, protein_coding_df, on='EnsemblGeneID', how='inner')
print("Rows after TSV merge (protein-coding):", len(merged_df))
print("Missing Symbols after TSV merge:", merged_df['Symbol'].isna().sum())

# Step 5: Check unmatched IDs against idmap.xlsx
unmatched_ids = ensembl_df[~ensembl_df['EnsemblGeneID'].isin(protein_coding_df['EnsemblGeneID'])]['EnsemblGeneID'].unique()
print("Number of unmatched EnsemblGeneID:", len(unmatched_ids))
print("Sample unmatched EnsemblGeneID:", unmatched_ids[:5].tolist())

# Merge unmatched IDs with idmap.xlsx
unmatched_df = pd.DataFrame({'EnsemblGeneID': unmatched_ids})
idmap_merged = pd.merge(unmatched_df, idmap_df, on='EnsemblGeneID', how='left')
# Filter for protein-coding by cross-referencing with TSV GeneType
idmap_protein_coding = idmap_merged[idmap_merged['EnsemblGeneID'].isin(protein_coding_df['EnsemblGeneID'])]
print("Protein-coding genes from idmap:", len(idmap_protein_coding))

# Step 6: Combine TSV and idmap results
# Append idmap protein-coding matches to main merge (if any)
if not idmap_protein_coding.empty:
    idmap_to_add = pd.merge(ensembl_df, idmap_protein_coding, on='EnsemblGeneID', how='inner')
    idmap_to_add = idmap_to_add[ensembl_df.columns.tolist() + ['Symbol']]  # Align columns
    merged_df = pd.concat([merged_df, idmap_to_add], ignore_index=True)
    print("Rows after adding idmap protein-coding:", len(merged_df))

# Step 7: Drop EnsemblGeneID and reorder columns
final_df = merged_df.drop(columns=['EnsemblGeneID'])
cols = ['Symbol'] + [col for col in final_df.columns if col != 'Symbol']
final_df = final_df[cols]

# Step 8: Save the output
output_file = 'PSA/GSE205748_read_count_processed.csv'
final_df.to_csv(output_file, sep=',', index=False)
print(f"CSV with Symbols saved as '{output_file}'")
print(final_df.head())

  annot_df = pd.read_csv(annot_file, sep='\t')


Number of protein-coding genes in TSV: 19213
Missing Symbols in TSV protein-coding: 0
Number of unique EnsemblGeneID in idmap: 58302
Missing Symbols in idmap: 14031
Total genes in CSV: 58302
Sample EnsemblGeneID: ['ENSG00000282222', 'ENSG00000282221', 'ENSG00000111671', 'ENSG00000110514', 'ENSG00000086015']
Rows after TSV merge (protein-coding): 19152
Missing Symbols after TSV merge: 0
Number of unmatched EnsemblGeneID: 39150
Sample unmatched EnsemblGeneID: ['ENSG00000282222', 'ENSG00000282221', 'ENSG00000211769', 'ENSG00000211768', 'ENSG00000211767']
Protein-coding genes from idmap: 0
CSV with Symbols saved as 'PSA/GSE205748_read_count_processed.csv'
    Symbol   HC1   HC2   HC3   HC4   HC5   HC6   HC7   HC8   HC9  ...   PA9  \
0    SPSB2   157   228   245   274   253   232   327   319   262  ...   151   
1     MADD  1308  1727  1989  1017  2042  1662  2042  2075  2106  ...  1697   
2    MAST2   822  1229  1352   798  1161  1353  1801  1443  1052  ...   936   
3  CSNK2A2  3221  5079  

In [13]:
import pandas as pd

# Step 1: Read and clean the CSV
csv_file = 'PSA/GSE179800_SKB-counts.csv'
counts_df = pd.read_csv(csv_file)

# Drop the first (index) column
counts_df = counts_df.drop(counts_df.columns[0], axis=1)

# Check for duplicates in Gene
print("Duplicate Gene values:", counts_df['Gene'].duplicated().sum())
# Deduplicate, keeping first occurrence
counts_df = counts_df.drop_duplicates(subset=['Gene'], keep='first')
print("Total genes in CSV:", len(counts_df))
print("Sample Gene values:", counts_df['Gene'].head().tolist())

# Step 2: Read the annotation TSV and filter for protein-coding genes
annot_file = 'LC/Human.GRCh38.p13.annot.tsv'
annot_df = pd.read_csv(annot_file, sep='\t')
# Remove duplicates in Symbol
annot_df = annot_df.drop_duplicates(subset=['Symbol'], keep='first')
# Filter for protein-coding genes
protein_coding_df = annot_df[annot_df['GeneType'] == 'protein-coding'][['Symbol']]
print("Number of protein-coding genes in TSV:", len(protein_coding_df))
print("Missing Symbols in TSV protein-coding:", protein_coding_df['Symbol'].isna().sum())

# Step 3: Merge to keep only protein-coding genes
merged_df = pd.merge(counts_df, protein_coding_df, left_on='Gene', right_on='Symbol', how='inner')
print("Rows after merge (protein-coding):", len(merged_df))
print("Missing Symbols after merge:", merged_df['Symbol'].isna().sum())

# Step 4: Identify unmatched genes
unmatched_genes = counts_df[~counts_df['Gene'].isin(protein_coding_df['Symbol'])]['Gene'].unique()
print("Number of unmatched Gene values:", len(unmatched_genes))
print("Sample unmatched Gene values:", unmatched_genes[:5].tolist())

# Step 5: Prepare output (keep Gene column as Symbol)
final_df = merged_df[['Gene', 'PA1', 'PA2', 'PA3', 'PA4']]
# Rename Gene to Symbol for clarity
final_df = final_df.rename(columns={'Gene': 'Symbol'})

# Step 6: Save the output
output_file = 'GSE179800_SKB-counts_processed.csv'
final_df.to_csv(output_file, sep=',', index=False)
print(f"Processed CSV saved as '{output_file}'")
print(final_df.head())

Duplicate Gene values: 0
Total genes in CSV: 26485
Sample Gene values: ['DDX11L1', 'WASH7P', 'MIR6859-3', 'MIR6859-2', 'MIR6859-1']
Number of protein-coding genes in TSV: 19416
Missing Symbols in TSV protein-coding: 0
Rows after merge (protein-coding): 18060
Missing Symbols after merge: 0
Number of unmatched Gene values: 8425
Sample unmatched Gene values: ['DDX11L1', 'WASH7P', 'MIR6859-3', 'MIR6859-2', 'MIR6859-1']
Processed CSV saved as 'GSE179800_SKB-counts_processed.csv'
   Symbol  PA1  PA2  PA3  PA4
0   OR4F5    1    0    0    0
1  OR4F29    0    0    0    0
2   OR4F3    0    0    0    0
3  OR4F16    0    0    0    0
4  SAMD11    0    0    0    0
Matched genes saved to 'matched_genes.csv'
Unmatched genes saved to 'unmatched_genes.csv'


  annot_df = pd.read_csv(annot_file, sep='\t')


In [2]:
import pandas as pd

# Step 1: Read the annotation TSV and filter for protein-coding genes
annot_file = 'LC/Human.GRCh38.p13.annot.tsv'
annot_df = pd.read_csv(annot_file, sep='\t')
# Remove duplicates in Symbol
annot_df = annot_df.drop_duplicates(subset=['Symbol'], keep='first')
protein_coding_df = annot_df[annot_df['GeneType'] == 'protein-coding'][['Symbol']]
print("Number of protein-coding genes in TSV:", len(protein_coding_df))
print("Missing Symbols in TSV protein-coding:", protein_coding_df['Symbol'].isna().sum())

# Step 2: Load and clean each file
files = {
    'LC/GSE224615_raw_counts_GRCh38.p13_NCBI_processed.tsv': {'type': 'tsv', 'sep': '\t'},
    'LC/GSE251849_raw_counts_GRCh38.p13_NCBI_processed.tsv': {'type': 'tsv', 'sep': '\t'},
    'PSA/GSE205748_read_count_processed.csv': {'type': 'csv', 'sep': ','},
    'PSA/GSE179800_SKB-counts.csv': {'type': 'csv', 'sep': ','}
}

dfs = {}
gene_sets = {}
for file_path, config in files.items():
    # Read file
    df = pd.read_csv(file_path, sep=config['sep'])
    
    # Debug: Print columns
    print(f"\nFile: {file_path}")
    print("Columns:", df.columns.tolist())
    
    # Rename first column to Gene if needed
    if df.columns[0].lower() in ['symbol', 'id', 'geneid']:
        df = df.rename(columns={df.columns[0]: 'Gene'})
    elif df.columns[0] != 'Gene':
        print(f"Warning: First column in {file_path} is '{df.columns[0]}', assuming it's Gene")
        df = df.rename(columns={df.columns[0]: 'Gene'})
    
    # Check for duplicates
    print("Duplicate Gene values:", df['Gene'].duplicated().sum())
    df = df.drop_duplicates(subset=['Gene'], keep='first')
    
    # Filter for protein-coding genes
    df = pd.merge(df, protein_coding_df, left_on='Gene', right_on='Symbol', how='inner')
    df = df.drop(columns=['Symbol'])  # Drop extra Symbol column
    print("Rows after protein-coding filter:", len(df))
    print("Sample Gene values:", df['Gene'].head().tolist())
    
    # Store DataFrame and gene set
    dfs[file_path] = df
    gene_sets[file_path] = set(df['Gene'])
    print("Unique Gene symbols:", len(gene_sets[file_path]))

# Step 3: Find overlapping Gene symbols
common_genes = set.intersection(*gene_sets.values())
print("\nNumber of common protein-coding Gene symbols:", len(common_genes))
print("Sample common Gene symbols:", list(common_genes)[:5])

# Step 4: Subset each file to common Gene symbols
for file_path, df in dfs.items():
    # Filter to common genes
    df_common = df[df['Gene'].isin(common_genes)]
    print(f"\nFile: {file_path}")
    print("Rows after common genes filter:", len(df_common))
    
    # Save processed file
    output_file = file_path.replace('.tsv', '_common.tsv').replace('.csv', '_common.csv')
    df_common.to_csv(output_file, sep=files[file_path]['sep'], index=False)
    print(f"Processed file saved as '{output_file}'")
    print(df_common.head())

# Step 5: Save common and unmatched genes
pd.DataFrame(list(common_genes), columns=['Gene']).to_csv('common_gene_symbols.csv', index=False)
print("Common Gene symbols saved to 'common_gene_symbols.csv'")

# Save unmatched genes per file
for file_path, gene_set in gene_sets.items():
    unmatched = gene_set - common_genes
    pd.DataFrame(list(unmatched), columns=['Gene']).to_csv(
        file_path.replace('.tsv', '_unmatched.tsv').replace('.csv', '_unmatched.csv'),
        index=False
    )
    print(f"Unmatched genes for {file_path} saved to '{file_path.replace('.tsv', '_unmatched.tsv').replace('.csv', '_unmatched.csv')}'")

  annot_df = pd.read_csv(annot_file, sep='\t')


Number of protein-coding genes in TSV: 19416
Missing Symbols in TSV protein-coding: 0

File: LC/GSE224615_raw_counts_GRCh38.p13_NCBI_processed.tsv
Columns: ['Gene', 'HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'LC1', 'HC6', 'HC7', 'LC2', 'HC8', 'LC3', 'LC4', 'LC5', 'LC6', 'LC7', 'HC9', 'LC8', 'LC9', 'LC10', 'HC10', 'HC11', 'LC11', 'LC12', 'LC13', 'LC14', 'LC15', 'LC16', 'LC17', 'LC18', 'LC19', 'HC12', 'HC13', 'LC20', 'LC21', 'LC22', 'LC23']
Duplicate Gene values: 0
Rows after protein-coding filter: 19416
Sample Gene values: ['OR4F5', 'LOC112268260', 'OR4F29', 'LOC105378947', 'OR4F16']
Unique Gene symbols: 19416

File: LC/GSE251849_raw_counts_GRCh38.p13_NCBI_processed.tsv
Columns: ['Gene', 'HC1', 'HC2', 'HC3', 'HC4', 'HC5', 'HC6', 'HC7', 'LC1', 'LC2', 'LC3', 'LC4', 'LC5', 'LC6', 'LC7', 'LC8', 'LC9', 'LC10', 'LC11']
Duplicate Gene values: 0
Rows after protein-coding filter: 19416
Sample Gene values: ['OR4F5', 'LOC112268260', 'OR4F29', 'LOC105378947', 'OR4F16']
Unique Gene symbols: 19416

File: PSA

### DEG

In [1]:
# Load libraries
library(DESeq2)
library(dplyr)
library(ggplot2)
library(EnhancedVolcano)
library(pheatmap)

"package 'DESeq2' was built under R version 3.5.2"
Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: 'BiocGenerics'


The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs


The following objects are masked from 'package:base':

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    rowSums, sapply, setdiff, sort, table, t

ERROR: Error: package or namespace load failed for 'DESeq2' in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]):
 there is no package called 'htmlwidgets'


In [6]:
# Step 1: Read datasets
gse224615 <- read.csv("LC/GSE224615_raw_counts_GRCh38.p13_NCBI_processed_common.tsv", sep="\t", row.names=1, check.names=FALSE)
gse251849 <- read.csv("LC/GSE251849_raw_counts_GRCh38.p13_NCBI_processed_common.tsv", sep="\t", row.names=1, check.names=FALSE)
cat("GSE224615 dimensions:", dim(gse224615), "\n")
cat("GSE251849 dimensions:", dim(gse251849), "\n")

GSE224615 dimensions: 17998 36 
GSE251849 dimensions: 17998 18 


In [7]:
# Verify gene alignment
if (!identical(rownames(gse224615), rownames(gse251849))) {
  common_genes <- intersect(rownames(gse224615), rownames(gse251849))
  gse224615 <- gse224615[common_genes, ]
  gse251849 <- gse251849[common_genes, ]
  cat("Aligned to common genes:", length(common_genes), "\n")
}

In [8]:
# Step 2: Merge datasets
count_data <- cbind(gse224615, gse251849)
cat("Merged count data dimensions:", dim(count_data), "\n")

Merged count data dimensions: 17998 54 


In [9]:
# Step 3: Prepare metadata
samples <- colnames(count_data)
condition <- ifelse(grepl("^HC", samples, ignore.case=TRUE), "Healthy",
                   ifelse(grepl("^LC", samples, ignore.case=TRUE), "LongCOVID", NA))
if (any(is.na(condition))) {
  stop("Some samples lack HC or LC prefix: ", paste(samples[is.na(condition)], collapse=", "))
}
col_data <- data.frame(
  sample = samples,
  condition = factor(condition, levels=c("Healthy", "LongCOVID")),
  row.names = samples
)
cat("Sample counts:\n")
print(table(col_data$condition))

Sample counts:

  Healthy LongCOVID 
       20        34 


In [10]:
# Step 4: Verify raw counts
if (any(count_data < 0 | count_data %% 1 != 0)) {
  stop("Count data contains non-integer or negative values")
}
cat("Count data verified as integers\n")

Count data verified as integers


In [11]:
# Step 5: Create DESeq2 dataset
dds <- DESeqDataSetFromMatrix(
  countData = round(count_data),
  colData = col_data,
  design = ~ condition
)

ERROR: Error in DESeqDataSetFromMatrix(countData = round(count_data), colData = col_data, : could not find function "DESeqDataSetFromMatrix"
