In [1]:
import pandas as pd
import subprocess

"""
script in order to work must be in dir: script/parent_folder/idfilter/.log_files
what script does: 1. for each parent_folder, for each idfilter folder, for each log_file, it takes spectra_num (number of spectra) found after filtering (check fourth comment in the bash code below)
2. for each parent_folder, it sums the different spectra_num and that's it. 3. then concats all the sums in a df and calcs the %change based on the default sum
"""



# Run the bash cell and capture the output
result = subprocess.run(
    ['bash', '-c', '''
        # Loop through each folder in the parent directory
        for dir in */idfilter; do
    # Check if the idfilter/log_files directory exists and contains .log files
    if [ -d "$dir" ] && ls "$dir"/*.log 1> /dev/null 2>&1; then
        total=0  # Initialize a total variable for the current directory
        # Loop through each log file in the idfilter/log_files directory
        for file in "$dir"/*.log; do
            if [[ -f "$file" ]]; then
                # Extract the number from the next-to-last line
                num=$(tail -n 2 "$file" | head -n 1 | awk '{print $1}')
                
                # Check if the extracted value is a number and add it to the total
                if [[ "$num" =~ ^[0-9]+$ ]]; then
                    total=$((total + num))
                else
                    echo "Warning: Skipping file $file due to non-numeric value '$num'"
                fi
            fi
        done
        
        # Extract and print the parent directory name
        parent_dir=$(basename $(dirname "$dir"))
        echo "${parent_dir} $total"
    fi
done
    '''], 
    capture_output=True, 
    text=True
)

# Split the output into lines
output_lines = result.stdout.split('\n')

# Parse the output and create a list of dictionaries
data = []
for line in output_lines:
    if ' ' in line:
        dir_name, total_sum = line.split(' ')
        data.append({'Modification': dir_name, 'Total PSMs': int(total_sum)})

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
# Get the 'Total' value for the 'default' row
default_total = df.loc[df['Modification'] == 'default', 'Total PSMs'].values[0]

# Calculate the percentage increase for each row based on the 'default' row
df['Percentage Increase'] = ((df['Total PSMs'] - default_total) / default_total) * 100
df['Percentage Increase'] = df['Percentage Increase'].round(2)

display(df)


Unnamed: 0,Modification,Total PSMs,Percentage Increase
0,deamidation,246222,9.85
1,deamidation_plus_pyroglu,249786,11.44
2,default,224143,0.0
3,oxidationW,227556,1.52
4,pyroglu,227386,1.45
5,quantms,242778,8.31


In [4]:
#give an idfilter path w/ log files and get the sum of proteins and spectra before and after filtering

import os, re
import pandas as pd

# Initialize an empty list to store the data
data = []

# Directory containing the .log files
directory = '../noMBR/idfilter'

# Regular expressions to extract the needed data
protein_re = re.compile(r'with (\d+) proteins')
spectra_re = re.compile(r'(\d+) spectra identified')

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".log"):
        file_path = os.path.join(directory, filename)
        
        with open(file_path, 'r') as file:
            content = file.read()

            # Try to extract proteins and spectra counts
            proteins_matches = protein_re.findall(content)
            spectra_matches = spectra_re.findall(content)
            
            # Check if we have at least two matches for proteins and spectra
            if len(proteins_matches) >= 2 and len(spectra_matches) >= 2:
                proteins_before = proteins_matches[0]
                proteins_after = proteins_matches[1]
                spectra_before = spectra_matches[0]
                spectra_after = spectra_matches[1]

                # Append the results to the data list
                data.append([filename, proteins_before, proteins_after, spectra_before, spectra_after])
            else:
                # Log files that don't match the expected format
                print(f"Warning: '{filename}' does not have the expected format.")
                # Optionally append a placeholder row for missing data
                data.append([filename, 'N/A', 'N/A', 'N/A', 'N/A'])

# Create a pandas DataFrame
df1 = pd.DataFrame(data, columns=['Filename', 'Proteins Before', 'Proteins After', 'Spectra Before', 'Spectra After'])

# Convert the relevant columns to numeric
df1[['Proteins Before', 'Proteins After', 'Spectra Before', 'Spectra After']] = df1[['Proteins Before', 'Proteins After', 'Spectra Before', 'Spectra After']].apply(pd.to_numeric, errors='coerce')

# Calculate the sum for each numeric column
sum_row = df1[['Proteins Before', 'Proteins After', 'Spectra Before', 'Spectra After']].sum()

# Create a DataFrame for the sum row
sum_row_df = pd.DataFrame(sum_row).T
sum_row_df['Filename'] = 'Total'

# Concatenate the sum row to the original DataFrame
df1 = pd.concat([df1, sum_row_df], ignore_index=True)

display(df1)


Unnamed: 0,Filename,Proteins Before,Proteins After,Spectra Before,Spectra After
0,E26698_2p_50uPAC12_trap10_PRC-5442_2_consensus...,14122,6018,55560,35587
1,B28545_Ap_IonOpt_PRC-6063__newprep_6_10ul_cons...,19897,6399,70672,29760
2,B28549_Ap_IonOpt_PRC-6063__newprep_2_10ul_cons...,16212,3561,40249,13877
3,B28553_Ap_IonOpt_PRC-6063__newprep_4_10ul_cons...,17204,5184,53628,23187
4,B28551_Ap_IonOpt_PRC-6063__newprep_3_10ul_cons...,17198,2950,47871,13922
5,E28115_1p_50uPAC13__trap9_PRC-5590_1_consensus...,11450,5607,58630,45060
6,B28547_Ap_IonOpt_PRC-6063_newprep_1_10ul_conse...,17424,3376,47429,14424
7,B28543_Ap_IonOpt_PRC-6063__newprep_5_10ul_cons...,19501,6492,69999,30394
8,E26704_2p_50uPAC12_trap10_PRC-5442_5_consensus...,13966,6725,56949,39460
9,Total,146974,46312,500987,245671


In [3]:
import pandas as pd

# Define the URL
url = "https://www.ncbi.nlm.nih.gov/datasets/genome/?taxon=5320"

# Read the HTML tables from the URL
tables = pd.read_html(url)

# Assuming the first table is the one you want
#df = tables[0]

# Display the first few rows of the DataFrame
# print(df.head())


ValueError: No tables found

In [13]:
import xml.etree.ElementTree as ET

# Load the XML file
# Ensure the XML file is well-formed and does not contain extraneous characters or elements
with open('../../../../Downloads/pleurotus_genomes.xml', 'r') as file:
    content = file.read()

# Remove any extraneous characters after the root element
content = content.split('</root>')[0] + '</root>'

# Parse the cleaned XML content
tree = ET.ElementTree(ET.fromstring(content))
root = tree.getroot()

# Prepare a list to hold the extracted data
genomes_data = []

# FILEPATH: /home/pgiannikos/thesis/pci/secretomics/test_modifications/parse_quantms_testmods.ipynb

# Iterate over each 'DocSum' element to extract the required fields
for docsum in root.findall('DocSum'):
    accession = docsum.find('Accession').text if docsum.find('Accession') is not None else ''
    organism = docsum.find('Organism').text if docsum.find('Organism') is not None else ''
    taxid = docsum.find('TaxId').text if docsum.find('TaxId') is not None else ''
    assembly = docsum.find('Assembly').text if docsum.find('Assembly') is not None else ''
    submitter = docsum.find('Submitter').text if docsum.find('Submitter') is not None else ''
    pubmed_id = docsum.find('PubMedId').text if docsum.find('PubMedId') is not None else ''
    status = docsum.find('Status').text if docsum.find('Status') is not None else ''
    release_date = docsum.find('ReleaseDate').text if docsum.find('ReleaseDate') is not None else ''
    technology = docsum.find('Technology').text if docsum.find('Technology') is not None else ''
    project_id = docsum.find('ProjectID').text if docsum.find('ProjectID') is not None else ''
    sample_id = docsum.find('SampleID').text if docsum.find('SampleID') is not None else ''
    
    # Append extracted data to the list
    genomes_data.append([
        accession, assembly, organism, taxid, submitter, pubmed_id,
        status, release_date, technology, project_id, sample_id
    ])


ParseError: junk after document element: line 2350, column 0 (<string>)

In [8]:
print(genomes_data)

NameError: name 'genomes_data' is not defined