# File Parser to DataFrame Generator

This notebook parses a file and creates a pandas DataFrame where each compartment starting with '//' is treated as a new row.

# Import Required Libraries
Import pandas and other necessary libraries for file parsing.

In [1]:
# Import necessary libraries
import pandas as pd
import re
import os

# Load and Read the File

Load the file and read its content line by line.

In [2]:
# Define the file path
# Replace with your actual file path
file_path = "Pfam-A.hmm.dat"

# Check if file exists
if os.path.exists(file_path):
    # Read the content of the file
    with open(file_path, 'r') as file:
        content = file.read()
    print(f"File loaded successfully: {file_path}")
else:
    print(f"Error: File not found at {file_path}")
    # Sample content for demonstration if file doesn't exist
    content = """//
ID   SAMPLE1
AC   AC123456;
DE   Sample protein 1.
GA   C123
TP   Type1
ML   100
CL   Class A
//
ID   SAMPLE2
AC   AC789012;
DE   Sample protein 2.
GA   C456
TP   Type2
ML   200
CL   Class B
//"""
    print("Using sample content for demonstration")

File loaded successfully: Pfam-A.hmm.dat


# Parse the File into Rows

Split the file into compartments using '//' as the delimiter, where each compartment represents a new row.

In [3]:
# Split the content by '//' to get compartments
compartments = content.split('//')

# Remove empty compartments and strip whitespace
compartments = [comp.strip() for comp in compartments if comp.strip()]

print(f"Number of compartments found: {len(compartments)}")

# Display the first compartment as an example
if compartments:
    print("\nExample of first compartment:")
    print(compartments[0])

Number of compartments found: 24424

Example of first compartment:
# STOCKHOLM 1.0
#=GF ID   1-cysPrx_C
#=GF AC   PF10417.14
#=GF DE   C-terminal domain of 1-Cys peroxiredoxin
#=GF GA   21.1; 21.1;
#=GF TP   Domain
#=GF ML   41


# Extract Data for Columns

Extract the values for the columns (ID, AC, DE, GA, TP, ML, CL) from each compartment.

In [4]:
# Define the columns we want to extract
columns = ['ID', 'AC', 'DE', 'GA', 'TP', 'ML', 'CL']

# Function to extract data from a compartment
def extract_data(compartment):
    data = {}
    
    # Parse each line in the compartment
    for line in compartment.split('\n'):
        if line.strip():
            # Extract the field code (first 2 characters) and the value
            field_code = line[0:2].strip()
            value = line[5:].strip() if len(line) > 5 else ""
            
            # Store in data dictionary if it's one of our columns
            if field_code in columns:
                # Remove trailing semicolons if present
                value = value.rstrip(';')
                data[field_code] = value
    
    return data

# Extract data from all compartments
extracted_data = []
for compartment in compartments:
    data = extract_data(compartment)
    extracted_data.append(data)

# Display the extracted data for the first compartment
if extracted_data:
    print("\nExtracted data from first compartment:")
    print(extracted_data[0])


Extracted data from first compartment:
{}


# Create the DataFrame

Use pandas to create a DataFrame with the extracted data and assign the specified column names.

In [5]:
# Create DataFrame from extracted data
df = pd.DataFrame(extracted_data)

# Display the DataFrame
print("DataFrame created with the following data:")
display(df)

# Basic DataFrame info
print(f"\nDataFrame shape: {df.shape}")
print("\nDataFrame columns:")
for col in df.columns:
    print(f" - {col}")

# Save DataFrame to CSV (optional)
# df.to_csv('parsed_data.csv', index=False)
# print("\nDataFrame saved to 'parsed_data.csv'")

DataFrame created with the following data:


0
1
2
3
4
...
24419
24420
24421
24422
24423



DataFrame shape: (24424, 0)

DataFrame columns:


## Summary

This notebook has:
1. Imported necessary libraries
2. Loaded and read a file
3. Parsed the file into compartments using '//' as delimiter
4. Extracted data for specific columns from each compartment
5. Created a pandas DataFrame with the extracted data

You can now use this DataFrame for further analysis or visualization.

# Parse Pfam-A.hmm.dat File into a Pandas DataFrame

This notebook parses the Pfam-A.hmm.dat file and creates a pandas DataFrame where each record (separated by '//') is represented as a row with columns for ID, AC, DE, GA, TP, ML, and CL.

In [6]:
import pandas as pd
import re

In [7]:
# File path
file_path = '/home/markus/MPI_local/Pfam-A.hmm.dat'

In [8]:
def parse_pfam_file(file_path):
    # Initialize lists to store record data
    ids = []
    acs = []
    des = []
    gas = []
    tps = []
    mls = []
    cls = []
    
    current_record = {}
    
    with open(file_path, 'r') as file:
        content = file.read()
    
    # Split content into records (separated by //)
    records = content.split('//\n')
    
    # Process each record
    for record in records:
        if not record.strip():
            continue
        
        # Extract fields from the record
        id_match = re.search(r'#=GF ID\s+(.*?)\s*$', record, re.MULTILINE)
        ac_match = re.search(r'#=GF AC\s+(.*?)\s*$', record, re.MULTILINE)
        de_match = re.search(r'#=GF DE\s+(.*?)\s*$', record, re.MULTILINE)
        ga_match = re.search(r'#=GF GA\s+(.*?)\s*$', record, re.MULTILINE)
        tp_match = re.search(r'#=GF TP\s+(.*?)\s*$', record, re.MULTILINE)
        ml_match = re.search(r'#=GF ML\s+(.*?)\s*$', record, re.MULTILINE)
        cl_match = re.search(r'#=GF CL\s+(.*?)\s*$', record, re.MULTILINE)
        
        # Add extracted values to lists, using None if not found
        ids.append(id_match.group(1) if id_match else None)
        acs.append(ac_match.group(1) if ac_match else None)
        des.append(de_match.group(1) if de_match else None)
        gas.append(ga_match.group(1) if ga_match else None)
        tps.append(tp_match.group(1) if tp_match else None)
        mls.append(ml_match.group(1) if ml_match else None)
        cls.append(cl_match.group(1) if cl_match else None)
    
    # Create DataFrame from the extracted data
    df = pd.DataFrame({
        'ID': ids,
        'AC': acs,
        'DE': des,
        'GA': gas,
        'TP': tps,
        'ML': mls,
        'CL': cls
    })
    
    return df

In [9]:
# Parse the file and create the DataFrame
pfam_df = parse_pfam_file(file_path)

# Display the first few rows of the DataFrame
pfam_df.head()

Unnamed: 0,ID,AC,DE,GA,TP,ML,CL
0,1-cysPrx_C,PF10417.14,C-terminal domain of 1-Cys peroxiredoxin,21.1; 21.1;,Domain,41,
1,10_blade,PF21734.3,10-bladed beta propeller domain,27; 27;,Repeat,345,CL0186
2,117-like_vir,PF21578.3,"Virus, 117-like",27; 27;,Family,101,CL0070
3,120_Rick_ant,PF12574.13,120 KDa Rickettsia surface antigen,25; 25;,Family,240,
4,12TM_1,PF09847.15,Membrane protein of 12 TMs,33.2; 33.2;,Family,448,CL0181


In [10]:
# Get info about the DataFrame
print(f"Total number of records: {len(pfam_df)}")
pfam_df.info()

Total number of records: 24424
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24424 entries, 0 to 24423
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      24424 non-null  object
 1   AC      24424 non-null  object
 2   DE      24424 non-null  object
 3   GA      24424 non-null  object
 4   TP      24424 non-null  object
 5   ML      24424 non-null  object
 6   CL      11425 non-null  object
dtypes: object(7)
memory usage: 1.3+ MB


In [11]:
# Check for any missing values
pfam_df.isnull().sum()

ID        0
AC        0
DE        0
GA        0
TP        0
ML        0
CL    12999
dtype: int64

In [12]:
# Optional: Save the DataFrame to a CSV file
pfam_df.to_csv('/home/markus/MPI_local/pfam_parsed_data.csv', index=False)