In [2]:
import pandas as pd
from Bio import SeqIO

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(thermo_gff, sep='\t', header=None, comment='#')

# Inspect the file to confirm loading
print("Initial DataFrame:")
print(thermo.head())

# Rename columns
thermo.columns = ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']

# Extract the ID from the 'info' column
thermo['ID'] = thermo['info'].str.split(pat=';', expand=True)[0]

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 1000

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'
    
    # Calculate promoter coordinates
    if promoter_row['strand'] == '+':
        promoter_row['start'] = promoter_row['start'] - promoter_size
        promoter_row['stop'] = promoter_row['start']
    else:
        promoter_row['start'] = promoter_row['stop']
        promoter_row['stop'] = promoter_row['stop'] + promoter_size
    
    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)
    
    # Debugging limit: break after 100 rows
    if i > 100:
        break

# Define combined_gff to combine thermo and promoter_gff
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())
combined_gff['scaffold'] = combined_gff['scaffold'].apply(str)
combined_gff.to_csv('Thermophilus_genes_and_promoters_12_19_2024.gff', sep='\t', index=None)

Initial DataFrame:
          0       1     2       3       4       5          6  \
0  scaffold  source  type   start    stop  strand       info   
1      chr1     JGI   CDS  1381.0  2142.0       +  ID=gene_1   
2      chr1     JGI   CDS  2744.0  3343.0       -  ID=gene_2   
3      chr1     JGI   CDS  3344.0  3817.0       -  ID=gene_3   
4      chr1     JGI   CDS  5330.0  6385.0       -  ID=gene_4   

                   7  
0                 ID  
1  proteinId=2114025  
2  proteinId=2293935  
3  proteinId=2293936  
4  proteinId=2051335  
Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   381.0   381.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
9111     chr1    JGI  promoter  3343.0  4343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   

                      ID  
9110  ID

In [3]:
import pandas as pd
from Bio import SeqIO

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(thermo_gff, sep='\t', header=None, comment='#')

# Assign column names
thermo.columns = ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 1000

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID
    
    # Calculate promoter coordinates
    if promoter_row['strand'] == '+':
        promoter_row['start'] = promoter_row['start'] - promoter_size
        promoter_row['stop'] = promoter_row['start']
    else:
        promoter_row['start'] = promoter_row['stop']
        promoter_row['stop'] = promoter_row['stop'] + promoter_size
    
    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)
    
    # Debugging limit: break after 100 rows
    if i > 100:
        break

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())

# Save the output to a file
combined_gff.to_csv('Thermophilus_genes_and_promoters_12_19_2024_mod.gff', sep='\t', index=None)


Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   381.0   381.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
9111     chr1    JGI  promoter  3343.0  4343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   

                              ID  
9110  proteinId=2114025_promoter  
0              proteinId=2114025  
1              proteinId=2293935  
9111  proteinId=2293935_promoter  
2              proteinId=2293936  


In [1]:
import pandas as pd
from Bio import SeqIO

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(thermo_gff, sep='\t', header=None, comment='#')

# Assign column names
thermo.columns = ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 1000

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID
    
    # Calculate promoter coordinates
    if promoter_row['strand'] == '+':
        promoter_row['start'] = promoter_row['start'] - promoter_size
        promoter_row['stop'] = promoter_row['start']
    else:
        promoter_row['start'] = promoter_row['stop']
        promoter_row['stop'] = promoter_row['stop'] + promoter_size
    
    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)
    

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())

# Save the output to a file
combined_gff.to_csv('Thermophilus_genes_and_promoters_12_19_2024_finalized.gff', sep='\t', index=None)


Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   381.0   381.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
9111     chr1    JGI  promoter  3343.0  4343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   

                              ID  
9110  proteinId=2114025_promoter  
0              proteinId=2114025  
1              proteinId=2293935  
9111  proteinId=2293935_promoter  
2              proteinId=2293936  


In [2]:
import pandas as pd
from Bio import SeqIO

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(thermo_gff, sep='\t', header=None, comment='#')

# Assign column names
thermo.columns = ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 1000

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID
    
    # Calculate promoter coordinates
    if promoter_row['strand'] == '+':
        promoter_row['stop'] = promoter_row['start'] - 1  # The promoter ends right before the gene
        promoter_row['start'] = promoter_row['stop'] - promoter_size + 1  # Extend upstream for promoter length
    else:
        promoter_row['start'] = promoter_row['stop'] + 1  # The promoter starts right after the gene
        promoter_row['stop'] = promoter_row['start'] + promoter_size - 1  # Extend downstream for promoter length
    
    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)

    

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())

# Save the output to a file
combined_gff.to_csv('Thermophilus_genes_and_promoters_12_19_2024_finalized_1.gff', sep='\t', index=None)


Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   381.0  1380.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   
9111     chr1    JGI  promoter  3344.0  4343.0      -  ID=gene_2   

                              ID  
9110  proteinId=2114025_promoter  
0              proteinId=2114025  
1              proteinId=2293935  
2              proteinId=2293936  
9111  proteinId=2293935_promoter  


### Troubleshooting

In [3]:
import pandas as pd

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(thermo_gff, sep='\t', header=None, comment='#', names=['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID'])

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 1000

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID

    # Calculate promoter coordinates
    if row['strand'] == '+':
        promoter_row['stop'] = row['start'] - 1  # The promoter ends right before the gene
        promoter_row['start'] = max(promoter_row['stop'] - promoter_size + 1, 1)  # Extend upstream, ensure start >= 1
    else:
        promoter_row['start'] = row['stop'] + 1  # The promoter starts right after the gene
        promoter_row['stop'] = promoter_row['start'] + promoter_size - 1  # Extend downstream
    
    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Validate combined GFF
if combined_gff.isnull().any().any():
    print("Warning: Combined GFF contains null values. Please check the input data.")

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())

# Save the output to a file
output_path = 'Thermophilus_genes_and_promoters_12_19_2024_finalized_2.gff'
combined_gff.to_csv(output_path, sep='\t', index=False)
print(f"Combined GFF file saved to: {output_path}")


Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   381.0  1380.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   
9111     chr1    JGI  promoter  3344.0  4343.0      -  ID=gene_2   

                              ID  
9110  proteinId=2114025_promoter  
0              proteinId=2114025  
1              proteinId=2293935  
2              proteinId=2293936  
9111  proteinId=2293935_promoter  
Combined GFF file saved to: Thermophilus_genes_and_promoters_12_19_2024_finalized_2.gff


##### To Remove Header Row:

The script now checks if the first row of the file contains the column headers accidentally included as data, and removes it if detected.
Convert Coordinates to Numeric:

The start and stop columns are explicitly converted to numeric with errors='coerce', and rows with invalid values are dropped.
Promoter and Gene Logic Check:

For each promoter:
Checks for overlaps with genes based on strand-specific logic.
Reports overlaps if any are found.
Also checks for boundary issues (e.g., promoter start < 1).
Validation:

Added warnings if the combined GFF contains NaN values.
Efficiency:

Loops through promoters and performs targeted overlap checks only with relevant genes, minimizing computational overhead.

In [4]:
import pandas as pd

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(
    thermo_gff,
    sep='\t',
    header=None,
    comment='#',
    names=['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']
)

# Drop the first row if it contains header data mistakenly included in the file
if (thermo.iloc[0] == ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']).all():
    thermo = thermo.iloc[1:].reset_index(drop=True)

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 1000

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID

    # Calculate promoter coordinates
    if row['strand'] == '+':
        promoter_row['stop'] = row['start'] - 1  # The promoter ends right before the gene
        promoter_row['start'] = max(promoter_row['stop'] - promoter_size + 1, 1)  # Extend upstream, ensure start >= 1
    else:
        promoter_row['start'] = row['stop'] + 1  # The promoter starts right after the gene
        promoter_row['stop'] = promoter_row['start'] + promoter_size - 1  # Extend downstream

    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Check for overlapping promoters and genes
overlap_issues = []
genes = combined_gff[combined_gff['type'] != 'promoter']
promoters = combined_gff[combined_gff['type'] == 'promoter']

for _, promoter in promoters.iterrows():
    if promoter['strand'] == '+':
        overlap = genes[
            (genes['scaffold'] == promoter['scaffold']) &
            (genes['strand'] == '+') &
            (promoter['start'] < genes['stop']) &
            (promoter['stop'] >= genes['start'])
        ]
    else:
        overlap = genes[
            (genes['scaffold'] == promoter['scaffold']) &
            (genes['strand'] == '-') &
            (promoter['start'] <= genes['stop']) &
            (promoter['stop'] > genes['start'])
        ]
    if not overlap.empty:
        overlap_issues.append((promoter, overlap))

# Check for promoters exceeding scaffold boundaries
boundary_issues = promoters[(promoters['start'] < 1) | (promoters['stop'] < 1)]

# Print any detected issues
if overlap_issues:
    print(f"Overlapping issues found: {len(overlap_issues)}")
if not boundary_issues.empty:
    print(f"Boundary issues found: {len(boundary_issues)}")

# Validate combined GFF
if combined_gff.isnull().any().any():
    print("Warning: Combined GFF contains null values. Please check the input data.")

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())

# Save the output to a file
output_path = 'Thermophilus_genes_and_promoters_12_19_2024_finalized_3.gff'
combined_gff.to_csv(output_path, sep='\t', index=False)
print(f"Combined GFF file saved to: {output_path}")


Overlapping issues found: 2049
Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   381.0  1380.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   
9111     chr1    JGI  promoter  3344.0  4343.0      -  ID=gene_2   

                              ID  
9110  proteinId=2114025_promoter  
0              proteinId=2114025  
1              proteinId=2293935  
2              proteinId=2293936  
9111  proteinId=2293935_promoter  
Combined GFF file saved to: Thermophilus_genes_and_promoters_12_19_2024_finalized_3.gff


### Trying to reduce the number of overlapping genes by reducing the promoter size from 1000 to 500

In [5]:
import pandas as pd

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(
    thermo_gff,
    sep='\t',
    header=None,
    comment='#',
    names=['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']
)

# Drop the first row if it contains header data mistakenly included in the file
if (thermo.iloc[0] == ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']).all():
    thermo = thermo.iloc[1:].reset_index(drop=True)

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 500

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID

    # Calculate promoter coordinates
    if row['strand'] == '+':
        promoter_row['stop'] = row['start'] - 1  # The promoter ends right before the gene
        promoter_row['start'] = max(promoter_row['stop'] - promoter_size + 1, 1)  # Extend upstream, ensure start >= 1
    else:
        promoter_row['start'] = row['stop'] + 1  # The promoter starts right after the gene
        promoter_row['stop'] = promoter_row['start'] + promoter_size - 1  # Extend downstream

    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Check for overlapping promoters and genes
overlap_issues = []
genes = combined_gff[combined_gff['type'] != 'promoter']
promoters = combined_gff[combined_gff['type'] == 'promoter']

for _, promoter in promoters.iterrows():
    if promoter['strand'] == '+':
        overlap = genes[
            (genes['scaffold'] == promoter['scaffold']) &
            (genes['strand'] == '+') &
            (promoter['start'] < genes['stop']) &
            (promoter['stop'] >= genes['start'])
        ]
    else:
        overlap = genes[
            (genes['scaffold'] == promoter['scaffold']) &
            (genes['strand'] == '-') &
            (promoter['start'] <= genes['stop']) &
            (promoter['stop'] > genes['start'])
        ]
    if not overlap.empty:
        overlap_issues.append((promoter, overlap))

# Check for promoters exceeding scaffold boundaries
boundary_issues = promoters[(promoters['start'] < 1) | (promoters['stop'] < 1)]

# Print any detected issues
if overlap_issues:
    print(f"Overlapping issues found: {len(overlap_issues)}")
if not boundary_issues.empty:
    print(f"Boundary issues found: {len(boundary_issues)}")

# Validate combined GFF
if combined_gff.isnull().any().any():
    print("Warning: Combined GFF contains null values. Please check the input data.")

# Display the first few rows of the combined DataFrame
print("Combined GFF:")
print(combined_gff.head())

# Save the output to a file
output_path = 'Thermophilus_genes_and_promoters_12_19_2024_finalized_4.gff'
combined_gff.to_csv(output_path, sep='\t', index=False)
print(f"Combined GFF file saved to: {output_path}")


Overlapping issues found: 1057
Combined GFF:
     scaffold source      type   start    stop strand       info  \
9110     chr1    JGI  promoter   881.0  1380.0      +  ID=gene_1   
0        chr1    JGI       CDS  1381.0  2142.0      +  ID=gene_1   
1        chr1    JGI       CDS  2744.0  3343.0      -  ID=gene_2   
2        chr1    JGI       CDS  3344.0  3817.0      -  ID=gene_3   
9111     chr1    JGI  promoter  3344.0  3843.0      -  ID=gene_2   

                              ID  
9110  proteinId=2114025_promoter  
0              proteinId=2114025  
1              proteinId=2293935  
2              proteinId=2293936  
9111  proteinId=2293935_promoter  
Combined GFF file saved to: Thermophilus_genes_and_promoters_12_19_2024_finalized_4.gff


#### To truncate overlapping promoters

In [7]:
import pandas as pd

# File path
thermo_gff = '/usr2/people/shollyt22/shollyt22/TnSeq_BarSeq_sequencings/Barseq_July_2023/OORB003_TnSeq/for_analysis/modified_gff_file_finalized_08_08_2023'

# Load the GFF file
thermo = pd.read_csv(
    thermo_gff,
    sep='\t',
    header=None,
    comment='#',
    names=['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']
)

# Drop the first row if it contains header data mistakenly included in the file
if (thermo.iloc[0] == ['scaffold', 'source', 'type', 'start', 'stop', 'strand', 'info', 'ID']).all():
    thermo = thermo.iloc[1:].reset_index(drop=True)

# Ensure numeric types for 'start' and 'stop'
thermo['start'] = pd.to_numeric(thermo['start'], errors='coerce')
thermo['stop'] = pd.to_numeric(thermo['stop'], errors='coerce')

# Drop rows with invalid numeric values
thermo = thermo.dropna(subset=['start', 'stop'])

# Define promoter size
promoter_size = 500  # Adjusted size

# Prepare an empty DataFrame for promoters
promoter_gff = pd.DataFrame(columns=thermo.columns)

# Generate promoter entries
for i, row in thermo.iterrows():
    promoter_row = row.copy()
    promoter_row['type'] = 'promoter'
    promoter_row['ID'] = row['ID'] + '_promoter'  # Add _promoter to the original ID

    # Calculate promoter coordinates
    if row['strand'] == '+':
        promoter_row['stop'] = row['start'] - 1  # The promoter ends right before the gene
        promoter_row['start'] = max(promoter_row['stop'] - promoter_size + 1, 1)  # Extend upstream, ensure start >= 1
    else:
        promoter_row['start'] = row['stop'] + 1  # The promoter starts right after the gene
        promoter_row['stop'] = promoter_row['start'] + promoter_size - 1  # Extend downstream

    # Append the new row to the promoter DataFrame
    promoter_gff = pd.concat([promoter_gff, pd.DataFrame([promoter_row])], ignore_index=True)

# Combine the original DataFrame and promoter DataFrame
combined_gff = pd.concat([thermo, promoter_gff], ignore_index=True).sort_values(['scaffold', 'start'])

# Separate promoters and genes
genes = combined_gff[combined_gff['type'] != 'promoter']
promoters = combined_gff[combined_gff['type'] == 'promoter']

# Resolve overlaps by truncating promoters
fixed_promoters = []
for _, promoter in promoters.iterrows():
    if promoter['strand'] == '+':
        # Find overlapping genes on the + strand
        overlap = genes[
            (genes['scaffold'] == promoter['scaffold']) &
            (genes['strand'] == '+') &
            (promoter['start'] < genes['stop']) &
            (promoter['stop'] >= genes['start'])
        ]
        if not overlap.empty:
            # Adjust promoter stop to avoid overlap
            closest_gene = overlap.iloc[0]
            promoter['stop'] = closest_gene['start'] - 1
    else:
        # Find overlapping genes on the - strand
        overlap = genes[
            (genes['scaffold'] == promoter['scaffold']) &
            (genes['strand'] == '-') &
            (promoter['start'] <= genes['stop']) &
            (promoter['stop'] > genes['start'])
        ]
        if not overlap.empty:
            # Adjust promoter start to avoid overlap
            closest_gene = overlap.iloc[0]
            promoter['start'] = closest_gene['stop'] + 1

    # Only keep valid promoters (start < stop)
    if promoter['start'] < promoter['stop']:
        fixed_promoters.append(promoter)

# Convert the fixed promoters list back to a DataFrame
fixed_promoters_df = pd.DataFrame(fixed_promoters)

# Combine fixed promoters with genes
resolved_gff = pd.concat([genes, fixed_promoters_df], ignore_index=True).sort_values(['scaffold', 'start'])

# Save the resolved GFF file
resolved_gff_file_path = 'Thermophilus_genes_and_promoters_01_06_2025_finalized_overlpap_truncated.gff'
resolved_gff.to_csv(resolved_gff_file_path, sep='\t', index=False, header=False)

print(f"Resolved GFF file saved to: {resolved_gff_file_path}")


Resolved GFF file saved to: Thermophilus_genes_and_promoters_01_06_2025_finalized_overlpap_truncated.gff
