# Gene Positions Notebook

## Data Description

## .bed -> .csv

In [1]:
import pandas as pd

In [2]:
file_path = '~/Desktop/geometric-omics/UKBB-fine-mapping/data/hg19-genes.txt'
df = pd.read_csv(file_path, sep='\t')

In [3]:
df.head()

Unnamed: 0,#hg19.knownGene.name,hg19.knownGene.chrom,hg19.knownGene.strand,hg19.knownGene.txStart,hg19.knownGene.txEnd,hg19.knownGene.cdsStart,hg19.knownGene.cdsEnd,hg19.knownGene.proteinID,hg19.kgXref.geneSymbol
0,uc001aaa.3,chr1,+,11873,14409,11873,11873,,DDX11L1
1,uc010nxr.1,chr1,+,11873,14409,11873,11873,,DDX11L1
2,uc010nxq.1,chr1,+,11873,14409,12189,13639,B7ZGX9,DDX11L1
3,uc009vis.3,chr1,-,14361,16765,14361,14361,,WASH7P
4,uc009vjc.1,chr1,-,16857,17751,16857,16857,,WASH7P


In [4]:
# Define a dictionary to map the old column names to the new shortened names
column_mapping = {
    '#hg19.knownGene.name': 'name',
    'hg19.knownGene.chrom': 'chrom',
    'hg19.knownGene.strand': 'strand',
    'hg19.knownGene.txStart': 'txStart',
    'hg19.knownGene.txEnd': 'txEnd',
    'hg19.knownGene.cdsStart': 'cdsStart',
    'hg19.knownGene.cdsEnd': 'cdsEnd',
    'hg19.knownGene.proteinID': 'proteinID',
    'hg19.kgXref.geneSymbol': 'geneSymbol'
}

# Use the rename() function with the column_mapping dictionary
df = df.rename(columns=column_mapping)

# Print the updated DataFrame with shortened column names
print(df.columns)

Index(['name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd',
       'proteinID', 'geneSymbol'],
      dtype='object')


In [5]:
# Create a subset with unique geneSymbol values and average of txStart
subset_start = df.groupby('geneSymbol')['txStart'].mean().reset_index()
subset_end = df.groupby('geneSymbol')['txEnd'].mean().reset_index()

# Ensure that each geneSymbol is associated with only one chrom
subset_chrom = df.drop_duplicates(subset=['geneSymbol'], keep='first')[['geneSymbol', 'chrom']]

combined_subset = pd.merge(subset_start, subset_end, on='geneSymbol')
combined_subset = pd.merge(combined_subset, subset_chrom, on='geneSymbol')

# Print the subset DataFrame
combined_subset.head()

Unnamed: 0,geneSymbol,txStart,txEnd,chrom
0,5S_rRNA,83435940.0,83436050.0,chr1
1,6M1-18,734785.4,764213.3,chr6_apd_hap1
2,7SK,89911090.0,89911330.0,chr1
3,A1BG,58858170.0,58864860.0,chr19
4,A1BG-AS1,58861230.0,58866550.0,chr19


In [6]:
combined_subset.to_csv('hg19-gene-positions.csv', index=False)