# Loading data to the database

## Importing required libraries

In [7]:
import pip

def import_or_install(package):
    """
    Import ``package``, installing it with pip first when it is not importable.

    Parameters:
    - package: str, name used both for the import attempt and for ``pip install``.

    Raises:
    - subprocess.CalledProcessError: if the pip installation exits with a non-zero status.
    """
    import importlib
    import subprocess
    import sys

    try:
        importlib.import_module(package)
    except ImportError:
        # pip has no supported in-process API (pip.main is not meant to be called from code);
        # running "python -m pip" with the current interpreter installs into the environment
        # this kernel is actually using, and check_call surfaces a failed install.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the import system notice the newly installed package in this process
        importlib.invalidate_caches()

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine

In [None]:
import json

In [8]:
# Make sure goatools is importable (pip-installing it if the import fails), then pull in its GAF reader
import_or_install("goatools")
from goatools.anno.gaf_reader import GafReader

## Establishing database connection

In [54]:
# Set up your database connection.
# The URI is read from the DATABASE_URI environment variable when it is set, so real
# credentials do not have to be stored in the notebook; otherwise the local default is used.
DATABASE_URI = os.environ.get(
    'DATABASE_URI',
    'postgresql+psycopg2://postgres:postgres@localhost:5432/genoquery',
)
engine = create_engine(DATABASE_URI)

## Defining functions to load txt data to the database

In [52]:
def load_data_to_db(file_path, table_name, sep='\t', header=None, comment_char=None, column_names=None,
                    con=None, if_exists='append'):
    """
    Loads data from a delimited text file into a database table.

    The file is parsed with pandas' python engine, a preview of the first rows is printed,
    and the whole frame is written with ``DataFrame.to_sql`` (without the index).

    Parameters:
    - file_path: str, path to the file to load.
    - table_name: str, database table to which the data will be loaded.
    - sep: str, delimiter to use.
    - header: int or None, row number to use as column names; None if no headers.
    - comment_char: str or None, character used to identify comments to skip.
    - column_names: list of str or None, names to use for the DataFrame columns.
    - con: connection or engine accepted by ``DataFrame.to_sql``; when None the
      notebook-level ``engine`` is used.
    - if_exists: str, passed to ``DataFrame.to_sql`` ('fail', 'replace' or 'append').
      With the default 'append', running the same load twice inserts the rows twice.

    Returns:
    - None
    """
    # Load data, skipping any comment lines
    df = pd.read_csv(file_path, sep=sep, header=header, comment=comment_char, names=column_names, engine='python')
    print(df.head())
    # Resolve the target lazily so the global engine is only required when no connection is passed
    target = engine if con is None else con
    # Upload data
    df.to_sql(table_name, target, if_exists=if_exists, index=False)
    print(f"Data from {file_path} loaded successfully into {table_name}")

### Loading the CodonUsage and GeneAliases txt files into the PostgreSQL DB

In [56]:
# Labels applied to the four columns of the codon usage table
column_names = [
    'codon',
    'aa',
    'freq',
    'abundance',
]
# header=0 treats the file's first row as a header; the names above are used instead of it
load_data_to_db(
    '../data/txt/PlasmoDB-68_Pfalciparum3D7_CodonUsage.txt',
    'codon_usage',
    sep='\t',
    header=0,
    column_names=column_names,
)

  codon aa  freq  abundance
0   UAA  *  0.91       0.69
1   UGA  *  0.27       0.21
2   UAG  *  0.13       0.10
3   GCU  A  8.12       0.41
4   GCC  A  2.09       0.11
Data from ../data/txt/PlasmoDB-68_Pfalciparum3D7_CodonUsage.txt loaded successfully into codon_usage


In [58]:
# gene_id followed by alias1..alias9, i.e. the maximum number of aliases expected per row
column_names = ['gene_id'] + [f'alias{position}' for position in range(1, 10)]

# The gene aliases file is read as having no header row and no comment lines
load_data_to_db(
    '../data/txt/PlasmoDB-68_Pfalciparum3D7_GeneAliases.txt',
    'gene_aliases',
    sep='\t',
    header=None,
    column_names=column_names,
)

         gene_id     alias1    alias2    alias3 alias4 alias5 alias6 alias7  \
0  PF3D7_1314600  PF13_0083      None      None   None   None   None   None   
1  PF3D7_0209400  PF02_0090  PFB0423c  PFB0425c   None   None   None   None   
2  PF3D7_1142200  PF11_0434      None      None   None   None   None   None   
3  PF3D7_0507200  MAL5P1.73  PFE0355c      None   None   None   None   None   
4  PF3D7_0806000  PF08_0117      None      None   None   None   None   None   

  alias8  alias9  
0   None     NaN  
1   None     NaN  
2   None     NaN  
3   None     NaN  
4   None     NaN  
Data from ../data/txt/PlasmoDB-68_Pfalciparum3D7_GeneAliases.txt loaded successfully into gene_aliases


## Defining functions to read GAF files
Reference : https://github.com/tanghaibao/goatools/blob/main/notebooks/annotations_gaf_file.ipynb

In [9]:
# Read the GO annotation (GAF) file; the reader reports how many annotations it read
ogaf = GafReader("../data/gaf/PlasmoDB-CURRENT_Pfalciparum3D7_GO.gaf")

HMS:0:00:00.538194  31,735 annotations READ: ../data/gaf/PlasmoDB-CURRENT_Pfalciparum3D7_GO.gaf 


### Converting the GAF file to namedtuples

In [10]:
# Copy the reader's annotation records into a list ordered by their DB_ID field
nts = list(ogaf.associations)
nts.sort(key=lambda record: record.DB_ID)

# Show the first record as a sample of the namedtuple layout
print(nts[0])

ntgafobj(DB='VEuPathDB', DB_ID='PF3D7_0100100', DB_Symbol='VAR', Qualifier=set(), GO_ID='GO:0009405', DB_Reference={'VEuPathDB:PF3D7_0100100'}, Evidence_Code='IEA', With_From=set(), NS='BP', DB_Name={'erythrocyte membrane protein 1, PfEMP1'}, DB_Synonym={'VAR-UPSB1'}, DB_Type='protein_coding', Taxon=[36329], Date=datetime.date(2024, 3, 21), Assigned_By='gouniprot', Extension=None, Gene_Product_Form_ID=set())


### Converting the namedtuples to a DataFrame

In [12]:
# Expects an iterable of namedtuples, such as the records produced by the GAF reader
def convert_to_dataframe(namedtuples):
    """Return a DataFrame with one row per namedtuple and one column per field."""
    rows = []
    for record in namedtuples:
        # _asdict() maps field names to values, which become the column labels
        rows.append(record._asdict())
    return pd.DataFrame(rows)

In [64]:
# Convert GAF data to DataFrame
df = convert_to_dataframe(nts)  # nts is the DB_ID-sorted list of GAF namedtuples
print(df.head())

          DB          DB_ID DB_Symbol Qualifier       GO_ID  \
0  VEuPathDB  PF3D7_0100100       VAR        {}  GO:0009405   
1  VEuPathDB  PF3D7_0100100       VAR        {}  GO:0020013   
2  VEuPathDB  PF3D7_0100100       VAR        {}  GO:0020013   
3  VEuPathDB  PF3D7_0100100       VAR        {}  GO:0020033   
4  VEuPathDB  PF3D7_0100100       VAR        {}  GO:0020033   

                DB_Reference Evidence_Code With_From  NS  \
0  {VEuPathDB:PF3D7_0100100}           IEA        {}  BP   
1             {PMID:9230440}           TAS        {}  BP   
2  {VEuPathDB:PF3D7_0100100}           TAS        {}  BP   
3            {PMID:18785843}           TAS        {}  BP   
4  {VEuPathDB:PF3D7_0100100}           TAS        {}  BP   

                                    DB_Name   DB_Synonym         DB_Type  \
0  {erythrocyte membrane protein 1, PfEMP1}  {VAR-UPSB1}  protein_coding   
1  {erythrocyte membrane protein 1, PfEMP1}  {VAR-UPSB1}  protein_coding   
2  {erythrocyte membrane protein

### Preprocessing the DataFrame so its columns have the right names and format for loading into the database

In [63]:
def preprocess_dataframe(df, multi_value_sep=None):
    """
    Flatten the collection-valued GAF columns to scalars so the frame can be stored in a database.

    The columns DB_Reference, With_From, DB_Name, DB_Synonym, Qualifier and
    Gene_Product_Form_ID are reduced from sets to a single value (None when empty),
    and Taxon is reduced from a list to its first entry as an int.

    The input frame is not modified; a processed copy is returned. Values that are
    already scalars are passed through, so calling the function on its own output
    leaves the data unchanged.

    Parameters:
    - df: DataFrame containing the columns listed above plus 'Taxon'.
    - multi_value_sep: str or None. When None, a collection with several members is
      reduced to the member that sorts first by its string form, and the other members
      are dropped. When a string, all members are kept, sorted and joined with it.

    Returns:
    - DataFrame, a processed copy of ``df``.

    Raises:
    - KeyError: if one of the expected columns is missing.
    """
    collection_columns = [
        'DB_Reference',
        'With_From',
        'DB_Name',
        'DB_Synonym',
        'Qualifier',
        'Gene_Product_Form_ID',
    ]

    def flatten(value):
        if isinstance(value, (set, frozenset, list, tuple, dict)):
            if not value:
                return None
            # Sets have no stable order; sorting makes the chosen value reproducible between runs
            members = sorted(value, key=str)
            if multi_value_sep is None:
                return members[0]
            return multi_value_sep.join(str(member) for member in members)
        # Already-flattened values must pass through untouched: indexing a string
        # the way a set is indexed would keep only its first character
        if isinstance(value, str):
            return value or None
        if value is None or pd.isna(value):
            return None
        return value

    def first_taxon(value):
        if isinstance(value, (list, tuple)):
            return int(value[0]) if value else None
        if value is None or pd.isna(value):
            return None
        # Already a scalar taxon id (e.g. from an earlier pass)
        return int(value)

    # Work on a copy so the caller's frame keeps its original collection values
    result = df.copy()
    for column in collection_columns:
        result[column] = result[column].apply(flatten)

    result['Taxon'] = result['Taxon'].apply(first_taxon)

    return result

In [65]:
# df is the DataFrame built from the GAF namedtuples above.
# After this cell the name df refers to the flattened frame, not the raw set-valued one.
df = preprocess_dataframe(df)  # Call the preprocessing function
print(df.head())

          DB          DB_ID DB_Symbol Qualifier       GO_ID  \
0  VEuPathDB  PF3D7_0100100       VAR      None  GO:0009405   
1  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020013   
2  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020013   
3  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020033   
4  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020033   

              DB_Reference Evidence_Code With_From  NS  \
0  VEuPathDB:PF3D7_0100100           IEA      None  BP   
1             PMID:9230440           TAS      None  BP   
2  VEuPathDB:PF3D7_0100100           TAS      None  BP   
3            PMID:18785843           TAS      None  BP   
4  VEuPathDB:PF3D7_0100100           TAS      None  BP   

                                  DB_Name DB_Synonym         DB_Type  Taxon  \
0  erythrocyte membrane protein 1, PfEMP1  VAR-UPSB1  protein_coding  36329   
1  erythrocyte membrane protein 1, PfEMP1  VAR-UPSB1  protein_coding  36329   
2  erythrocyte membrane protein 1, 

In [72]:
# Check non-null counts and dtypes per column after preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31735 entries, 0 to 31734
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DB                    31735 non-null  object
 1   DB_ID                 31735 non-null  object
 2   DB_Symbol             31735 non-null  object
 3   Qualifier             10 non-null     object
 4   GO_ID                 31735 non-null  object
 5   DB_Reference          31735 non-null  object
 6   Evidence_Code         31735 non-null  object
 7   With_From             2757 non-null   object
 8   NS                    31735 non-null  object
 9   DB_Name               31735 non-null  object
 10  DB_Synonym            7988 non-null   object
 11  DB_Type               31735 non-null  object
 12  Taxon                 31735 non-null  int64 
 13  Date                  31735 non-null  object
 14  Assigned_By           31735 non-null  object
 15  Extension             0 non-null    

### Performing the same for another GAF dataset

In [48]:
# Read the curated GO annotation (GAF) file with the same reader as above
ogafCurated = GafReader("../data/gaf/PlasmoDB-CURRENT_Pfalciparum3D7_Curated_GO.gaf")

HMS:0:00:00.300519  10,655 annotations READ: ../data/gaf/PlasmoDB-CURRENT_Pfalciparum3D7_Curated_GO.gaf 


In [49]:
# Copy the curated annotation records into a list ordered by their DB_ID field
nts_curated = list(ogafCurated.associations)
nts_curated.sort(key=lambda record: record.DB_ID)

# Show the first curated record as a sample
print(nts_curated[0])

ntgafobj(DB='VEuPathDB', DB_ID='PF3D7_0100100', DB_Symbol='VAR', Qualifier=set(), GO_ID='GO:0020013', DB_Reference={'PMID:9230440'}, Evidence_Code='TAS', With_From=set(), NS='BP', DB_Name={'erythrocyte membrane protein 1, PfEMP1'}, DB_Synonym={'VAR-UPSB1'}, DB_Type='protein_coding', Taxon=[36329], Date=datetime.date(2024, 3, 21), Assigned_By='GeneDB', Extension=None, Gene_Product_Form_ID=set())


In [80]:
# Convert the curated GAF namedtuples to a DataFrame (one row per annotation)
df_curated = convert_to_dataframe(nts_curated)

In [81]:
# Flatten the collection-valued columns of the curated annotations
df_curated = preprocess_dataframe(df_curated)  # Call the preprocessing function
print(df_curated.head())

          DB          DB_ID DB_Symbol Qualifier       GO_ID   DB_Reference  \
0  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020013   PMID:9230440   
1  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020033  PMID:18785843   
2  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020035  PMID:11420100   
3  VEuPathDB  PF3D7_0100100       VAR      None  GO:0098609  PMID:11316367   
4  VEuPathDB  PF3D7_0100100       VAR      None  GO:0020002  PMID:11420100   

  Evidence_Code With_From  NS                                 DB_Name  \
0           TAS      None  BP  erythrocyte membrane protein 1, PfEMP1   
1           TAS      None  BP  erythrocyte membrane protein 1, PfEMP1   
2           TAS      None  BP  erythrocyte membrane protein 1, PfEMP1   
3           TAS      None  BP  erythrocyte membrane protein 1, PfEMP1   
4           TAS      None  CC  erythrocyte membrane protein 1, PfEMP1   

  DB_Synonym         DB_Type  Taxon        Date Assigned_By Extension  \
0  VAR-UPSB1  prote

In [76]:
# Inspect the first curated row field by field
df_curated.iloc[0]

DB                                                   VEuPathDB
DB_ID                                            PF3D7_0100100
DB_Symbol                                                  VAR
Qualifier                                                 None
GO_ID                                               GO:0020013
DB_Reference                                      PMID:9230440
Evidence_Code                                              TAS
With_From                                                 None
NS                                                          BP
DB_Name                 erythrocyte membrane protein 1, PfEMP1
DB_Synonym                                           VAR-UPSB1
DB_Type                                         protein_coding
Taxon                                                    36329
Date                                                2024-03-21
Assigned_By                                             GeneDB
Extension                                              

In [77]:
# Check non-null counts and dtypes per column of the curated frame
df_curated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10655 entries, 0 to 10654
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DB                    10655 non-null  object
 1   DB_ID                 10655 non-null  object
 2   DB_Symbol             10655 non-null  object
 3   Qualifier             10 non-null     object
 4   GO_ID                 10655 non-null  object
 5   DB_Reference          10655 non-null  object
 6   Evidence_Code         10655 non-null  object
 7   With_From             2757 non-null   object
 8   NS                    10655 non-null  object
 9   DB_Name               10655 non-null  object
 10  DB_Synonym            3533 non-null   object
 11  DB_Type               10655 non-null  object
 12  Taxon                 10655 non-null  int64 
 13  Date                  10655 non-null  object
 14  Assigned_By           10655 non-null  object
 15  Extension             0 non-null    

In [82]:
# Shape the curated annotations for the gene_annotations upload.
# The steps are chained and reassigned instead of applied in place so that this cell can
# be re-run: rename skips keys that are no longer present, and errors='ignore' lets drop
# tolerate columns that were already removed by an earlier run.
df_curated = (
    df_curated
    .rename(columns={
        'DB': 'source',
        'DB_ID': 'gene_id',
        'DB_Symbol': 'symbol',
        'GO_ID': 'go_id',
        'DB_Reference': 'reference',
        'Evidence_Code': 'evidence',
        'NS': 'aspect',  # Assuming 'NS' maps to 'aspect'
        'DB_Name': 'description',
        'DB_Synonym': 'synonym',
        'DB_Type': 'gene_type',
        'Taxon': 'taxon',
        'Date': 'date',
        'Assigned_By': 'assigned_by'
    })
    # These columns are not uploaded
    .drop(columns=['Extension', 'Gene_Product_Form_ID', 'Qualifier', 'With_From'], errors='ignore')
    .astype({'taxon': int})
    # Represent missing values uniformly as pd.NA
    .fillna(value=pd.NA)
)


In [59]:
def upload_data(df, table_name, engine):
    """
    Append the rows of ``df`` to ``table_name`` and print a confirmation message.

    Parameters:
    - df: DataFrame to write; its index is not written.
    - table_name: str, name of the target table.
    - engine: connection or engine handed to ``DataFrame.to_sql``.
    """
    to_sql_options = dict(con=engine, if_exists='append', index=False)
    df.to_sql(table_name, **to_sql_options)
    print(f"Data successfully uploaded to {table_name}.")

In [84]:
# Append the curated annotations to the gene_annotations table
upload_data(df_curated, 'gene_annotations', engine) 

Data successfully uploaded to gene_annotations.
