# Data curation

Goal: Process the data from the raw database to make it ready for data analysis. The curated data is stored in a new table called `prod` (referred to below as the prod database).

In [15]:
# Data curation
import sqlite3

# Used for pretty printing
import pandas as pd

# Open the SQLite database file "unified.db" (sqlite3 creates the file if it
# does not exist yet) and get a cursor for running statements.
# Later cells read from a table named `initial`, so the file is presumably
# expected to already contain the raw data.
con = sqlite3.connect("unified.db")
cur = con.cursor()

# Enable REGEX for sqlite
import re


def regexp(expr, item):
    """Python implementation of SQLite's two-argument REGEXP operator.

    SQLite rewrites ``X REGEXP Y`` into ``regexp(Y, X)``, so ``expr`` is the
    pattern and ``item`` is the value being tested.

    Args:
        expr: Regular expression pattern (``re`` syntax).
        item: Value to search in, or ``None`` when the SQL value is NULL.

    Returns:
        ``True`` if the pattern matches anywhere in ``item``, ``False`` if it
        does not, and ``None`` (SQL NULL) when ``item`` is NULL.
    """
    # A NULL column value arrives as None; re.search would raise TypeError and
    # abort the whole SQL statement. Propagate NULL instead, like built-in
    # SQL operators do.
    if item is None:
        return None
    return re.search(expr, item) is not None

# Expose the Python helper to SQLite so that queries on this connection can
# use the two-argument REGEXP operator.
con.create_function("REGEXP", 2, regexp)

# Build `prod` as a copy of `initial`, but only on the first run: look the
# table up in sqlite_master and skip the creation when it is already there.
cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='prod'")
prod_exists = bool(cur.fetchone())  # fetchone() gives None when no row matched
if prod_exists:
    print("Table 'prod' already exists.")
else:
    cur.execute("CREATE TABLE prod AS SELECT * FROM initial")
    print("Table 'prod' created successfully.")



Table 'prod' already exists.
[(0, 'id', 'TEXT', 0, None, 0), (1, 'name', 'TEXT', 0, None, 0), (2, 'AB', 'INT', 0, None, 0), (3, 'description', 'TEXT', 0, None, 0), (4, 'OX', 'TEXT', 0, None, 0), (5, 'dataset', 'TEXT', 0, None, 0), (6, 'seq', 'TEXT', 0, None, 0), (7, 'seq_len', 'INT', 0, None, 0)]


## Valid sequences

Check whether each sequence contains only valid amino acids, i.e. no whitespace and no letters that are not valid amino-acid codes.

In [5]:
# List every row of `initial` whose sequence contains at least one character
# outside the accepted amino-acid alphabet (X is accepted here).
# To also flag X, use: WHERE seq REGEXP '[^ARNDCEQGHILKMFPSTWYV]';
invalid_seq_sql = """SELECT *
                FROM initial
                WHERE seq REGEXP '[^ARNDCEQGHILKMFPSTWYVX]'; 
"""
pd.read_sql_query(invalid_seq_sql, con)



Unnamed: 0,id,name,AB,description,OX,dataset,seq,seq_len
0,L11A004522,LAMP2_L11A004522,1,,alien,LAMP2.fasta,kklaklallkwllalkklallalkk,25
1,L13A15655,LAMP2_L13A15655,1,,alien,LAMP2.fasta,kklfkkilkyL,11
2,DRAMP20856,dramp_DRAMP20856,1,,alien,dramp_antimicrobial.fasta,OWOWOWORPVYOPRPRPPHPRL,22
3,DRAMP20857,dramp_DRAMP20857,1,,alien,dramp_antimicrobial.fasta,OIOIORPVYOPRPRPPHPRL,20
4,DRAMP21410,dramp_DRAMP21410,1,,alien,dramp_antimicrobial.fasta,klckivvikvck,12
...,...,...,...,...,...,...,...,...
88,P21986,FLA3_SPIAU,0,Flagellar filament 32 kDa core protein (Fragme...,Spirochaeta aurantia OX=147,uniprot_swissprot.fasta,MIINHNMSAINANRVLGBT,19
89,P21987,FLA4_SPIAU,0,Flagellar filament 31.5 kDa core protein (Frag...,Spirochaeta aurantia OX=147,uniprot_swissprot.fasta,MIINHNMSAINANRVLGBTNADITKDL,27
90,P25072,PA21_MICTM,0,Phospholipase A2 1 (Fragment),Micrurus tener microgalbineus OX=8636,uniprot_swissprot.fasta,SLLBFKBMIEST,12
91,P35707,FLAV_NOSSM,0,Flavodoxin (Fragment),Nostoc sp. (strain MAC) OX=35822,uniprot_swissprot.fasta,SKKIGLFYGTZTGKTESVAEIIDEFGDEVVTLDID,35


Problems found in seq:
* non-capitalized letters
* the letter X
* letters outside the 20 standard amino acids, e.g. O, B and Z (see the result above)

These problems are addressed in the prod database by only allowing data that conforms to the following restrictions to be added:
 * only valid amino acids
 * all capitalized
 * AB only being 0 or 1
 * dataset must not be null
 * seq length between 1 and 200 (note: the CREATE TABLE statement below does not enforce this)
 * seq must be unique

In [24]:
# Start from a clean slate so the cell can be re-run: remove any earlier `prod`.
cur.execute("DROP TABLE IF EXISTS prod;")

# Schema of the curated table. Declared constraints:
#   * AB must be present and either 0 or 1
#   * dataset must be present
#   * seq must be unique, all upper case, and consist only of the letters
#     ARNDCEQGHILKMFPSTWYVX (checked with the REGEXP function registered on
#     the connection)
# Note that no length limit on seq is declared here and seq itself may be NULL.
prod_ddl = """
CREATE TABLE prod (
    id TEXT,
    name TEXT,
    AB INTEGER NOT NULL CHECK (AB IN (0, 1)),
    description TEXT,
    OX TEXT,
    dataset TEXT NOT NULL,
    seq TEXT UNIQUE CHECK (seq = UPPER(seq) AND seq REGEXP '^[ARNDCEQGHILKMFPSTWYVX]+$')
);
"""
cur.execute(prod_ddl)

# A CREATE TABLE statement yields no result rows, so this prints an empty list.
print(cur.fetchall())

[]


# PROD and rejected table
The new prod table enforces data integrity, meaning incoming data must adhere to the restrictions listed above. The next step is to import the data into the prod db. Rejected data is stored in "rejected" for debugging purposes.

In [26]:
# Collect rejected rows: copy every row of `initial` that violates one of the
# prod restrictions into `rejected`, together with the first reason that
# applies, so that no data is lost.
#
# NULL values are tested explicitly. In SQL, `NOT (AB IN (0, 1))` evaluates to
# NULL (not TRUE) when AB is NULL, so such rows would be skipped by the filter
# even though prod declares AB as NOT NULL. The same holds for a NULL seq.
# COALESCE(seq, '') additionally guarantees that the Python REGEXP helper is
# never called with NULL, whatever order SQLite evaluates the terms in.
#
# NOTE(review): assumes the `rejected` table already exists with these
# columns; it is not created in this part of the notebook.
# NOTE(review): no commit is issued in this cell; confirm that a later cell
# calls con.commit() so the inserted rows are persisted.
cur.execute("""
INSERT INTO rejected (id, name, AB, description, OX, dataset, seq, reason_for_rejection)
SELECT 
    id, 
    name, 
    AB, 
    description, 
    OX, 
    dataset, 
    seq, 
    CASE 
        WHEN dataset IS NULL THEN 'Dataset is NULL'
        WHEN AB IS NULL OR AB NOT IN (0, 1) THEN 'AB is not 0 or 1'
        WHEN seq IS NULL THEN 'Seq is NULL'
        WHEN NOT (seq = UPPER(seq) AND COALESCE(seq, '') REGEXP '^[ARNDCEQGHILKMFPSTWYVX]+$') THEN 'Seq format error'
        ELSE 'Unknown reason'  -- Handles other potential issues, such as application logic errors
    END
FROM initial
WHERE dataset IS NULL 
    OR AB IS NULL
    OR AB NOT IN (0, 1)
    OR seq IS NULL
    OR NOT (seq = UPPER(seq) AND COALESCE(seq, '') REGEXP '^[ARNDCEQGHILKMFPSTWYVX]+$');

""")

<sqlite3.Cursor at 0x7fca4b236ac0>

## Duplicates
Check whether there are duplicate sequences.

In [20]:
# Count how often each sequence occurs in `initial` and keep only the
# sequences that occur more than once.
duplicate_seq_sql = """SELECT seq, COUNT(seq)
                FROM initial
                GROUP BY seq
                HAVING COUNT(seq) > 1;
"""
pd.read_sql_query(duplicate_seq_sql, con)

Unnamed: 0,seq,COUNT(seq)
0,FALALKAKKL,12864
