# Data curation

Goal: Process the data from the raw database to make it ready for data analysis. This new database will be called prod

In [1]:
# Data curation
import sqlite3

# Used for pretty printing
import pandas as pd

# Creating empty data base
con = sqlite3.connect("unified.db")
cur = con.cursor()

# Enable REGEX for sqlite
import re


def regexp(expr, item):
    reg = re.compile(expr)
    return reg.search(item) is not None

con.create_function("REGEXP", 2, regexp)

# Create new table and check for multiple executions
cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='prod'")
if not cur.fetchone():  # If the fetch returns None, table does not exist
    cur.execute("CREATE TABLE prod AS SELECT * FROM initial")
    print("Table 'prod' created successfully.")
else:
    print("Table 'prod' already exists.")



Table 'prod' already exists.


## Valid sequences

Check wether sequence only contains valid amino acids and no whitespaces or any other letters not being valid AA's.

In [2]:
# This query checks if there are any symbols other the the valid aa code
pd.read_sql_query("""SELECT *
                FROM initial
                WHERE seq REGEXP '[^ARNDCEQGHILKMFPSTWYV]'; 
""", con)
 # WHERE seq REGEXP '[^ARNDCEQGHILKMFPSTWYV]'; to not include X



Unnamed: 0,id,name,AB,description,OX,dataset,seq,seq_len


Problems found in seq:
* non capitalized letters
* the letter X and B in seq



# The lower case problem
This can be solved pretty easy by just replacing all the lowercase seqences with upper case ones.


In [3]:
cur.execute("""
UPDATE initial
SET seq = UPPER(seq)
WHERE seq != UPPER(seq);
""")

print(cur.fetchall())

[]


# The other constraints
Idea is to add a new col called valid. This either says yes or no including the reason why not.

In [4]:
# create col
cur.execute("""
ALTER TABLE initial
ADD COLUMN valid TEXT DEFAULT 'yes';
""")

# update col
cur.execute("""
UPDATE initial
SET valid = CASE 
    WHEN AB NOT IN (0, 1) THEN 'Invalid AB value'
    WHEN dataset IS NULL THEN 'Dataset is null'
    WHEN LENGTH(seq) < 1 OR LENGTH(seq) > 200 THEN 'Invalid seq length'
    WHEN seq REGEXP '^[ARNDCEQGHILKMFPSTWYV]+$' THEN valid
    ELSE 'Invalid sequence characters'
END;
""")

# uniqueness check
cur.execute("""
UPDATE initial
SET valid = 'Seq must be unique'
WHERE rowid NOT IN (
    SELECT MIN(rowid)
    FROM initial
    GROUP BY seq
);
""")


<sqlite3.Cursor at 0x7f27188be840>

# Reasons why data is rejected

In [5]:
pd.read_sql_query("SELECT COUNT(*),valid FROM initial GROUP BY valid", con)

Unnamed: 0,COUNT(*),valid
0,833,Seq must be unique
1,13133,yes


I think we have engough data to just reject the 34 invalid sequences.

# Duplicates
Before we just reject all duplicates we need to check wether they are true duplicates meaning if they are stemming from multiple datasets or just technical replicates.

Definition used here:

**technical replicate:** if dataset and seq is a duplicate -

**duplicate:** if only seq is a duplicate but sources state the same AB

**contradictory duplicate** matching seq different source and different AB 

In [6]:
 def identify_duplicates():   
    # Step 1: Identify Technical Replicates
    cur.execute("""
    UPDATE initial
    SET valid = 'technical replicate'
    WHERE seq IN (
        SELECT seq
        FROM initial
        GROUP BY seq, dataset
        HAVING COUNT(*) > 1
    );
    """)
    
    # Step 2: Identify Duplicates (same seq, multiple datasets, same AB)
    cur.execute("""
    UPDATE initial
    SET valid = 'duplicate'
    WHERE seq IN (
        SELECT seq
        FROM initial
        GROUP BY seq, AB
        HAVING COUNT(DISTINCT dataset) > 1
    ) AND valid != 'technical replicate';
    """)
    
    # Step 3: Identify Contradictory Duplicates (same seq, different AB in different datasets)
    cur.execute("""
    UPDATE initial
    SET valid = 'contradictory duplicate'
    WHERE seq IN (
        SELECT seq
        FROM initial
        GROUP BY seq
        HAVING COUNT(DISTINCT AB) > 1 AND COUNT(DISTINCT dataset) > 1
    ) AND valid NOT IN ('technical replicate', 'duplicate');
    """)


identify_duplicates()
#pd.read_sql_query("SELECT * FROM initial WHERE valid = 'contradictory duplicate' ORDER BY seq;", con).to_excel("contradictions.xlsx")
pd.read_sql_query("SELECT COUNT(*), valid FROM initial GROUP BY valid", con)

Unnamed: 0,COUNT(*),valid
0,1138,technical replicate
1,12828,yes


# Adressing contradictory duplicates
Theese stem from negative data sources (i.e uniprot). We can reject the ones from uniprot.

In [7]:
cur.execute("""
UPDATE initial
SET AB = 1
WHERE dataset = 'uniprot_swissprot.fasta' AND valid = 'contradictory duplicate';
""")

identify_duplicates()
pd.read_sql_query("SELECT COUNT(*), valid FROM initial GROUP BY valid", con)

Unnamed: 0,COUNT(*),valid
0,1138,technical replicate
1,12828,yes


This means all duplicates stem from the uniprot data base. We have no other duplicates.

# Adressing Invalid sequences and technical duplicates
we just remove them.


In [8]:
cur.execute("""
DELETE FROM initial
WHERE NOT seq REGEXP '^[ARNDCEQGHILKMFPSTWYV]+$' OR valid = 'technical replicate';
""")

identify_duplicates()
pd.read_sql_query("SELECT COUNT(*), valid FROM initial GROUP BY valid", con)

Unnamed: 0,COUNT(*),valid
0,12828,yes


# Merge duplicates
To remove the duplicates but keep the information we will merge theese rows containing the duplicates.

From here on we will work with a new table which enforces data integrity. The table enforces the folloing conditions:


 * only valid amino acid seq
 * AB only being 0 or 1
 * AB cant bo 0
 * dataset must not be null
 * seq length between 1 and 200
 * seq must be unique and not null

I have decided to designate seq as database keys. This enforces uniqueness, non nullabilty and improves lookup performance.

In [9]:
pd.read_sql_query("SELECT * FROM  initial WHERE valid = 'duplicate' ORDER BY seq LIMIT 4;", con)

Unnamed: 0,id,name,AB,description,OX,dataset,seq,seq_len,valid


In [10]:
# Step 1: Create a new table for the merged results
try: 
    cur.execute(""" DROP TABLE prod; """)
except Exception as e:
    print("")

cur.execute("""
CREATE TABLE IF NOT EXISTS prod (
    id TEXT,
    name TEXT,
    AB INTEGER NOT NULL CHECK (AB IN (0, 1)),
    description TEXT,
    OX TEXT,
    source TEXT NOT NULL,
    seq TEXT PRIMARY KEY CHECK (seq = UPPER(seq) AND seq REGEXP '^[ARNDCEQGHILKMFPSTWYV]+$'),
    valid TEXT
);
""")



# Step 2: Insert aggregated data into the new table INSERT INTO prod(id, name, AB, description, OX, source, seq, valid)
cur.execute("""
INSERT INTO prod(id, name, AB, description, OX, source, seq, valid)
SELECT 
    GROUP_CONCAT(id, '; ') AS id,
    GROUP_CONCAT(name, '; ') AS name,
    AB,
    GROUP_CONCAT(description, '; ') AS description,
    GROUP_CONCAT(OX, '; ') AS OX,
    GROUP_CONCAT(dataset, '; ') AS source,
    UPPER(seq) AS seq,
    'yes - merged duplicate' AS valid
FROM initial
WHERE valid = 'duplicate' AND seq REGEXP '^[ARNDCEQGHILKMFPSTWYV]+$'
GROUP BY seq;
""")

<sqlite3.Cursor at 0x7f27188be840>

In [11]:
pd.read_sql_query("SELECT * FROM  prod;", con)

Unnamed: 0,id,name,AB,description,OX,source,seq,valid


830 duplicates successfully merged into 415 concatenated rows.

# Merge the rest

Add valid sequences into valid


In [12]:
cur.execute("""
INSERT INTO prod
SELECT id, name, AB, description, OX, dataset, seq, valid FROM initial
WHERE valid = 'yes';
""")

pd.read_sql_query("SELECT * FROM  prod;", con)

Unnamed: 0,id,name,AB,description,OX,source,seq,valid
0,ADAM_2177,InverPep_ADAM_2177,1,,alien,InverPep.fasta,GLFNVFKGLKTAGKHVAGSLLNQLKCKVSGGC,yes
1,DBAASP13663,"HistoneH2A(1-21),Fi-Histin",1,,alien,InverPep.fasta,SRSSRAGLQFPVGRIHRLLRK,yes
2,DBAASP729,Histatin5(5-22),1,,alien,InverPep.fasta,KRHHGYKRKFHEKHHSHR,yes
3,DBAASP644,Gaegurin5(1-11)[F1W],1,,alien,InverPep.fasta,WLGALFKVASK,yes
4,ADAM_0664,InverPep_ADAM_0664,1,,alien,InverPep.fasta,DLWNSIKDMAAAAGRAALNAVTGMVNQ,yes
...,...,...,...,...,...,...,...,...
12823,Q7M463,SCK6_MESMA,0,Neurotoxin BmK A3-6,Mesobuthus martensii OX=34649,uniprot_swissprot.fasta,LPYPVNCKTECECVMCGLGIICKQCYYQQ,yes
12824,Q7NSS5,Y3345_CHRVO,0,UPF0434 protein CV_3345,Chromobacterium violaceum (strain ATCC 12472 /...,uniprot_swissprot.fasta,MDAKFLEILVCPLCKGPLVFDKSKDELICKGDRLAFPIKDGIPMML...,yes
12825,Q8QHM9,Y56_SIRV1,0,Uncharacterized protein 56,Sulfolobus islandicus rod-shaped virus 1 OX=15...,uniprot_swissprot.fasta,MKKEIQVQGVRYYVESEDDLVSVAHELAKMGYTVQQIANALGVSER...,yes
12826,Q9R4N8,RL33_BREVE,0,Large ribosomal subunit protein bL33 (Fragment),Brevundimonas vesicularis OX=41276 GN=rpmG,uniprot_swissprot.fasta,CKPASIKIRLNSTADTGFYV,yes


# Final health check

In [13]:
pd.read_sql_query("SELECT COUNT(*), valid FROM prod GROUP BY valid", con)

Unnamed: 0,COUNT(*),valid
0,12828,yes


# Save and commit


In [14]:
cur.execute("DROP TABLE initial")
con.commit()
con.close()