# DNA Identification in Python
## A program capable of identifying a person based on their DNA.

In [1]:
# Import libraries to be used
import csv

from sys import argv, exit
from typing import Optional

__NOTE:__ Using _manually generated_ `argv` of type list of arguments instead of `sys.argv` for simplicity.

[Jupyter Notebook and Command-line arguments possible solution](https://gist.github.com/gbishop/acf40b86a9bca2d571fa)

In [2]:
# Stand-in for sys.argv (rebinds the `argv` name imported from sys):
# slot 0 is the script name, slots 1 and 2 are typed in by the user
argv: list = ["dna.py", input("Database: "), input("Sequence: ")]

### Using command line arguments
__Usage:__

```shell
$ python3 dna.py $DATABASE $SEQUENCE
```

In [3]:
# Require exactly three entries: the script name plus the database and sequence paths
if len(argv) != 3:
    print("Usage: python3 dna.py $DATABASE $SEQUENCE")

    # Exit with an error and code 1
    exit(1)

### A check is needed to determine valid file selection

1. __Command line-argument__ - `database_path` of type `.csv`

2. __Command line-argument__ - `sequence_path` of type `.txt`

_Otherwise error_

In [4]:
# The paths are the 2nd and 3rd entries of the argument list
database_path, sequence_path = argv[1], argv[2]

# The extension is whatever follows the LAST dot, so relative paths such as
# "./data/small.csv" and names such as "db.v2.csv" are handled correctly.
# A path without any dot has no extension ("") instead of raising IndexError.
file_ext1 = database_path.rpartition(".")[2] if "." in database_path else ""
file_ext2 = sequence_path.rpartition(".")[2] if "." in sequence_path else ""

### A check for valid file extensions

In [5]:
# Proceed only when BOTH files carry the expected extension
if not (file_ext1 == "csv" and file_ext2 == "txt"):
    print("Usage: python3 dna.py $DATABASE[.csv] $SEQUENCE[.txt]")

    # Wrong file type(s) -> terminate with exit status 1
    exit(1)

### Define a function to prompt the user with an error message of file's path

In [6]:
def file_error(path):
    """Report that `path` could not be opened, then terminate with exit status 1."""
    message = f"Could not open {path}"
    print(message)
    exit(1)


### Compute function is supposed to return the longest chain created from a given nucleotide

In [7]:
# Create a function to perform the computations of alike chains of STRs
def compute(sequence, nucleotide) -> int:
    """
    Return the length of the longest run of back-to-back copies of `nucleotide`.

    Parameters:
        sequence:   the DNA sequence to scan (a string)
        nucleotide: the STR to look for (a string)

    Returns:
        The highest number of consecutive repetitions of `nucleotide` found
        anywhere in `sequence`; 0 when it never occurs or when `nucleotide`
        is empty.
    """

    # Get the length of the corresponding nucleotide
    n = len(nucleotide)

    # An empty STR would match at every position without ever advancing
    if n == 0:
        return 0

    # Longest chain seen so far (0 -> the STR does not occur at all)
    longest = 0

    # Visit every possible start position, INCLUDING the last one
    # (len(sequence) - n), so a copy sitting at the very end is not missed
    for i in range(len(sequence) - n + 1):

        # Not the start of a copy -> nothing to count here
        if sequence[i:i + n] != nucleotide:
            continue

        # A copy directly preceded by another copy merely continues a chain
        # that was already measured from its head
        if i >= n and sequence[i - n:i] == nucleotide:
            continue

        # Count the head, then keep jumping n chars while the chain goes on;
        # a slice past the end is shorter than n and therefore ends the loop
        run = 1
        following = i + n
        while sequence[following:following + n] == nucleotide:
            run += 1
            following += n

        longest = max(longest, run)

    # Return the biggest of them
    return longest

### Check function is supposed to return the name of a person or None (if not found)

In [8]:
# Create a check function (to output str or None)
def check(comp_data, database) -> Optional[str]:
    """
    Compare computed STR counts with one person's database record.

    Parameters:
        comp_data: list of computed counts, in the database's column order
        database:  one record as a dict whose first entry is the 'name' column

    Returns:
        The person's name when every count matches, otherwise None.
    """

    # Compare the lists (take the record's values in order and skip the first, 'name')
    if comp_data == list(database.values())[1:]:

        # Return the name of the person
        return database['name']

    # Otherwise there is no match
    return None


### Open selected files and handle possible errors
__Usage:__

<!-- Preview code in MD -->
```python
try:
    file = open(path, "#")
except OSError:
    file_error(path)
```

[__Docs__][link1]

[link1]:https://docs.python.org/3/tutorial/errors.html

In [9]:
# Open database file and handle possible errors
# (newline="" lets the csv module do its own newline handling, as its docs require)
try:
    database_file = open(database_path, "r", newline="")
except OSError:
    file_error(database_path)

# Open sequence file and handle possible errors
try:
    sequence_file = open(sequence_path, "r")
except OSError:

    # Release the already opened database handle before reporting the failure
    database_file.close()
    file_error(sequence_path)


<!-- Explanation of used data structures -->
## File structures

### Database file
Custom database of type [csv][ID1] containing gathered people and their occurrence of particular [STR(s)][ID4].

<!-- TABLE with file structure -->
__Structure:__

| Name | STR1 | STR2 | ... | STRn |
| :-:  | :--: | :--: | :-: | :--: |
| Name | n    | n    | ... | n    |

\*Where `n` represents the numerical occurrence of a specific __STR__ in person's DNA sequence.

### Sequence file
__DNA__ is just a sequence of nucleotides arranged in a shape! [[1]][ID2]

__Structure:__

```
B1B2B3 ... Bn
```

\*Where `B` represents an indexed base of DNA's nucleotide.

<!-- TABLE with base pair division (in DNA)-->
__Possible nucleotides:__

| A | C | G | T |
| :-: | :-: | :-: | :-: |
| Adenine | Cytosine | Guanine | Thymine |

[Source][ID3]

<!-- REFS -->
[ID1]: https://support.google.com/google-ads/answer/9004364?hl=en
[ID2]: https://www.nature.com/scitable/topicpage/dna-is-a-structure-that-encodes-biological-6493050/
[ID3]: https://www.genome.gov/genetics-glossary/Nucleotide
[ID4]: https://www.future-science.com/doi/10.2144/000112582


In [10]:
# Continue with valid files
with sequence_file:

    # Read only the first line of the dna sequence file and drop its trailing '\n'.
    # readline() returns "" for an empty file, so an empty file produces an empty
    # sequence instead of an IndexError, and the rest of the file is never loaded.
    sequence = sequence_file.readline().rstrip("\n")

def repl(row) -> dict:
    """
    Convert one csv row into a dict whose digit-only values are ints.

    Every value made up solely of digits is replaced by its int form,
    any other value is kept as it is. The order of the keys is preserved.
    """

    data: dict = {}

    # Walk the row field by field and convert where possible
    for key, value in row.items():
        data[key] = int(value) if value.isdigit() else value

    return data

# Load data from the dna database and call the repl function
with database_file:

    # Populate list with respective rows of type dict
    dna_database: list = [repl(row) for row in csv.DictReader(database_file)]

# Store possible nucleotides (omit 'name' key -> start from index 1);
# a database without any rows yields no nucleotides instead of an IndexError
nucleotides: list = list(dna_database[0])[1:] if dna_database else []

# Search for chains of repeating nucleotides
computed_data: list = [compute(sequence, nucleotide) for nucleotide in nucleotides]

# Call the check exactly once per person and keep only real matches (not of type None)
names: list = [
    name
    for name in (check(computed_data, dna) for dna in dna_database)
    if name is not None
]

## Content of input files

In this section, `pandas` library will be used to display individual data sets using a `DataFrame`.

__File structure(s)__ described above.

- [Pandas DataFrame][DataFrame]

Moreover, to display results of __DNA sequence__ in `Markdown`, `IPython` will be used.

- [IPython.display][IPython]

<!-- DOCS links -->
[DataFrame]: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html
[IPython]: https://ipython.readthedocs.io/en/stable/api/generated/IPython.display.html

In [11]:
# Import the IPython display helpers and the pandas lib.
from IPython.display import HTML, Markdown, display
import pandas as pd

# Create a dataframe whose rows are numbered from 1
dt = pd.DataFrame(data=dna_database, index=list(range(1, len(dna_database) + 1)))
display(Markdown("#### DNA database"))
print(dt, end="\n" * 3)

# Display sequence in markdown
display(Markdown(f"#### DNA sequence\n`\n{sequence}\n`"))

#### DNA database

     name  AGATC  TTTTTTCT  AATG  TCTAG  GATA  TATC  GAAA  TCTG
1  Daniel     14        44    28     27    19     7    25    20
2    Fero     29        29    40     31    45    20    40    35
3  Michal      6        18     5     42    39    28    44    22
4    Adam     37        47    13     25    17     6    13    35
5  Rachel     29        27    32     41     6    27     8    34
6    Mike     31        11    28     26    35    19    33     6




#### DNA sequence
```sh
GGGTGAATCTCGGACAAAAGGGACCCAGTAATGGGAAAACACCCTGTACTTTCATTTATCTTAAGGAAAAGGCACTGCGACTTGTGACTGTTTTGTGCTTACCGGTCTGCTGACCACCTCCGCAAGATTCACCGGGCCCTCGCCCCTGGGCCCGCGGGGCTGTTTCCCCATTAATTCTGCACGGGCGAAGGCGGCCCCTCGGCCCGATAACGGACGTTGAAAGGCACCCAGTGTACCAACTCTATCGTTGTTAATCTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTCTAGTATTACAGCGGCATCACTATACTATAATAGTCCGACATGTTAGAATTCTGACGCGTCGAACCGTAGGCGAGTCCGAGTTTCCTACTCCTCACGTGGTCAGAAGTCCCGCTCGAGGATACCGATAAGCCTGAATTCTTCATTTCTACTAACTCGACAGACTGACTCACGCTAGTTTGGTTACTGTCCCGTACCCGCGTTTTCCACCTTAGTCTTGCTGGCGTACACTTTCAAGCGATAGCATGCTTACAATAGGCTATTGCGACGTGTCCGATCCTACATGTACTCATATATCGCAACGGCCCAGGTTACTTAGGGACTAAACGGCCCTTTAAAGCGAGGGTAGTAGAATTCAGGCCACTGAAATTGGGATTATCTAATAAATCACCCGCCGCCGAGATAAGGCAGCTTACGAGTGAAGTTTGCATCACGTGCCTTTGTATTATTAATCTCGACTACCAACTCCTCAATTATAAGCAAGTTCGTTCATAAATAGTCAGATGCTTGGGGCGCATGCGGTATTGGAGTTGCCATGGTCTACACGCGGCCCTATGCAAACTTTTCTTAAGCGAGGAAGTCTTCCGTATTGCGTTGCATTTTCTCAATTGTTTTAGTCTTGTCGTAACCAATGCGGTAAGAATGCGTTTACAGGGCCGCGACCCCAACAATCCTTCCTTCGAGGAAGGTACTGAGTAGGGCTGGTCTATGCCGGTACTCCTTAGGTTGTTCACTCATGCCGATACCACTCTTGGTTTCTCACCCTTATTCTGGCACTGGTAAAAGATAAAGAGAACCTCCAGTCTAAGCGATTTACTTGGACACGCCCTTTTTGAAGTTCGGGCACACTCCGATTACCTAGAGCCCTAATGGGAGCCCGTAATTTCCAATAAAAGGAAAGTGGCTATCAAGCCATCCGCTCGAAGCACGTCATCCGGTACTTAGGTGTCCCTTGTGTCGCTTAATTAGATGTTGTACGAACGATATATACGGGCTAGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACGTCCGGCATCCAGCTCGTCAATCTTTAAGTGGCGTCATCAGTAAGTGCACGTGGTCTTCCAAACCCTGTGGCGACACTATTAGACGTGGTCGGATCGGTCTTTATCTTTGGTTCTGTAGTAGGAGAACTCATGGGATGGTTATGGAGTTTGACTACAGCGTCTGGTCTCGGTAGTCCCCACACCCTCTAGCCCAACCACTAGAAAGTGACTAGTAATATGCGTTCTGCGAATCTCAGGGGTCAACCGTGACTATTGATCTTACCAGCGCATCGCTCCGCAAGCTCAGTAAATTTGATGTTTTTCCACAACCGAAAGAGCGATTAAGTACTGAACAAGAGTGTTAAGTCCAGTACTAACAATTAGATACATACTCGACTCGTAGAATCTTTCGTATTCCTGGTCGTACCCAGCAACGCCAGCAGAGTTTAGTCGACTGGTAGTTTGGAGTTTTCGAGGACGGAGACTGTGCATAAACTGTAG
ACATTCTGAAGCGCATGTCGGAGTGTATTCACGTGCACGCTCACCATGAGTTCAGGTCTATGGTGTCGGCGTTCACAAGTCCATAATCTGCGTCTCGTGACGTGACTTATCCCTTGTCACTCTAGTTGGACATCACAGTAGCGTTCGCCTCCTTCGTAATTCCTATCCCCAAAGAGAGTATTTCAATTAATACTCTAAACTCGAGCGCCGAGCGACACCATTCCATTCTAAATCTGGGGGCTCACGCGCTTTCAGTCGTCAATTTTATCGCGAAATAAACATGATCGCTATAAGATATCCCTTCCTCATCTGTTGCCCCAACCTGAGGCGTTCTAGACGTACACGATTAACGCGTTACGCGACAGCAGAATCGAGCTACGTGCAGAGCGTATCTGCGCCGGGGATCGCCGTAGAAGACGGCGCTAGCCAACCAGGGTTCGACGAACTGCCCTAGGAAACAAGCAAGCATGCTCGTGAGAGCAGGACCTTCTGCAGAAGATATTGCGATACATGAGTTTCTACGGTATTGGGTTATGGAATCTAAGAGGGCCAATGGAAATAGTAAGTTGGGGCGGATTATTTGAATACCGTTCGGTGGACTGTTTTCCGGAAGAGCGATGCCACCTCCTTGGTGTCTGCATCGAATAGATGCCCTGTCTTTCGTAATCGGTGAACCCATTAGTGACTCATTCGTGGCCGATACATTTACTTATACGTCTTGAGGCGCAGCAAGTCTAGTATGTCGTATAGAAAGCAGGTTTGCTTAGTTCGACTTAAGAGTGACGCTAGGTCACGAATTTCTCGTCCGGGGCATGTCAGAGATTGATTCCTTAGATACTGGCCAAACCGATACTACGCGTTGATAAGTGAACGACATGAAAGTTGACAACACCCTCTAGGGTTCGAACCAAAACAGAAAGTAGGAGCAACGTTCTCGGAGCACCACATCACTTGAACCGCCACTTCGACTTTCACCGAGTGACATAAAGACTAGTGGACCCCATCTGTTACCAATAAGGGGCGTGTAGATCAGCACTGGAATACAACACACGATGCCTCAGTCATCTCACTACGTGTCTCCTCGCTCCGCGTTCGCACTATTGCTCGCGCCCACCGCACACTACCGTGTTGAATGACGAAAGTTGCGGGGCTTACGCTCGGTGCAAAAGTTTCACATCTCTTGATAGCATTGGGCAACGCCGACGTATAGCTATGCAATTCACACGGCCGCAAAGCTATTTGACTGAACGTCTAGCATAGTGGTCAAACTGGCTCGAGCTAGATCTTGACGCACGTTCGGTATCTTGGAATTACCACCGGTACATGAGAGCAAAAGGGGAATCCTTGGTATGGAATTTACAAACCTGATCATCTTATTAGTGGTGTGATGAGATATCTTCATCTTTATAAACGTGGCGCGGCCACTTAGCGCTCAACGGCAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATGTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTTTTTTTCTCCGTCGGCTCTGCGACGCGCGGAGTCAGTCTTCTGCCCCGGGGTCAGCCAGTCAAGACCTGCTGGGAAACGAAGAGAAAGAACAGTGCTAATGCAGGCTCCTCTGACGTCTATGTACTATAGCTTTTCCGCACGCGGAATCTGTCATTGCTACAAGCCGTGCCCAGGGCGAATATTCTGGCCGCACCAAAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAACGGGGCTTTGCCGTCAATAGCACGAGCTGAGACGGG
GAAGCCCGTATTTCATACAGGTCACGTTCAAGAGACAGCGCATTAAAGATCATGCGCGGAAGGGTTAGCAAGGTCGCAAAAGGAAACATTGTCTTTTTCAAAGGGGCCTCTGCCCCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCAGATCCAGATAAACACATGTCTTGCGTTTTCTCGCCTGTTACTGGCTCGGCTCGCCTATGTCTTTTATATAAAACTATAAGGGTGTCGTCTGCTCCGTTTGGTCGGCTGAACGAACCTATGTTCTGTCTTTACCACACTTTCTCCGTTGCAAGGCTCAACGCCCCCAATTTTAAAATGTTCGAGATTCTGTCTGTCTGTCTGTCTGTCTGAAGTTAATAACCGAAACGATCTAATTTACAGTAGACATGGGCGAATGGCGTAAGCAGACAGTGACCGATAACGCTTCCTCCGCGCAGTTGGCCCCGGCTGACTCAAGAGCGGTACGGTCGGCCTTCTTGGCCATAGCCTACACACATCTCAAATGTCGCAAAGCGAACTACCTGCACCGGAGTCCGGGAATCTACTGCACGTTGGGCCTCTTGTACTGATTGCTGCCCGCTACGAACTAGAGCATAATAAAACTTCCGCCGAACCATTGGACTGAGATCAGAGGTAGCTTGATATGGGATCAGAGTTAGTCGATCCAAGTGAAGCTTGTAAGCCAATAGATTGAACATAGGATCATATTACCCCCACCGGTCCCACATTGCTGGGACGGACCCTGTCAGAGAGAGATCTAAACATCATTTGGACTTTGCCTATCTGCGGGACACCACAGGAGCTCCCAGCGCACGAAAAGAGCGGATTCACTAATCCTCTGGCAGTCCTCTTGAGGCATGGTGGGGATGGGTAATTAAGTCGAGGCCGTAGCACGGGATCGGCAACACAAGAATGGCACAAAACATATTCCCTTAGCTCCAAGATGTCGTAGTAGTAGTAGGATGAGAGGCGTGTCATGGGTTCCGTAGTATAGTGCATGCCAACTGTCTTGAATGCTAATATAGATCTTGGAGGAAGCTGGATCGTTGGTCCCTCCGACAGACGATTAAATAGTGGCGTAGATCGCACAAAGTTGCATTAGCAAGCGGGGCGTATCTAGAGTTTACTGTGCTGTAGGATCGGCGCGCTAGCAACAAAAGATTATCCACACAACAGACGCCCTCTCATGTCATAGACCGTGGGGGGGTCCGCACTTTTGGATTGCTGCTATACTATAAGACTCGTCTATCAGAAAACTTCTCGATGGTCCGGAGCCTCAGGACACCCCCCTCACCCATCGCCGGGCAAATGGCTCACCTCGATAATTTTAATCAACGGTGAACATCCAATTCCACACGGAGAGAACCGTACGTCTCAAGATCACTGCTTGTTTAAAATTGCAGAGTAGACCGATTCCGTACTTTAGAATCCAGCGATCCCTCAGCATGTCCATTTGCTCGACTTAATCATGCTGGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCGCCTCGCTATCGACGTCGCATACATGCTTAGTTCTTGCCCAAGTTGACTGACGATCTTAGTCATAGTCTCACACCTGATA
```

In [12]:
# A person is identified only when the list holds exactly 1 name
if len(names) == 1:

    # Only 1 valid person was found -> print them out (in Markdown)
    display(Markdown(f"#### Person's name: {names[0]}"))
else:

    # Any other size of the list -> no match
    print("No match")

#### Person's name: Mike