In [None]:
from IPython.display import Image
from IPython.display import clear_output
from IPython.display import FileLink, FileLinks

<img src="img/python-logo-master-flat.png" alt="Python Logo" style="width: 120px; float: right; margin: 0 0 10px 10px;" />

## Introduction to Python with Application in Bioinformatics



### Nanjiang Shu

#### 2024-07-18 (Day 4)

## Review of Day 3
- Functions and methods
  - `functionName()`
  - `object.functionName()`
- How to write your own functions
- How to write a script that can take arguments from the command line using `sys.argv`
  - outcome: a program that can be reused by others
  

## Review of quiz from yesterday

#### In what ways does the type of an object matter?
-  Questions 1, 2 and 3

In [None]:
row = 'sofa|2000|buy|Uppsala'
fields = row.split('|')  # split() returns a list of strings
price = fields[1]  # so price is the string '2000', not the integer 2000
if price == 2000:  # a string is never equal to an integer, so this branch is skipped
    print('The price is a number!')
if price == '2000':  # string compared with string: this branch runs
    print('The price is a string!')

In [36]:
print(sorted([ 2000,   30,   100 ]))

In [None]:
print(sorted(['2000', '30', '100']))

#### In what ways does the type of an object matter?

- Each type stores a specific type of information
    - `int` for integers,
    - `float` for floating point values (decimals),
    - `str` for strings,
    - `list` for lists,
    - `dict` for dictionaries.

- Each type supports different operations, functions and methods.

- Each type supports different **operations**

In [None]:
30 > 2000

In [None]:
'30' > '2000'

In [None]:
30 > int('2000')

In [None]:
'12345'[2]

In [None]:
12345[2]

- Each type supports different **functions**

In [None]:
max('2000')

In [None]:
max(2000)

In [None]:
import math  # math must be imported before its functions can be used

math.cos(3.14)

In [None]:
math.cos('3.14')

- Each type supports different **methods**

In [None]:
'ACTG'.lower()

In [None]:
[1, 2, 3].lower()

In [None]:
set([]).add('tiger')

In [None]:
[].add('tiger')

- How to find what methods are available: Python documentation, or `dir()`

In [None]:
dir('ACTG') # list all attributes

In [None]:
dir(str) # list all attributes

#### Convert string to number
- Questions 4, 5 and 6


In [None]:
float('2000')

In [None]:
float('0.5')

In [None]:
float('1e9')

In [None]:
float('1e-2')

In [None]:
int('2000')

In [None]:
int('1.5')

In [None]:
int('1e9')

#### Convert to boolean: `1`, `0`, `'1'`, `'0'`, `''`, `{}`
- Question 7


In [None]:
bool(1)

In [None]:
bool(0)

In [None]:
bool('1')

In [None]:
bool('0')

In [None]:
bool('')

In [None]:
bool({})

- Python and the truth: true and false values

In [None]:
values = [1, 0, '', '0', '1', [], [0]]
for x in values:
    # "... if x else ..." applies the same truth test as a plain "if x:" statement
    print(repr(x), 'is true!' if x else 'is false!')

- `if x` is equivalent to `if bool(x)`

- Is `1` equivalent to `True`?

In [None]:
1 == True

In [None]:
x = 1
if x is True:
    print(repr(x), 'is true!')
else:
    print(repr(x), 'is false!')

In [None]:
x = 1
if bool(x) is True:
    print(repr(x), 'is true!')
else:
    print(repr(x), 'is false!')

- Be careful: `if x is True` is **not** equivalent to `if bool(x) is True`

#### Container types, when should you use which? (Question 8)

- **lists**: when order is important
- **dictionaries**: to keep track of the relation between keys and values
- **sets**: to check for membership. No order, no duplicates.

In [43]:
genre_list = ["comedy", "drama", "drama", "sci-fi"]
genre_list

In [44]:
genres = set(genre_list)
genres

In [45]:
# Only the last expression of a cell is displayed, so both membership tests
# are put in one tuple to show both results: (in the list, in the set)
('drama' in genre_list, 'drama' in genres)
# which operation is faster?

In [46]:
genre_counts = {"comedy": 1, "drama": 2, "sci-fi": 1}
genre_counts

In [47]:
movie = {"rating": 10.0, "title": "Toy Story"}
movie

#### Python syntax (Question 9)


In [None]:
def echo(message):  # "def" begins a new function definition
    """Print message and give the same value back to the caller."""
    print(message)  # show the current value of the argument
    return message  # "return" ends the function and hands the value back


#### Converting between strings and lists
- Question 10

In [None]:
list("hello")

In [None]:
str(['h', 'e', 'l', 'l', 'o'])

In [None]:
'_'.join(['h', 'e', 'l', 'l', 'o'])

#### What is a function?
- A named piece of code that performs a specific task
- A relation (mapping) between inputs (arguments) and output (return value)

In [None]:
def increment_by_two(number):
    """Add two to number and return the result."""
    number += 2
    return number

# The return value of the call is passed straight to print()
print(increment_by_two(100))

### Day 4

- More on functions:
    - Scope of variables
    - Positional arguments and keyword arguments
    - `return` statement
- Reusing code:
    - Comments and documentation
    - Importing modules: using libraries
    - Brief introduction to Biopython


### More on functions: scope - global vs local variables


### Variables defined inside the function are local variables

In [10]:
def show_host():
    """Print the value of a variable that is defined inside the function."""
    host = "local"  # assigned inside the function, so host is a local variable
    print(f'host inside the function = {host}')
show_host()

host inside the function = local


In [11]:
def show_host():
    """Print the value of a variable that is defined inside the function."""
    host = "local"  # local variable: it is not visible outside show_host()
    print(f'host inside the function: {host}')
# host is local to show_host(); unless a global host was defined earlier,
# the next line raises NameError
print(f'host outside the function: {host}')

NameError: name 'host' is not defined

### Global variables can be accessed both inside and outside of the function

In [13]:
host = 'global'

def show_host():
    """Print the value of the global variable host."""
    # host is not assigned inside the function, so the name refers to the global variable
    print(f'host inside the function = {host}')

show_host()
print(f'host outside the function = {host}')


host inside the function = global
host outside the function = global


- Assigning to a variable inside a function does not change the global variable of the same name

In [14]:
host = 'global'

def change_host():
    """Assign to host inside the function and print the local value."""
    # The assignment creates a new local variable named host;
    # the global host is left untouched.
    host = 'local'
    print(f'host inside the function = {host}')

print(f'host outside the function before change = {host}')
change_host()
print(f'host outside the function after change  = {host}')

host outside the function before change = global
host inside the function = local
host outside the function after change  = global


- Pass global variable as argument

In [None]:
host = 'global'

def change_host(host):
    """Reassign the parameter host inside the function and print it."""
    # The parameter host is a local name; rebinding it does not
    # change the variable that the caller passed in.
    host = 'local'
    print(f'host inside the function = {host}')

print(f'host outside the function before change = {host}')
change_host(host)
print(f'host outside the function after change  = {host}')


### More on functions: scope - global vs local variables cont.
Lists as global variables


In [15]:
gene_ids = ['COX1', 'COX2']

def change_genes():
    """Assign a new list to gene_ids inside the function and print it."""
    # The assignment binds a new local name gene_ids to a new list;
    # the global list is not modified.
    gene_ids = ['COX3', 'COX4']
    print(f'gene_ids inside the function = {gene_ids}')

print(f'gene_ids outside the function before change = {gene_ids}')
change_genes()
print(f'gene_ids outside the function after change  = {gene_ids}')


gene_ids outside the function before change = ['COX1', 'COX2']
gene_ids inside the function = ['COX3', 'COX4']
gene_ids outside the function after change  = ['COX1', 'COX2']


Will a global variable never be changed by a function?

In [17]:
gene_ids = ['COX1', 'COX2']

def change_genes():
    """Extend the global list gene_ids in place and print it."""
    # There is no assignment to gene_ids here, so the name refers to the global
    # list; extend() modifies that list object in place, which is why the
    # change is also visible outside the function.
    gene_ids.extend(["COX3", "COX4"])
    print(f'gene_ids inside the function = {gene_ids}')

print(f'gene_ids outside the function before change = {gene_ids}')
change_genes()
print(f'gene_ids outside the function after change  = {gene_ids}')


gene_ids outside the function before change = ['COX1', 'COX2']
gene_ids inside the function = ['COX1', 'COX2', 'COX3', 'COX4']
gene_ids outside the function after change  = ['COX1', 'COX2', 'COX3', 'COX4']


Take away: be careful when using global variables. Do not use them unless you know what you are doing.

### More on functions: `return` statement
A function that counts the number of occurrences of `'C'` (or `'c'`) in the argument string.

In [None]:
def cytosine_count(nucleotides):
    """Return how many items of nucleotides are cytosine ('c' or 'C')."""
    # Add 1 for every item that is a lower- or upper-case C
    return sum(1 for base in nucleotides if base in ('c', 'C'))


count1 = cytosine_count('CATATTAC')
count2 = cytosine_count('tagtag')
print(count1, count2)

Functions that `return` are easier to repurpose than those that `print` their result

In [None]:
cytosine_count('catattac') + cytosine_count('tactactac')

In [None]:
def print_cytosine_count(nucleotides):
    """Print the number of 'c'/'C' items in nucleotides; nothing is returned."""
    cytosines = [base for base in nucleotides if base in ('c', 'C')]
    print(len(cytosines))


print_cytosine_count('CATATTAC')
print_cytosine_count('tagtag')

In [None]:
print_cytosine_count('catattac') + print_cytosine_count('tactactac')

- Functions without any `return` statement return `None`


In [None]:
def foo():
    """Assign a local variable and finish without a return statement."""
    do_nothing = 1

# foo() has no return statement, so the call evaluates to None
r = foo()
print(f'Return value of foo() = {r}')

- Use `return` for all values that you might want to use later in your program

### Keyword arguments
- A way to pass an argument to a function explicitly by name, for clarity

In [None]:
sorted('file', reverse=True)

In [None]:
attribute = 'gene_id "unknown gene"'
attribute.split(sep=' ', maxsplit=1)

In [None]:
# print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)
print('x=', end='')
print('1')

### Keyword arguments cont.
- Can be used in both ways, with or without keyword, if there is no ambiguity
- Arguments after `*` in a function signature must be passed as keyword arguments, e.g. `key` and `reverse` in `sorted(iterable, /, *, key=None, reverse=False)`

In [None]:
open('files/gene_ids.txt', 'w', encoding='utf-8')

In [None]:
open('files/gene_ids.txt',  mode='w', encoding='utf-8')

- The order of keyword arguments does not matter

In [None]:
open('files/gene_ids.txt', mode='w', encoding='utf-8')

In [None]:
open('files/gene_ids.txt', encoding='utf-8', mode='w')

- Positional arguments must be in front of keyword arguments

In [None]:
open('files/gene_ids.txt', encoding='utf-8', 'w')

### How to define functions taking keyword arguments

- Just define them as usual:

In [None]:
def format_sentence(subject, value, end):
    """Return the sentence 'The <subject> is <value>' followed by end."""
    pieces = ['The ', subject, ' is ', value, end]
    return ''.join(pieces)


# all three arguments given by position
print(format_sentence('lecture', 'ongoing', '.'))

# the last argument given by keyword
print(format_sentence('lecture', 'ongoing', end='!'))

# every argument given by keyword
print(format_sentence(subject='lecture', value='ongoing', end='...'))

### Defining functions with default arguments

In [None]:
def format_sentence(subject, value, end='.'):
    """Return 'The <subject> is <value>' followed by end ('.' when end is omitted)."""
    beginning = 'The ' + subject
    return beginning + ' is ' + value + end


# end is left out, so its default value '.' is used
print(format_sentence('lecture', 'ongoing'))

# an explicit third argument replaces the default
print(format_sentence('lecture', 'ongoing', '...'))

### Defining functions with optional arguments

- Convention: use the object `None`

In [None]:
def format_sentence(subject, value, end='.', second_value=None):
    """Return 'The <subject> is <value>' followed by end.

    When second_value is given (not None), ' and <second_value>' is
    inserted before end.
    """
    parts = ['The ', subject, ' is ', value]
    if second_value is not None:
        parts += [' and ', second_value]
    parts.append(end)
    return ''.join(parts)


print(format_sentence('lecture', 'ongoing'))

print(format_sentence('lecture', 'ongoing',
                      second_value='self-referential', end='!'))

### Small detour: Python's value for missing values: `None`

- Default value for optional arguments
- Implicit return value of functions without a `return`
- `None` is `None`, not anything else

In [None]:
bool(None)

In [None]:
None == False

In [None]:
None == 0

In [None]:
None == ''

In [None]:
type(None)

### Day 4, Exercise 1

- Extra reading:
    - https://realpython.com/python-kwargs-and-args/
    - https://able.bio/rhett/python-functions-and-best-practices--78aclaa
### Break

## Session 2: Modules and Documentation

### A short note on code structure

- functions
- modules (files)
- documentation

#### Why functions?
- Cleaner code
- Better defined tasks in code
- Re-usability
- Better structure

#### Why modules?

- Cleaner code
- Better defined tasks in code
- Re-usability
- Better structure

- Collect all related functions in one file
- Import a module to use its functions
- Only need to understand what the functions do, not how

#### Example of modules

In [5]:
import sys

sys.argv

['/Users/nanjiang/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py',
 '-f',
 '/Users/nanjiang/Library/Jupyter/runtime/kernel-2be64ec6-d847-441c-ae23-4d9f4aa2b710.json']

In [314]:
from datetime import datetime
print(datetime.now())

2022-10-13 13:13:08.980977


### Python standard modules

Check out the [module index](https://docs.python.org/3/py-modindex.html)

How to find the right module?

How to understand it?

### How to find the right module?

- Look at the Python Module Index (https://docs.python.org/3/py-modindex.html)
- Python tutorial at W3Schools (https://www.w3schools.com/python/default.asp)
- Search in the Python Package Index (https://pypi.org)
- Ask your colleagues
- Search the web
- Ask AI

- Standard modules: no installation needed
- Other libraries: install with `pip install` or `conda install`

### How to understand it?
- E.g. I want to know how to split a string by the separator `,`

In [321]:
text = 'Programming,is,cool'
help(dummy)

NameError: name 'dummy' is not defined

In [317]:
text.split(sep=',')

['Programming', 'is', 'cool']

- For slightly more complicated problems, e.g. how to download Python logo from internet with `urllib`
- URL: https://www.python.org/static/img/python-logo@2x.png

In [None]:
import urllib



- Sometimes easier to find the answer by searching the web

In [None]:
# "import urllib" alone does not guarantee that the request submodule is loaded,
# so import the submodule explicitly
import urllib.request

url = 'https://www.python.org/static/img/python-logo@2x.png'
# Download the image and save it as files/python-logo.png
urllib.request.urlretrieve(url, 'files/python-logo.png')

In [None]:
help(math.sqrt)

In [None]:
math.sqrt(3)

### Various ways of importing

In [333]:
import math 

math.sqrt(3)

1.7320508075688772

In [21]:
import math as m
m.sqrt(3)

1.7320508075688772

In [335]:
from math import sqrt

# sqrt was imported by name, so it is called without the "math." prefix
sqrt(3)

1.7320508075688772

In [None]:
from pprint import pprint

### Documentation and commenting your code


Remember `help()`?

In [22]:
help(print)

Help on built-in function print in module builtins:

print(*args, sep=' ', end='\n', file=None, flush=False)
    Prints the values to a stream, or to sys.stdout by default.
    
    sep
      string inserted between values, default a space.
    end
      string appended after the last value, default a newline.
    file
      a file-like object (stream); defaults to the current sys.stdout.
    flush
      whether to forcibly flush the stream.



- This works because somebody else has documented their code!

In [41]:
def process_file(filename, chrom, pos):
    """
    Read VCF file and print all samples that match the specified chromosome and position

    Args:
        filename: path of the tab-delimited VCF file to read.
        chrom: chromosome name, compared as a string with the first column.
        pos: position as an integer, compared with the second column.

    For every matching line, the columns from the 10th onwards are printed as a list.
    """
    # "with" closes the file when the block ends, even if an error occurs
    with open(filename) as vcf_file:
        for line in vcf_file:
            # Lines starting with '#' are ignored
            if not line.startswith('#'):
                # VCF is tab delimited; the newline is removed first so that
                # it does not end up in the last sample
                col = line.rstrip('\n').split('\t')
                if col[0] == chrom and int(col[1]) == pos:
                    print(col[9:]) # samples start at the 10th column

In [40]:
help(process_file)

Help on function process_file in module __main__:

process_file(filename, chrom, pos)
    Read VCF file and print all samples that match the specified chromosome and position



### Your code may have two types of users:

- library users
- maintainers (maybe yourself!)

#### Write documentation for both of them!

- __library users (docstrings)__:
  ```python
  """
  What does this function do?
  """
  ```
- __maintainers (comments)__:
  ```python
  # implementation details
  ```
Add a comment when the code needs additional explanation:
```python
my_list[5] += other_list[3]  # explain why you do this!
```

In [None]:
def process_file(filename, chrom, pos):
    """Read a vcf file, search for lines matching 
    chromosome chrom and position pos.

    Print the genotypes of the matching lines.

    Args:
        filename: path of the tab-separated vcf file to read.
        chrom: chromosome name, compared as a string with the first column.
        pos: position, given as an integer or as a string of digits.
    """
    # The position column is read from the file as a string, so both sides
    # are converted to int before they are compared
    pos = int(pos)
    # "with" closes the file when the block ends, even if an error occurs
    with open(filename) as vcf_file:
        for line in vcf_file:
            if not line.startswith('#'):  # skip comments
                # file is tab separated; the newline is removed first so that
                # it does not end up in the last genotype
                col = line.rstrip('\n').split('\t')
                # Check if chrom and pos match
                if col[0] == chrom and int(col[1]) == pos:
                    # genotype starts at column index 9
                    print(col[9:])

### Places for documentation:

#### 1. At the beginning of the file
```python
     """
     This module provides functions for ...
     """
```

#### 2. At every function definition under `def` 
```python
    def function_name(parameters):
        """
        This function performs ...
        """
        # code block
        return value
```

Note: Live coding, show an example of writing documentation

Create an empty file and name it dna.py.
First, write a docstring at the beginning of the file.

### Demo, create a Python module with documentation

In [64]:
# Import the module we have just created and show the help message
import importlib
import dna  # the module must be imported before it can be reloaded
importlib.reload(dna)  # pick up changes made to dna.py after the first import
from dna import get_seqlen
help(dna.get_seqlen)
print(dna.get_seqlen("AAAAAACCCCTGTGG"))
print(get_seqlen("AAAAAACCCCTGTGG"))

Help on function get_seqlen in module dna:

get_seqlen(seq)
    Calculate the length of the given DNA sequence and return the length

15
15


### Read more about documentation and comment:

https://realpython.com/documenting-python-code/

https://www.python.org/dev/peps/pep-0008/?#comments

### Introduction to Biopython
- An open-source collection of tools for biological computation.
- Provides functionalities for reading, writing, and analyzing biological data.

#### Installing biopython
```bash
pip install biopython
# or
conda install -c conda-forge biopython
```


Note: yesterday we showed a Python script that reads a DNA sequence from a file and calculates the length of the sequence. Here the code has been modified slightly so that it is wrapped in a function, `get_seqlen()`.

In [68]:
def get_seqlen(seqfile):
    """Read a sequence file in fasta format and return the length of the sequence.

    Args:
        seqfile: path of the fasta file to read.

    Returns:
        The total number of characters, as an integer, on all lines that do
        not start with '>', after whitespace at both ends of each line has
        been removed.
    """
    seqlength = 0
    with open(seqfile, "r") as file:
        for line in file:
            line = line.strip()
            # lines starting with '>' are header lines, not sequence
            if not line.startswith(">"):
                seqlength += len(line)
    return seqlength
# Measure the example file and report the result; an f-string converts
# the number to text by itself
seqlength = get_seqlen("../files/one_dna_sequence.fa")
print(f"Length of DNA sequence: {seqlength}")

Length of DNA sequence: 386


### How to do that with Biopython

`SeqIO.parse` supports many formats, such as
- fastq: Extension of FASTA with quality scores.
- genbank: Rich format that combines sequence data and annotation.
- Stockholm: Used for multiple sequence alignments.
- PIR: protein sequence database format

In [87]:
from Bio import SeqIO
for record in SeqIO.parse("../files/one_dna_sequence.fa", "fasta"): 
    seqlength = len(record.seq)
print(f"Length of DNA sequence: {str(seqlength)}")

Length of DNA sequence: 170


In [90]:
from Bio import SeqIO
for record in SeqIO.parse("../downloads/one_dna_sequence.fa", "fasta"): 
    print(f"id: {record.id}") # characters after '>' until the first white space 
    print(f"description: {record.description}") # characters after '>'
    print(f"seq: {record.seq}")

id: random_sequence_1
description: random_sequence_1 | Random generated DNA sequence of length 125
seq: ATGGGACTGCTAGGACTAGCTAGCTCTAGGACTGATCGTAGCTAGCTAGGCTAGCCTAGCAGTCTAGCTAGGACGATCGTAGCCCATAGCGGCTAGCTAGTACTGATCGTGCTAGCTAGCTAGCTGACTAGCTGCGTAGCTAGCTGACTGCTAGGACTAGTTGCGCTAGC


### It works with sequence files with multiple sequences as well

In [93]:
!cat ../downloads/dna_sequences.fa

>seq_1 Random generated DNA sequence 1
CGTACGTAGCTAGCTAGCTAGCTAGCTGATCGTAGCTAGCTAGCTGATCGTA
>seq_2 Random generated DNA sequence 2
GACCTAGCTGACTGATCGTAGCTAGCTAGCTGACTAGCTAGCTGACTGA
>seq_3 Random generated DNA sequence 3
TAGAGCTAGCTAGCTGATCGTAGCTAGCTAGCTAGCTGATCGTAGCTAGCTAG
TAGGATCGTAGCTAGCTAGCTAGC
>seq_4 Random generated DNA sequence 4
GCTCTGATCGTAGCCTGACTAGCTAGCTGATCGTAGCTAGCTAGCT


In [94]:
from Bio import SeqIO
for record in SeqIO.parse("../downloads/dna_sequences.fa", "fasta"): 
    print(f"Length of {record.id}: {len(record.seq)}")

Length of seq_1: 52
Length of seq_2: 49
Length of seq_3: 77
Length of seq_4: 46


### Fetch biological data from the database seamlessly

In [97]:
from Bio import Entrez

# Always tell NCBI who you are (use your own e-mail address here)
Entrez.email = "nanjiang.shu@scilifelab.se"

# Fetch a nucleotide sequence
handle = Entrez.efetch(db="nucleotide", id="NM_000546", rettype="fasta", retmode="text")
print(handle.read())
handle.close()  # close the handle once the data has been read


>NM_000546.6 Homo sapiens tumor protein p53 (TP53), transcript variant 1, mRNA
CTCAAAAGTCTAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGC
TGGGAGCGTGCTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTG
CCATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATG
GAAACTACTTCCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCC
CCGGACGATATTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTG
CTCCCCCCGTGGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCT
GTCATCTTCTGTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCT
GGGACAGCCAAGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGA
CCTGCCCTGTGCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTA
CAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGAT
GGTCTGGCCCCTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAA
ACACTTTTCGACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCA
CTACAACTACATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCAC

### Further resources
- Biopython Tutorial and Cookbook: http://biopython.org/DIST/docs/tutorial/Tutorial.html
- Biopython Documentation: https://biopython.org/wiki/Documentation

## Day 4, Exercise 2

## Break

## Quiz

## Project time after lunch