In [None]:
from IPython.display import Image
from IPython.display import clear_output
from IPython.display import FileLink, FileLinks

<img src="img/python-logo-master-flat.png" alt="Python Logo" style="width: 120px; float: right; margin: 0 0 10px 10px;" />

## Introduction to Python with Application in Bioinformatics



### Nanjiang Shu

#### 2024-07-18 (Day 4)

## Review of Day 3
- Functions and methods
    - A function is a block of code that performs a specific task
    - A method is a function that is associated with an object
    - `function_name()` vs `object.method_name()`
- How to write your own functions
```python
        def function_name(parameters):
            # block of code
            return result
```
- How to write a script that can take arguments from the command line using `sys.argv`
  - outcome: a program that can be reused by others
  

## Review of the quiz from Day 3
- <a href="https://forms.office.com/Pages/DesignPageV2.aspx?origin=NeoPortalPage&subpage=design&id=DQSIkWdsW0yxEjajBLZtrQAAAAAAAAAAAAa__Yehr4dUQkIwRUdDVjBGOElETlNPMERKRTUzNlpMTS4u&analysis=true">Link to the quiz statistics</a>

### Question 5
For a list `my_list`, what does `my_list.sort()` do?
```
  A: Prints a sorted copy of the list.
  B: Returns a new list that is sorted.
✔ C: Sorts the list in place and does not return a value.
  D: Sorts the list and returns the sorted list.
```
Many answered `D`

In [206]:
# list.sort() reorders the list in place; the call itself evaluates to None
my_list = [1, 3, 5, 2]
return_value = my_list.sort()
print(f"my_list: {my_list}")
print(f"return_value: {return_value}")

my_list: [1, 2, 3, 5]
return_value: None


### Question 8
Which is a common use case for defining a function in Python?
```
✔  A: To break code into reusable blocks.
   B: To store data like variables do.
   C: To increase the speed of the program.
   D: To create a new module.
```
Many answered `D`
#### In Python, a module is a file that contains a collection of related functions, classes and other attributes

### Day 4

- __Session 1__: More on functions
    - Scope of variables - global vs local
    - Positional arguments and keyword arguments
    - `return` statement
- __Session 2__
    - Comments and documentation
    - Importing modules for using libraries
    - Brief introduction to SeqIO module of Biopython
#### Project time after lunch

## Scope of variables - global vs local 


#### Variables defined inside the function are local variables

In [208]:
def show_host():
    """Print a variable that exists only inside this function."""
    host = "local"  # local name: created on each call, gone when the call returns
    print(f'host inside the function = {host}')

show_host()
# Remove any global `host` left over from an earlier run so that the next cell
# really shows the NameError. A bare `del host` would itself raise NameError
# on a fresh kernel, where no global `host` exists yet.
globals().pop('host', None)

host inside the function = local


In [209]:
def show_host():
    """Print a variable that is created inside the function."""
    host = "local"
    print(f'host inside the function: {host}')
    
# Intentional error: `host` only exists inside show_host(), so looking it up
# at the top level raises NameError
print(f'host outside the function: {host}')

NameError: name 'host' is not defined

#### Global variables can be accessed both inside and outside of the function

In [210]:
def show_host():
    """Print the global variable `host`."""
    # nothing is assigned to `host` in this function, so the name is looked up globally
    print(f'host inside the function = {host}')

# the global is defined after the function but before the call, which is all that matters
host = "global"
show_host()
print(f'host outside the function = {host}')


host inside the function = global
host outside the function = global


#### Changing the value from inside the function will not change the value of a global variable

In [213]:
def change_host():
    """Assign to `host` inside the function and print its id and value."""
    host = 'local'  # assignment creates a new local name; the global `host` is not touched
    print(f"address of host inside = {id(host)}")
    print(f'host inside the function = {host}')
    
host = 'global'
# id() identifies the object a name refers to; compare the values printed before,
# inside and after the call
print(f"address of host outside ={id(host)}")
print(f'host outside the function before change = {host}')
change_host()
print(f"address of host outside = {id(host)}")
print(f'host outside the function after change  = {host}')

address of host outside =4307542256
host outside the function before change = global
address of host inside = 4307054320
host inside the function = local
address of host outside = 4307542256
host outside the function after change  = global


#### Actually, a new local variable with the same name was created. 

#### Use the `global` keyword to modify the value of a global variable from inside the function

In [214]:
def change_host():
    """Rebind the global variable `host` and print the new value."""
    global host  # from here on, `host` in this function means the global name
    host = 'modified'  # This modifies the global variable
    print(f'host inside the function: {host}')
    
host = 'global'
print(f'host outside the function before change: {host}')
change_host()
# the change made inside the function is visible here
print(f'host outside the function: {host}')

host outside the function before change: global
host inside the function: modified
host outside the function: modified


####  Using the `global` keyword is not recommended, unless you know what you are doing!

### Using mutable objects, e.g. `lists` as global variables

In [215]:
def change_genes():
    """Assign a new list to the name `gene_ids` inside the function and print it."""
    gene_ids = ['COX3', 'COX4'] # assignment: a new local variable gene_ids is created, the global list is untouched
    print(f'gene_ids inside the function = {gene_ids}')

gene_ids = ['COX1', 'COX2']
print(f'gene_ids outside the function before change = {gene_ids}')
change_genes()
print(f'gene_ids outside the function after change  = {gene_ids}')

gene_ids outside the function before change = ['COX1', 'COX2']
gene_ids inside the function = ['COX3', 'COX4']
gene_ids outside the function after change  = ['COX1', 'COX2']


In [216]:
def change_genes():
    """Append two gene ids to the global list `gene_ids` and print it."""
    gene_ids.extend(['COX3', 'COX4']) # no assignment here: extend() modifies the global list object in place
    print(f'gene_ids inside the function = {gene_ids}')

gene_ids = ['COX1', 'COX2']
print(f'gene_ids outside the function before change = {gene_ids}')
change_genes()
# the list now contains the two extra ids, although `global` was never used
print(f'gene_ids outside the function after change  = {gene_ids}')

gene_ids outside the function before change = ['COX1', 'COX2']
gene_ids inside the function = ['COX1', 'COX2', 'COX3', 'COX4']
gene_ids outside the function after change  = ['COX1', 'COX2', 'COX3', 'COX4']


### Summary: 
- Variables defined inside the function are local variables and are alive only within the function.
- Global variables can be accessed both from inside and outside of functions - __that is convenient__.
- One can use the `global` keyword to declare a global variable within the function, but it is not recommended for beginners.
- Be careful to use mutable objects as global variables due to their complexity and potential unintended side effects.

### `return` statement

```python
def function_name(parameters):
    # block of code
    return result
```

#### Functions that `return` are easier to repurpose than those that `print` their result

In [217]:
def get_seqlen(sequence):
    """Print the number of characters in `sequence`; nothing is returned."""
    print(len(sequence))

seq1 = "ATCGACC"
seq2 = "CAAAG"

# the lengths appear on screen but cannot be used by the rest of the program
get_seqlen(seq1)
get_seqlen(seq2)

7
5


In [218]:
# Intentional error: each call prints a length but returns None, so this is None + None
get_seqlen(seq1) + get_seqlen(seq2) 

7
5


TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

In [219]:
def get_seqlen(sequence):
    """Return the number of characters in `sequence`."""
    return len(sequence)

# the caller decides what to do with the returned value, here: print it
print(get_seqlen(seq1))
print(get_seqlen(seq2))

7
5


In [220]:
# both calls return integers, so the results can be added
get_seqlen(seq1) + get_seqlen(seq2)



12

#### Functions without any `return` statement or without returning any value return `None`


In [221]:
def get_seqlen(sequence):
    """Compute the length of `sequence` but do not return it."""
    length = len(sequence)
    
# the function body has no return statement, so the call evaluates to None
r = get_seqlen("A")
print(f'Return value of get_seqlen() = {r}')

Return value of get_seqlen() = None


In [222]:
def get_seqlen(sequence):
    """Compute the length of `sequence`, then return without a value."""
    length = len(sequence)
    return  # bare return: no value is given, so the caller receives None
    
r = get_seqlen("A")
print(f'Return value of get_seqlen() = {r}')

Return value of get_seqlen() = None


### Summary
Use `return` for all values that you might want to use later in your program

### Keyword arguments and positional arguments

In [223]:
def sum3(a, b, c):
    """Print each argument as name=value, then return a + b + c."""
    for name, value in (("a", a), ("b", b), ("c", c)):
        print(f"{name}={value}")
    total = a + b + c
    return total

In [227]:
# a is given by position; b and c are given by keyword, so their order does not matter
sum3(1, c=4, b=5)

a=1
b=5
c=4


10

### We have used keyword arguments and positional arguments already

In [229]:
# the list is a positional argument, reverse is a keyword argument
sorted([1, 3, 2, 5], reverse=True)

[5, 3, 2, 1]

In [232]:
attribute = 'gene_id "unknown gene"'
# the separator is positional, maxsplit is given by keyword: split at the first space only
attribute.split(' ', maxsplit=1)

['gene_id', '"unknown gene"']

In [234]:
# print(value, ..., sep=' ', end='\n', file=sys.stdout, flush=False)
# end='' replaces the default newline, so the next print continues on the same line
print('x=', end='')
print('1')

x=1


## Properties of keyword arguments and positional arguments

### Arguments can be used in both ways, with or without keyword, if there is no ambiguity
- When used with keyword, they are keyword arguments
- When used without keyword, they are positional arguments

In [None]:
def sum3(a, b, c):
    """Show the three arguments, one per line, and return their sum a + b + c."""
    print(f"a={a}", f"b={b}", f"c={c}", sep="\n")
    return a + b + c

In [None]:
# all three arguments are matched to a, b, c by position
sum3(1, 5, 8)

In [None]:
# a by position, b and c by keyword
sum3(1, b=5, c=8)

### The order of keyword arguments can be switched

In [235]:
# keywords given in the same order as in the definition
sum3(1, b=5, c=8)

a=1
b=5
c=8


14

In [236]:
# keywords swapped: the values still reach the parameters named b and c
sum3(1, c=8, b=5)

a=1
b=5
c=8


14

### Positional arguments must be in front of keyword arguments

In [237]:
sum(1, b=5, 8)

SyntaxError: positional argument follows keyword argument (3010964451.py, line 1)

### How to define functions taking keyword arguments

- Just define them as usual:

In [238]:
def format_sentence(subject, value, end):
    """Build the sentence "The <subject> is <value>" followed directly by `end`."""
    sentence = f"The {subject} is {value}{end}"
    return sentence

# every argument by position
print(format_sentence('lecture', 'ongoing', '.'))

# the last argument by keyword
print(format_sentence('lecture', 'ongoing', end='!'))

# every argument by keyword
print(format_sentence(subject='lecture', value='ongoing', end='...'))

The lecture is ongoing.
The lecture is ongoing!
The lecture is ongoing...


### Defining functions with default arguments

In [239]:
def format_sentence(subject, value, end='.'):
    """Build "The <subject> is <value>" followed by `end`, which defaults to a full stop."""
    sentence = f"The {subject} is {value}{end}"
    return sentence

# `end` left out: the default '.' is used
print(format_sentence('lecture', 'ongoing'))

# `end` supplied: the default is overridden
print(format_sentence('lecture', 'ongoing', '...'))

The lecture is ongoing.
The lecture is ongoing...


### Defining functions with optional arguments

- Convention: use the object `None`

In [240]:
def format_sentence(subject, value, end='.', second_value=None):
    """Build "The <subject> is <value>[ and <second_value>]" followed by `end`.

    `second_value` is optional: when it is left as None, the "and ..." part is omitted.
    """
    # handle the case where the optional argument was supplied first
    if second_value is not None:
        return f"The {subject} is {value} and {second_value}{end}"
    return f"The {subject} is {value}{end}"

print(format_sentence('lecture', 'ongoing'))

print(format_sentence('lecture', 'ongoing', second_value='self-referential', end='!'))

The lecture is ongoing.
The lecture is ongoing and self-referential!


#### Small detour: Python's value for missing values: `None`
- It is the default value for optional arguments
- It is also the implicit return value of functions without a `return`
- `None` is `None`, not anything else

In [None]:
# None counts as false when converted to a boolean
bool(None)

In [241]:
# None is not equal to the boolean False
None == False

False

In [242]:
# None is not equal to the number 0
None == 0

False

In [243]:
# None is not equal to the empty string
None == ''

False

In [None]:
# None has a type of its own
type(None)

### Special rules for defining positional arguments and keyword arguments

In [246]:
# Intentional TypeError: sorted() accepts only the iterable by position; reverse must be given by keyword
sorted([1, 3, 2, 5], True)

TypeError: sorted expected 1 argument, got 2

In [247]:
# the signature line in the help text shows which parameters are positional-only or keyword-only
help(sorted)

Help on built-in function sorted in module builtins:

sorted(iterable, /, *, key=None, reverse=False)
    Return a new list containing all items from the iterable in ascending order.
    
    A custom key function can be supplied to customize the sort order, and the
    reverse flag can be set to request the result in descending order.



- Parameters before the slash symbol `/` are positional-only. 
- Parameters after the asterisk symbol `*` are keyword-only.
- These rules are applied to enforce clarity and reduce errors in its usage.

### Day 4, Exercise 1
- Link: https://python-bioinfo.bioshu.se/exercises.html
- Extra reading:
    - https://able.bio/rhett/python-functions-and-best-practices--78aclaa
___
#### Take a break after the exercise

## Session 2: Modules and Documentation

### Code structure

- functions
- modules (files)

- Documentation

### Why functions?
- Cleaner code
- Better defined tasks in code
- Re-usability
- Better structure

### Why modules?

- Cleaner code
- Better defined tasks in code
- Re-usability
- Better structure

- Collect all related functions in one file
- Import a module to use its functions
- Only need to understand what the functions do, not how

#### Example of modules

In [None]:
import sys

# sys.argv is the list of command-line arguments the running program was started with
sys.argv

In [248]:
# import a single name (the datetime class) from the datetime module
from datetime import datetime
print(datetime.now())

2024-07-18 10:35:07.506254


### Python standard modules

Check out the [module index](https://docs.python.org/3/py-modindex.html)

How to find the right module?

How to understand it?

### How to find the right module?

- Look at the Python Module Index (https://docs.python.org/3/py-modindex.html)
- Python tutorial at W3Schools (https://www.w3schools.com/python/default.asp)
- Search in the Python Package Index (https://pypi.org)
- Ask your colleagues
- Search the web
- Ask AI

- Standard modules: no installation needed

- Other libraries: install with e.g. 
    - `pip install` 
    - `conda install`

### How to understand it?
- E.g. I want to know how to split a string by the separator `,`

In [None]:
text = 'Programming,is,cool'
# Ask for the documentation of the string method we want to understand
help(text.split)

In [None]:
# sep is given by keyword: split the text at every comma
text.split(sep=',')

- For slightly more complicated problems, e.g. how to download Python logo from internet with `urllib`
- URL: https://www.python.org/static/img/python-logo@2x.png

In [None]:
import urllib



- Sometimes easier to find the answer by searching the web

In [None]:
# `import urllib` alone does not load the `request` submodule; import it explicitly
import urllib.request
url = 'https://www.python.org/static/img/python-logo@2x.png'
# Download the image at `url` and save it under the given local file name
# (the target directory `files/` must already exist)
urllib.request.urlretrieve(url, 'files/python-logo.png')

In [None]:
import math  # the module has to be imported before its functions can be looked up
help(math.sqrt)

In [None]:
# call the function through the module name
math.sqrt(3)

### Various ways of importing

In [250]:
# way 1: import the whole module and reach its functions through the module name
import math 

math.sqrt(3)

1.7320508075688772

In [251]:
# way 2: import the module under a shorter alias; `m` now refers to the math module
import math as m
m.sqrt(3)

1.7320508075688772

In [254]:
# way 3: star import copies every public name of math into the current namespace;
# handy for a quick demo, but it hides where names such as `pi` come from
from math import *
pi

3.141592653589793

In [None]:
from pprint import pprint

### Documentation and commenting your code


Remember `help()`?

In [256]:
# help() prints the documentation that the authors wrote for the function
help(sorted)

Help on built-in function sorted in module builtins:

sorted(iterable, /, *, key=None, reverse=False)
    Return a new list containing all items from the iterable in ascending order.
    
    A custom key function can be supplied to customize the sort order, and the
    reverse flag can be set to request the result in descending order.



- This works because somebody else has documented their code!

#### The help message for functions can be added directly under the function definition line

In [259]:
def process_file(filename, chrom, pos):
    """
    Read VCF file and print all samples that match the specified chromosome and position
    
    Input arguments: VCF file name, chromosome, position
    """
    # `with` closes the file when the block ends, even if an error occurs while reading
    with open(filename) as vcf_file:
        for line in vcf_file:
            # Lines starting with '#' are ignored
            if not line.startswith('#'):
                col = line.split('\t') # VCF is tab delimited
                # col[1] is text read from the file, so convert it before comparing with the integer pos
                if col[0] == chrom and int(col[1]) == pos:
                    print(col[9:]) # samples start from the 10th column

In [260]:
# the docstring written under the def line is what help() displays
help(process_file)

Help on function process_file in module __main__:

process_file(filename, chrom, pos)
    Read VCF file and print all samples that match the specified chromosome and position
    
    Input arguments: VCF file name, chromosome, position



### Your code may have two types of users:

- library users
- maintainers (maybe yourself!)

### Write documentation for both of them!

- #### Library users (docstrings):
  ```python
  """
  What does this function do?
  """
  ```
- #### Maintainers (comments):
  ```python
  # implementation details
  ```
Add a comment when the code needs additional explanation
  ```python
  my_list[5] += other_list[3]  # explain why you do this!
  ```

In [None]:
def process_file(filename, chrom, pos):
    """Read a vcf file, search for lines matching 
    chromosome chrom and position pos.

    Print the genotypes of the matching lines.

    filename: path to the VCF file
    chrom: chromosome name as written in the first column (string)
    pos: position to match; an integer or its string form are both accepted
    """
    # the file columns are text, so compare against the text form of pos
    pos_text = str(pos)
    # `with` closes the file when the block ends, even if an error occurs
    with open(filename) as vcf_file:
        for line in vcf_file:
            if not line.startswith('#'):  # skip comments
                col = line.split('\t')  # file is tab separated
                # Check if chrom and pos match
                if col[0] == chrom and col[1] == pos_text:
                    # genotype starts at column index 9
                    print(col[9:])

### Places for documentation:
#### 1. Add module documentation at the beginning of the file
```python
     """
     This module provides functions for ...
     """
```
#### 2. Add function documentation at every function definition under `def` 
```python
    def function_name(parameters):
        """
        This function performs ...
        """
        # code block
        return value
```

Note: Live coding, show an example of writing documentation

Create an empty file and name it `dna.py`.
First, write a docstring at the beginning of the file.

### Demo, create a Python module called `dna.py` with documentation

In [271]:
# Import the module we have just created and show the help message
import importlib
import dna
# Reload so that edits made to dna.py since the first import are picked up.
# reload() needs an already imported module, so `import dna` has to come first.
importlib.reload(dna)
from dna import get_seqlen

help(dna.cal_gc_content)
# only the value of the last expression in a cell is displayed
dna.cal_gc_content("AAACGCGCG") 
dna.cal_gc_content("AACGT")

Help on function cal_gc_content in module dna:

cal_gc_content(sequence)
    Return the GC content in percentage given a DNA sequence
    
    Input: DNA sequence (string)
    Output: gc_content in percentage (float)



40.0

### Read more about documentation and comment:

https://realpython.com/documenting-python-code/

https://www.python.org/dev/peps/pep-0008/?#comments

### Introduction to SeqIO in Biopython
- An open-source collection of tools for biological computation.
- Provides functionalities for reading, writing, and analyzing biological data.

#### Installing biopython
```bash
pip install biopython
# or
conda install -c conda-forge biopython
```


Note: yesterday we showed a Python script that reads DNA sequences from a file and calculates the sequence length. Here the code has been modified slightly so that it is packed into a function, `get_seqlen_from_file()`.

In [273]:
def get_seqlen_from_file(seqfile):
    """Return the total number of sequence characters in a FASTA file.

    Header lines (those starting with ">") are skipped; every other line is
    stripped of surrounding whitespace and its length is added to the total.
    """
    with open(seqfile, "r") as fasta:
        stripped_lines = (raw.strip() for raw in fasta)
        return sum(len(row) for row in stripped_lines if not row.startswith(">"))
# total length over all sequences in the file
seqlength = get_seqlen_from_file("../downloads/Ecoli-10seq.fna")
print(f"Length of DNA sequence: {seqlength}")

Length of DNA sequence: 9312


### How to do that with Biopython

SeqIO.parse supports many formats, such as
- fastq: Extension of FASTA with quality scores.
- genbank: Rich format that combines sequence data and annotation.
- Stockholm: Used for multiple sequence alignments.
- PIR: protein sequence database format

In [275]:
from Bio import SeqIO
# NOTE(review): the following cell reads the same file name from "../downloads/" — confirm which directory is intended
for record in SeqIO.parse("../files/one_dna_sequence.fa", "fasta"):
    seqlength = len(record.seq)  # record.seq holds the sequence of the current record
# printed once, after the loop: the length of the last record that was read
print(f"Length of DNA sequence: {seqlength}")

Length of DNA sequence: 170


In [276]:
from Bio import SeqIO
# show the three main fields of every record in the file
for record in SeqIO.parse("../downloads/one_dna_sequence.fa", "fasta"): 
    print(f"id: {record.id}") # characters after '>' until the first white space 
    print(f"description: {record.description}") # characters after '>'
    print(f"seq: {record.seq}")

id: random_sequence_1
description: random_sequence_1 | Random generated DNA sequence of length 125
seq: ATGGGACTGCTAGGACTAGCTAGCTCTAGGACTGATCGTAGCTAGCTAGGCTAGCCTAGCAGTCTAGCTAGGACGATCGTAGCCCATAGCGGCTAGCTAGTACTGATCGTGCTAGCTAGCTAGCTGACTAGCTGCGTAGCTAGCTGACTGCTAGGACTAGTTGCGCTAGC


### It works with sequences files with multiple sequences as well

In [None]:
# `!` runs a shell command from the notebook: print the raw content of the FASTA file
!cat ../downloads/Ecoli-10seq.fna

In [278]:
from Bio import SeqIO
# the loop body runs once per sequence in the file
for record in SeqIO.parse("../downloads/Ecoli-10seq.fna", "fasta"): 
    print(f"Length of {record.id}: {len(record.seq)}")

Length of lcl|NC_000913.3_cds_YP_025308.1_1724: 2262
Length of lcl|NC_000913.3_cds_NP_416194.1_1672: 417
Length of lcl|NC_000913.3_cds_NP_418217.1_3710: 264
Length of lcl|NC_000913.3_cds_NP_418490.1_3983: 1293
Length of lcl|NC_000913.3_cds_NP_416901.1_2381: 1257
Length of lcl|NC_000913.3_cds_NP_417887.1_3363: 1434
Length of lcl|NC_000913.3_cds_NP_418643.1_4139: 342
Length of lcl|NC_000913.3_cds_NP_414712.1_172: 852
Length of lcl|NC_000913.3_cds_NP_416467.1_1950: 918
Length of lcl|NC_000913.3_cds_NP_416274.1_1752: 273


### Fetch biological data from the database seamlessly

In [None]:
from Bio import Entrez

# Always tell NCBI who you are
Entrez.email = "nanjiang.shu@scilifelab.se"

# Fetch a nucleotide sequence
handle = Entrez.efetch(db="nucleotide", id="NM_000546", rettype="fasta", retmode="text")
print(handle.read())
# Release the connection once the record has been read
handle.close()


### Further resources
- Biopython Tutorial and Cookbook: http://biopython.org/DIST/docs/tutorial/Tutorial.html
- Biopython Documentation: https://biopython.org/wiki/Documentation

### Day 4, Exercise 2
- Link: https://python-bioinfo.bioshu.se/exercises.html
___
##### Break
___
### Quiz of Day 4
- Link: https://python-bioinfo.bioshu.se/quiz.html
___
#### Lunch
___
### Project time after lunch