# Day 4 - Solved Examples

## Stats Modules

In [None]:
import pandas as pd

# Group the rows of df by the "Function" column once and reuse the grouping.
by_function = df.groupby("Function")
log2fc_by_function = by_function["log2FC"]

# Per-group mean and standard deviation of log2FC, then per-group total
# of "Gene Length".
print(log2fc_by_function.mean())
print(log2fc_by_function.std())
print(by_function["Gene Length"].sum())


In [None]:
from scipy.stats import shapiro, mannwhitneyu, kruskal, spearmanr, wilcoxon

def test_vs_WTavg(sample):
    """Print four statistical test results for one sample.

    The values for the sample are looked up as ``all_samples[sample]``.
    A Shapiro-Wilk test is run on those values alone; the Mann-Whitney U
    test, Spearman rank correlation and Wilcoxon signed-rank test each
    compare ``WT_mean_values`` with them.

    Both ``all_samples`` and ``WT_mean_values`` are module-level names
    defined outside this function. Nothing is returned; every result is
    printed rounded to 5 decimal places.
    """
    # Look the sample up once and reuse it for all four tests
    observations = all_samples[sample]

    # Shapiro-Wilk test on the sample itself
    stat, p_value = shapiro(observations)
    print(f"Normality:\t\t   Stat = {round(stat,5)}\tp = {round(p_value,5)}")

    # Mann-Whitney U: WT mean values versus the sample
    stat, p = mannwhitneyu(WT_mean_values, observations)
    print(f'Mann-Whitney U Test:\t   Stat = {round(stat,5)}\tp = {round(p,5)}')

    # Spearman rank correlation: WT mean values versus the sample
    coef, p = spearmanr(WT_mean_values, observations)
    print(f'Spearman Rank Correlation: Coeff = {round(coef,5)}\tp = {round(p,5)}')

    # Wilcoxon signed-rank: WT mean values versus the sample
    stat, p = wilcoxon(WT_mean_values, observations)
    print(f'Wilcoxon Signed-Rank Test: Stat = {round(stat,5)}\tp = {round(p,5)}')

# Print one titled section per sample: the sample name, an underline,
# the four test results, then a blank separator line.
for sample in all_samples:
    print(sample + "\n====")
    test_vs_WTavg(sample)
    print()

## Data visualisation

## Argparse

In [None]:
#!/usr/bin/python3

import argparse
from Bio import SeqIO, Seq


# Initialise the parser class
parser = argparse.ArgumentParser(description='Description of your script')

# Define some options/arguments/parameters
# --input is mandatory: without it SeqIO.parse() below would be handed None.
parser.add_argument('-i', '--input', required=True, help='Path to input file')
# NOTE: --output is accepted, but nothing below writes to it yet.
parser.add_argument('-o', '--output', help='Path to output file', default='my_output.txt')
# type=int makes argparse validate the value and report a usage error
# instead of failing later with a traceback inside the loop.
parser.add_argument('-m', '--min_length', type=int, default=900, help='minimum length required')

# Parse the command line into a Namespace (attributes: input, output, min_length)
args = parser.parse_args()


# Report the length and GC percentage of every record longer than --min_length
for seq_record in SeqIO.parse(args.input, 'fasta'):
    seq_len = len(seq_record)

    # Filter first so GC is only computed for records that will be printed.
    # The explicit zero check prevents a division by zero on an empty record
    # when a negative --min_length is supplied.
    if seq_len == 0 or seq_len <= args.min_length:
        continue

    GC = (seq_record.seq.count("G") + seq_record.seq.count("C")) / seq_len * 100
    print("Sequence", seq_record.id, "has length", seq_len, "and GC of", str(round(GC, 2)) + "%")

## Classes

In [None]:
import re

class Sequence:
    """A named nucleotide sequence with a few simple analysis helpers."""

    def __init__(self, ID: str, raw: str) -> None:
        """Store the identifier and the raw sequence string.

        Args:
            ID: Identifier used in the messages this class produces.
            raw: The sequence itself, as a string.
        """
        self.id = ID
        self.raw = raw
        # Length is captured once here; it is not refreshed if raw is reassigned.
        self.length = len(raw)

    def __repr__(self) -> str:
        """Return a short description containing the id and the length."""
        return f"The sequence object for {self.id} (length: {self.length})"

    def calc_GC(self) -> str:
        """Return the GC percentage as a string such as ``"66.67%"``.

        Only upper-case "G" and "C" are counted. The value is rounded to
        2 decimal places. An empty sequence yields ``"0.0%"``.
        """
        if not self.length:
            # No bases at all: report zero instead of dividing by zero
            gc_perc = 0.0
        else:
            gc_perc = (self.raw.count("G") + self.raw.count("C")) / self.length * 100
        return str(round(gc_perc, 2)) + "%"

    def translateRNA(self) -> str:
        """Return the sequence with every "T" replaced by "U".

        Despite the method name, no codon-to-protein translation is done;
        this is a plain character substitution.
        """
        return self.raw.replace("T", "U")

    def motif_search(self, motif: str) -> str:
        """Report how many times ``motif`` occurs in the sequence.

        The count comes from ``str.count``, so the match is exact and
        case-sensitive, and overlapping occurrences are not counted.

        Args:
            motif: The substring to look for.

        Returns:
            A message with the count, or a "not found" message when the
            count is zero.
        """
        motif_count = self.raw.count(motif)

        if motif_count == 0:
            return f"Motif {motif} was not found in {self.id}"
        return f"Motif {motif} found in {self.id} {motif_count} times"

## Some tests here

# Two example records used to exercise every Sequence method
my_first_sequence = Sequence(
    "HSA01",
    "TGTGTCATGCAAAACTAGGTCATGCGTCCGCTGACTGATGACTGACACTGGTGGCACAACTGACTGAC",
)
my_second_sequence = Sequence(
    "MET01",
    "AAAAAAACGCGACTACGCGGCGACTATGTGTCATGCAAAACTAGGTCATGCGTCCGCTTGTGTGTGCAACGATGCGACTA",
)
demo_sequences = (my_first_sequence, my_second_sequence)

# Object representation of each record
for demo in demo_sequences:
    print(demo)

# GC percentage of each record
for demo in demo_sequences:
    print(demo.calc_GC())

# T -> U converted sequence of each record
for demo in demo_sequences:
    print(demo.translateRNA())

# Motif report for each record
for demo in demo_sequences:
    print(demo.motif_search("TGACTG"))
