# Analysis of microarray data from patients with myotonic dystrophy.

# Pre-process & Normalise data

In [1]:
# First check if all CEL files are present. We expect data from 35 patients

import os, os.path

CEL_LOCATION = os.path.join("CELfiles", "blood")
EXPECTED_CEL_COUNT = 35  # number of patients we expect data from

# os.listdir returns entries in arbitrary order; sort them so that every run
# sees the CEL files in the same order.
CELFILES = sorted(
    name for name in os.listdir(CEL_LOCATION) if name.endswith(".CEL")
)

assert len(CELFILES) == EXPECTED_CEL_COUNT, (
    "expected {} CEL files in {}, found {}".format(
        EXPECTED_CEL_COUNT, CEL_LOCATION, len(CELFILES)
    )
)

In [2]:
# Carry out RMA normalisation on all CEL files. Write output to corefile. This step takes a few minutes.

import utils
corefile = os.path.join(CEL_LOCATION, "core")
# Normalisation is slow, so it is only run when its output file is missing.
# Checking with os.path.isfile avoids opening a file handle just to probe
# for existence.
if os.path.isfile(corefile):
    print("corefile exists")
else:
    utils.executeNotebook("normalise.ipynb", CEL_LOCATION)
    print("corefile created")

corefile exists


In [3]:
# Retrieve HUEX IDs from CEL files

import sys
import re
import json

def processCELFile(basePath, lastPath, maxLines=20):
    """Extract the HUEX1A11 identifier from the start of a CEL file.

    The file at ``os.path.join(basePath, lastPath)`` is opened in binary mode
    and at most ``maxLines`` lines are scanned for an optional run of digits
    followed by the literal ``HUEX1A11``.

    Args:
        basePath: Directory containing the CEL file.
        lastPath: Name of the CEL file inside ``basePath``.
        maxLines: Number of leading lines to inspect (default 20).

    Returns:
        The matched identifier as a ``str``. If several of the inspected
        lines match, the match from the last such line is returned. If no
        line matches, the empty string is returned.
    """
    relPath = os.path.join(basePath, lastPath)
    # Raw bytes pattern: the file is read as bytes, and the raw prefix keeps
    # "\d" from being treated as an (invalid) string escape.
    pattern = re.compile(rb"\d*HUEX1A11")
    result = b""
    with open(relPath, "rb") as f:
        # Read line by line instead of loading the whole file into memory;
        # only the first maxLines lines are of interest.
        for _ in range(maxLines):
            line = f.readline()
            if not line:
                break  # end of file reached early
            match = pattern.search(line)
            if match:
                result = match.group()
    return result.decode("utf-8")
# Map every CEL file name to the identifier extracted from that file.
CELTOHUAX = dict()
for celfile in CELFILES:
    extractedId = processCELFile(CEL_LOCATION, celfile)
    CELTOHUAX[celfile] = str(extractedId)

In [4]:
# Swap IDs from CEL to HUEX in the header row of the core file.
filename = os.path.join(CEL_LOCATION, "core")
with open(filename, "r") as f:
    # Only the header row is needed, so avoid loading the whole file.
    headerLine = f.readline()
if not headerLine:
    raise ValueError(
        "{} is empty; expected a tab-separated header row".format(filename)
    )

# The first field is skipped (it is replaced by "IDs" below); every other
# field is looked up in CELTOHUAX.
swapped = [CELTOHUAX[column] for column in headerLine.rstrip().split("\t")[1:]]

swappedLine = "IDs\t" + "\t".join(swapped)
with open("normalisedFirstLine", "w") as f:
    f.write(swappedLine)
    f.write("\n")

In [5]:
# finish off the job using a bash script

import subprocess
# Run the script, capturing both output streams so they can be shown below.
completed = subprocess.run(
    "./analyseBlood.sh", stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
print("stdout")
print(completed.stdout.decode("utf-8"))
print("stderr")
print(completed.stderr.decode("utf-8"))
# Show the captured output first, then raise CalledProcessError if the script
# exited with a non-zero status, so a failed run is not mistaken for success.
completed.check_returncode()

stdout
cleaning the dataset normalised.
file cleaned written to /home/picrin/programming/phd/kurkiewicz-transcriptomics
file containes 22017 lines

sderr
35 None None
189676HUEX1A11
189694HUEX1A11
189682HUEX1A11
189598HUEX1A11
189688HUEX1A11
189604HUEX1A11
190217HUEX1A11
189700HUEX1A11
189706HUEX1A11
190223HUEX1A11
189610HUEX1A11
189616HUEX1A11
189622HUEX1A11
189628HUEX1A11
189670HUEX1A11
189664HUEX1A11
189730HUEX1A11
189634HUEX1A11
190229HUEX1A11
190235HUEX1A11
189640HUEX1A11
190241HUEX1A11
189712HUEX1A11
190247HUEX1A11
189718HUEX1A11
189646HUEX1A11
190253HUEX1A11
189652HUEX1A11
190259HUEX1A11
189724HUEX1A11
190265HUEX1A11
189592HUEX1A11
189586HUEX1A11
189658HUEX1A11
190271HUEX1A11
189676HUEX1A11
189694HUEX1A11
189682HUEX1A11
189598HUEX1A11
189688HUEX1A11
189604HUEX1A11
190217HUEX1A11
189700HUEX1A11
189706HUEX1A11
190223HUEX1A11
189610HUEX1A11
189616HUEX1A11
189622HUEX1A11
189628HUEX1A11
189670HUEX1A11
189664HUEX1A11
189730HUEX1A11
189634HUEX1A11
190229HUEX1A11
190235HUEX1A11
189640HU

# Analyse the data using Numpy