# Bad Blood

The objective of this program is to analyse any IR spectra and extract info from their file names.

Written by Mario González Jiménez  
School of Chemistry, University of Glasgow  
Glasgow G12 8QQ, UK.



### Changelog

- Version 1.0.
    - Original Program based on ``Loco mosquito 5.0``
- Version 1.1.
    - Works with filenames of any number of categories.

## What spectra can Bad Blood read?

Bad Blood can read any ``*.dpt`` or ``*.mzz`` file. To extract info from the name, the different categories must be separated in sections by a hyphen and **ALL** the files of the folder must have the same number of sections. If not, the program will not work. Bad Blood will interpret each section and if there are enough sections prepare a table with the number of spectra in each category.




### Useful modules

These are the modules that will be used by Bad Blood

In [None]:
#we will use numpy to process the data
import numpy as np 
# to remove the water and carbon dioxide spectra from our data
from scipy import optimize

# to make the figures
import matplotlib.pyplot as plt 

# to manage the files 
import os
# to deal with file paths on Windows, Mac and Linux
from pathlib import Path 
# to manage the data textfiles
import csv 
# to decompress the mzz files
import zipfile 
import zlib

# to control the program execution
import time 
# to know the progress in the slow parts
from tqdm import tqdm 

# helper used to verify that every file name has the same number of sections
def equality(listina):
    """Tell whether all the elements of ``listina`` are equal to the first one.

    ``listina`` can be any iterable. An empty iterable is considered
    homogeneous, so ``True`` is returned for it.
    """
    nothing = object()
    elements = iter(listina)
    first = next(elements, nothing)
    if first is nothing:
        return True
    for other in elements:
        if not (first == other):
            return False
    return True

First you must indicate the folder location. To do this, the easiest way to proceed is just going to a file in the folder where the spectra are, right-click on it, select properties, and, then, copy the text at the right of "Location:". **It is not necessary** to add `\` or `/` at the end of the path.

In [None]:
# Ask the user for the folder that holds the spectra; the answer is stored as typed (plain text)
spectra_path = input("Please, indicate the folder location: ")

The following algorithm checks that the files in the folder are correctly named.

In [None]:
# we include the path module that helps a lot with the / or \ problem
spectra_path = Path(spectra_path)
# find all the .mzz and .dpt files in the folder (and its subfolders)
dptfiles = []
mzzfiles = []
for root, dirs, files in os.walk(spectra_path):
    for file in files:
        if file.endswith(".dpt"):
            dptfiles.append(Path(root) / file)
        elif file.endswith(".mzz"):
            mzzfiles.append(Path(root) / file)
# If there are *.mzz files, we will not consider the *.dpt files
mzzq = len(mzzfiles) > 0
spectra_names = mzzfiles if mzzq else dptfiles
# Without spectra nothing below can work (the number of sections would stay undefined),
# so we stop here with a clear message instead of reporting that everything is fine.
if not spectra_names:
    raise FileNotFoundError("No *.dpt or *.mzz files were found in {}".format(spectra_path))
# naimeision stores the number of sections of every file name
naimeision = []
for i in spectra_names:
    # To obtain the info from the name of the file, first we split the name in the different sections
    # (the extension and anything after the first blank space are ignored):
    tmp = os.path.basename(i).split(".")[0].split(" ")[0].split("-")
    # tmp2 is kept as a global: the next step uses it as the number of sections
    tmp2 = len(tmp)
    naimeision.append(tmp2)
if not equality(naimeision):
    # The files with the least frequent number of sections are the suspects
    lf = list(set(naimeision))
    rarest = min(lf, key=naimeision.count)
    jf = [os.path.basename(name)
          for name, sections in zip(spectra_names, naimeision)
          if sections == rarest]
    print("Attention!!!!")
    print("Not all files have the same number of sections. These are probably the files that are misnamed:")
    print("")
    for j in jf:
        print("     "+j)
else:
    print("Everything seems all right. You may continue.")

If everything went well, you can proceed with extracting the info from the names. This first part of the algorithm classifies the data and, if it detects dates, lets you know if there are problems with their format:

In [None]:
if equality(naimeision):
    # tembo: one list per section position, holding that section of every file name
    tembo = [[] for _ in range(tmp2)]
    for spectrum_file in spectra_names:
        # the extension and anything after the first blank space are ignored
        pieces = os.path.basename(spectrum_file).split(".")[0].split(" ")[0].split("-")
        for position in range(tmp2):
            tembo[position].append(pieces[position])
    # hakuna: the different values found at each section position
    hakuna = [list(set(column)) for column in tembo]
    # matata: the kind of each section. Up to 10 different values means a category ("Cat");
    # with more values, it is a date ("Dat") when the first value has 6 characters and
    # starts with 1 or 2, and an identifier ("ID") otherwise
    matata = []
    for options in hakuna:
        if len(options) <= 10:
            matata.append("Cat")
        elif len(options[0]) == 6 and options[0][0] in ("1", "2"):
            matata.append("Dat")
        else:
            matata.append("ID")
    # every value of a date section must have exactly 6 characters
    jf = []
    for column, kind in zip(tembo, matata):
        if kind != "Dat":
            continue
        for file_index, value in enumerate(column):
            if len(value) != 6:
                jf.append(os.path.basename(spectra_names[file_index]))
    if jf:
        print("Attention!!!!")
        print("There are files with wrond date format. These are probably the files that are misnamed:")
        print("")
        for misnamed in jf:
            print("     "+misnamed)
else:
    print("I told you this was not going to run unless you solve the problem with the names")

The second part of the algorithm creates the matrix with the extracted data:

In [None]:
# matrix will hold one row per loaded spectrum:
#   [[first wavenumber, last wavenumber, number of points], intensities, non-date sections..., storage time]
# (the storage time is only added when the names contain exactly two dates)
matrix = []
# positions of the sections that are copied to the row as they are (everything but dates)
mbuni = [m for m,n in enumerate(matata) if n != "Dat"]
# positions of the sections that were identified as dates
kifaru = [m for m,n in enumerate(matata) if n == "Dat"]
# files skipped because their dates could not be interpreted
wrong_named = []
# files that really end up in the matrix, in the same order as its rows
loaded_names = []
# Now we load the spectra in a matrix
for i in tqdm(spectra_names):
    # To obtain the info from the name of the file, first we split the name in the different sections:
    tmp = os.path.basename(i).split(".")[0].split(" ")[0].split("-")
    # When there are two dates we calculate the days between them. This is done before
    # reading the file so that the misnamed files are skipped without opening them.
    stime = None
    if len(kifaru) == 2:
        try:
            colday = time.mktime(time.strptime(tmp[kifaru[0]],"%y%m%d"))
            mesday = time.mktime(time.strptime(tmp[kifaru[1]],"%y%m%d"))
        except (ValueError, OverflowError):
            # the text is not a valid yymmdd date
            wrong_named.append(i)
            continue
        stime = round(abs((mesday - colday) / (3600 * 24)))
    # Then the spectrum and its characteristics
    if not mzzq:
        # *.dpt files: two columns (wavenumber and intensity) separated by tabs or commas.
        # The file handle needs its own name: reusing "tmp" would destroy the name sections.
        with open(i, 'rb') as dpt_file:
            avmi = (line.replace(b'\t',b',') for line in dpt_file)
            spectrum = np.genfromtxt(avmi, delimiter=',')
        start = spectrum[0,0]
        end = spectrum[-1,0]
        ls = len(spectrum)
        spectrum = np.transpose(spectrum)[1]
    else:
        # *.mzz files: a zip whose first member contains the first wavenumber, the last
        # wavenumber and the number of points, followed by the intensities
        with zipfile.ZipFile(i) as myzip:
            tmpname = myzip.namelist()[0]
            with myzip.open(tmpname) as myfile:
                spectrum = np.genfromtxt(myfile, delimiter=',')
        start = spectrum[0]
        end = spectrum[1]
        ls = int(spectrum[2])
        spectrum = spectrum[3:]
    # And then we incorporate all the info to the matrix
    fisi = [[start,end,ls],spectrum] + [tmp[j] for j in mbuni]
    if stime is not None:
        fisi.append(stime)
    matrix.append(fisi)
    loaded_names.append(i)
# The following steps report the files as spectra_names[position in matrix]. The skipped
# files are therefore removed from spectra_names, otherwise the wrong names would be shown.
spectra_names = loaded_names

This is the list with the files that the algorithm detected with wrong date formats:

In [None]:
# Show the files that were skipped while loading because their dates could not be parsed
wrong_named

## Number of spectra for each category

Coming soon!!



## Detection of spectra with low quality

### Spectra with low intensity

If the mosquito was not well placed at the ATR's crystal, the intensity of the whole spectrum is small. Our experience says that we can use as reference the small plateau between 400 and 500 wavenumbers that the mosquito spectra usually have. Since the spectrometer with ZnSe optics only can reach to 500 wavenumbers, it doesn't matter if we extend this range to 600 cm<sup>-1</sup>. Then if the average of this reference is smaller than 0.11, the spectrum doesn't have enough quality to be scaled and, then, of course, employed.

In [None]:
# A list of the discarded spectra will be collected:
bad_spectra = []
for i in range(len(matrix)):
    # entries that are already empty (for instance if this cell is run twice) are ignored
    if not matrix[i]:
        continue
    first_wn, last_wn, n_points = matrix[i][0]
    # if the spectrum doesn't reach 600 cm-1 we cannot prove if the spectrum has enough intensity
    if not last_wn < 600:
        raise Exception("The spectrum {} doesn't reach 600 cm-1".format(spectra_names[i]))
    # first we calculate the position of the points that comprise that section of the spectrum
    points_per_wn = (n_points - 1) / (last_wn - first_wn)
    sta = int(round((points_per_wn * (600 - first_wn)) + 1)) - 1
    if last_wn > 400:
        # the spectrum finishes between 600 and 400 cm-1: we use everything from 600 cm-1 to its end
        end = n_points
    else:
        # the spectrum goes beyond 400 cm-1: we use the region between 600 and 400 cm-1
        end = int(round((points_per_wn * (400 - first_wn)) + 1)) - 1
    # Now we check the intensity of the spectra in that region. If its average is below 0.11 we discard the spectrum
    if np.average(matrix[i][1][sta:end]) < 0.11:
        bad_spectra.append("LI: " + str(spectra_names[i]))
        matrix[i] = None
if len(bad_spectra) == 1:
    print("1 spectrum has been discarded because its low intensity")
else:
    print(str(len(bad_spectra)) + " spectra have been discarded because their low intensity")

### Spectra with abnormal background

Bruker spectrometers sometimes measure spectra whose characteristics have been masked by a high-absorbance band covering the entire mid-infrared region. I do not know the origin of this band. I used to think it was due to the interaction of the ATR crystal with the metallic anvil, but now I'm inclined to think it's due to some vibration or shock during measurement. Sometimes these spectra pass the previous filters and it is necessary to remove them. To do so, we select the wavenumber with the least signal from the mosquito (this wavenumber is usually 1900 cm<sup>-1</sup>) and look for outliers at that frequency.

In [None]:
bs = 0 # number of spectra removed by this filter
# Width of the fences, in interquartile ranges (in statistics 1.5 times the
# interquartile range is the inner fence and 3 times is the outer fence)
l = 2.5
# Intensity at 1900 cm-1 of every remaining spectrum, stored with its position in the matrix.
# The spectra are expected to contain 1900 cm-1, so this is not checked.
reference = {}
for position, entry in enumerate(matrix):
    if not entry: # already discarded
        continue
    first_wn, last_wn, n_points = entry[0]
    sta = int(round((((n_points - 1) / (last_wn - first_wn)) * (1900 - first_wn)) + 1)) - 1
    reference[position] = entry[1][sta]
li = list(reference.values())
q3, q1 = np.percentile(li, [75 ,25])
ir = q3 - q1
upper_fence = l * ir + q3
lower_fence = q1 - l * ir
# Any spectrum outside the fences is an outlier and is removed
for position, intensity in reference.items():
    if intensity > upper_fence or intensity < lower_fence:
        bs += 1
        bad_spectra.append("SA: " + str(spectra_names[position]))
        matrix[position] = None
if bs == 1:
    print("1 spectrum has been discarded because it was distorted by the anvil")
else:
    print(str(bs) + " spectra have been discarded because they were distorted by the anvil")

### Spectra with atmospheric interferences

If the spectra were measured after the change of the beamsplitter or after installing the ATR and the background was not correctly measured, the spectra will show the interference of the water and CO<sub>2</sub> spectra. In the case of water vapour, its IR spectrum has three bands with many narrow peaks that can appear as noise: one between 4000 and 3400 cm<sup>-1</sup>, another between 2200 and 1300 cm<sup>-1</sup>, and the last one starts to appear below 800 cm<sup>-1</sup>. CO<sub>2</sub> has one strong band at 2345 cm<sup>-1</sup> and two smaller bands (unless you use a very fine resolution, you will not see its narrow peaks) at 3650 and 750 cm<sup>-1</sup>. Since some of these features appear in the same regions as the most interesting peaks from mosquitos, it is necessary to get rid of them. The following algorithms (originally part of a program named after another Iggy Pop song: ``Candy 0.0``) check for the presence of these bands:

In [None]:
bs = 0 # number of spectra removed by this filter
mycollection = []
# Goodness of the fit of some data to a polynomial
def rs_pf(x, y, degree):
    """Fit ``y`` against ``x`` with a polynomial and return its R-squared.

    The polynomial of the given ``degree`` is obtained with ``np.polyfit``.
    The returned value is the sum of squares of the fitted values around
    the mean of ``y`` divided by the sum of squares of ``y`` around that
    same mean.
    """
    model = np.poly1d(np.polyfit(x, y, degree))
    fitted = model(x)
    mean_y = np.sum(y) / len(y)
    explained = np.sum((fitted - mean_y) ** 2)
    total = np.sum((y - mean_y) ** 2)
    return explained / total

# The region between 3900 and 3500 cm-1 of every remaining spectrum is fitted to a
# 5th degree polynomial; a poor fit reveals the narrow peaks of the atmospheric bands
for position, entry in enumerate(matrix):
    if not entry: # already discarded
        continue
    # The spectra are expected to reach 3900 cm-1, so this is not checked.
    first_wn, last_wn, n_points = entry[0]
    points_per_wn = (n_points - 1) / (last_wn - first_wn)
    sta = int(round((points_per_wn * (3900 - first_wn)) + 1)) - 1
    end = int(round((points_per_wn * (3500 - first_wn)) + 1)) - 1
    # the intensities of the region, fitted against their own position
    yd = entry[1][sta:end]
    xd = list(range(len(yd)))
    rs = rs_pf(xd, yd, 5)
    # spectra with an R-squared under 0.96 are removed
    if rs < 0.96:
        bs += 1
        bad_spectra.append("AI: " + str(spectra_names[position]))
        matrix[position] = None
if bs == 1:
    print("1 spectrum has been discarded because has atmospheric interferences")
else:
    print(str(bs) + " spectra have been discarded because have atmospheric interferences")

## Number of spectra after screening

Coming soon!!

## Selection of the wavenumbers and data extraction

Now, we proceed to finish the task by extracting the intensity of the remaining spectra at the wavenumbers that we want. We just need to indicate those wavenumbers in the following list (we always have to select more than one wavenumber). If we want to extract a range, write only two numbers: the starting wavenumber and the ending wavenumber.

In [None]:
# Wavenumbers whose intensity will be extracted; a list with exactly two values is treated as a range
wns = [3855, 3400, 3275, 2922, 2853, 1900, 1745, 1635, 1539, 1457, 1306, 1154, 1076, 1027, 880, 525]

This algorithm corrects the wavenumbers selected in case they contain wavenumbers that are not in our spectra range

In [None]:
# it is very important to have the wavenumbers sorted from higher to lower
wns.sort(reverse = True)

# Now we check the highest and lowest wavenumbers measured by the remaining spectra
a = [entry[0][0] for entry in matrix if entry]
b = [entry[0][1] for entry in matrix if entry]
if not a:
    raise ValueError("There are no spectra left, so the wavenumbers cannot be checked")
if not wns:
    raise ValueError("No wavenumbers have been selected")
a = max(a)
b = min(b)

# einselechta remembers what was requested BEFORE any correction: a list with exactly two
# numbers is a range (False), anything else is a set of individual wavenumbers (True).
# If later only two individual wavenumbers remain, they will not be mistaken for a range.
einselechta = len(wns) != 2

# Integer limits that are guaranteed to be inside the measured interval: a is rounded
# down and b is rounded up. (Rounding b down gave a wavenumber smaller than b, which
# could never be accepted and made the correction below loop for ever.)
top = int(a)
bottom = int(b)
if bottom < b:
    bottom += 1

# Now we correct the wavenumbers selected that are bigger than our highest measured wavenumber
if wns[0] > a:
    if len(wns) == 2:
        wns[0] = top
    else:
        while wns and wns[0] > a:
            wns.pop(0)
        # if (almost) nothing valid remains, the highest measured wavenumber is used
        if len(wns) <= 1 or wns[0] < b:
            wns.insert(0, top)

# And we do the same with the smaller wavenumbers:
if wns[-1] < b:
    if len(wns) == 2:
        wns[-1] = bottom
    else:
        while len(wns) > 1 and wns[-1] < b:
            wns.pop()
        # if only one wavenumber remains, the lowest measured wavenumber is added
        if len(wns) == 1:
            wns.append(bottom)

We can see in a spectrum the wavenumbers selected

In [None]:
# We select a random spectrum among those that have not been discarded.
# (np.random.randint excludes its upper limit, so drawing from the list of valid
# positions lets every spectrum, including the last one, be chosen.)
valid = [position for position, entry in enumerate(matrix) if entry]
if not valid:
    raise ValueError("There are no spectra left to plot")
n = valid[np.random.randint(len(valid))]

# we prepare the data
a = matrix[n][0][0]
b = matrix[n][0][1]
c = matrix[n][0][2]
# c points evenly spaced from a to b, both included: the same relation between
# position and wavenumber that is used to locate the wavenumbers in the spectra
xd = np.linspace(a, b, c)
yd = matrix[n][1]

# we draw the plot
plt.figure(figsize=(14,7))
plt.plot(xd,yd)
plt.xlim(a, b)

# and the selected wavenumbers: a shaded band for a range, vertical lines otherwise
is_range = len(wns) == 2 and einselechta == False
if is_range:
    plt.axvspan(wns[0], wns[1], facecolor='papayawhip', alpha=0.5)
else:
    for i in wns:
        plt.axvline(x=i, c='black', lw = 1)

# labels
plt.xlabel('Wavenumber')
plt.ylabel('Absorbance')
if is_range:
    plt.title('Selected Range')
else:
    plt.title('Selected Wavenumbers')

plt.show()

## Choose your own matrix

Now it is the time to export the data. Since ``Bad Blood`` can deal with a lot of data, here is an algorithm to select only the spectra that we want for each category. These are the available options for each category: 

In [None]:
# simba: positions (inside the file name) of the sections classified as categories
simba = [position for position, kind in enumerate(matata) if kind == "Cat"]
for position in simba:
    # the categories are numbered from 1, following their position in the name
    heading = "Cat " + str(position+1) + " - Options:"
    print(heading)
    print("---------------- ")
    for option in hakuna[position]:
        print("      " + option)

You can make the selection by writing down in a list the options you want to use. These options have to be in lists. If you want to include all, just write ``"all"`` 

In [None]:
# One list of accepted options per category; a list starting with "all" accepts every option of that category
sel = [["all"],["IG", "SU"],["SF"]]

And now, the following algorithm will extract from the spectra the desired intensities and the desired characteristics. 

In [None]:
# We start the timer
a = time.time()

# We define the variable that will contain the final data
fida = []
csc = 0   # spectra rejected because they do not cover the selected wavenumbers
ssel = 0  # spectra included in the final matrix

# We convert the "all" statements (with any capitalisation) into the full list of options:
for m, n in enumerate(sel):
    if n and str(n[0]).lower() == "all":
        sel[m] = hakuna[simba[m]]

# If we have selected a range of wavenumbers, now it is time to change it into discrete wavenumbers
# First, we look for the boundaries of the data in the first spectrum that has not been discarded
kk = next((position for position, entry in enumerate(matrix) if entry), None)
if len(wns) == 2 and einselechta == False:
    if kk is None:
        raise ValueError("There are no spectra left to extract data from")
    resolution = 2
    if matrix[kk][0][0] < wns[0]:
        wns[0] = int(matrix[kk][0][0])
    if matrix[kk][0][1] > wns[-1]:
        # matrix[kk][0] is [first wavenumber, last wavenumber, number of points]:
        # the limit of the range is the last wavenumber (index 1), not the number of points
        wns[-1] = int(matrix[kk][0][1])+1
    wns = list(range(wns[0],wns[1],-resolution))

# Column, inside each row of the matrix, of every category. The rows keep the non-date
# sections (categories AND identifiers) in the order given by mbuni, so the column of a
# category depends on its position in mbuni and not on its position in sel.
cat_columns = [2 + mbuni.index(section) for section in simba]

# And start the algorithm to extract the info
for i in tqdm(matrix):
    # If that item does not exist, it was discarded during the screening
    if not i:
        continue
    # The spectrum is evaluated on its own: it must match the selection of every category
    if not all(i[column] in options for column, options in zip(cat_columns, sel)):
        continue
    # we count the number of spectra that are removed because are too short for the range of wavenumbers selected
    if not (i[0][0] >= wns[0] and i[0][1] <= wns[-1]):
        csc += 1
        continue
    # position of each selected wavenumber in the spectrum and its intensity
    points_per_wn = (i[0][2] - 1) / (i[0][1] - i[0][0])
    lint = [i[1][int(round((points_per_wn * (j - i[0][0])) + 1)) - 1] for j in wns]
    # info obtained from the name: the non-date sections and, if present, the storage time
    fuzz = list(i[2:2+len(mbuni)])
    if len(kifaru) == 2:
        fuzz.append(str(int(i[2+len(mbuni)])))
    fida.append(fuzz + lint)
    ssel += 1
fida = sorted(fida)
# The header of the table: the name of each column of info followed by the wavenumbers
fluf = []
for i in mbuni:
    if matata[i] == "Cat":
        fluf.append(matata[i] + str(i+1))
    else:
        fluf.append(matata[i])
if len(kifaru) == 2:
    fluf.append("StoTime")
fida.insert(0,fluf + wns)
if csc == 1:
    print("1 spectrum has been discarded because was shorter than the selected wavenumbers")
elif csc > 1:
    print(str(csc) + " spectra have been discarded because were shorter than the selected wavenumbers")
b = time.time()
print("This last process has lasted " + str(round(b-a,3)) + " s. The new matrix cotains " + str(ssel) + " spectra.")

## Exporting the matrix

Now we export the matrix with the info to a file named ``the matrix.dat``, which is saved in the parent folder of the folder where the data were collected

In [None]:
# The table is written as tab-separated text next to the folder of the spectra (in its parent folder).
# newline='' leaves the line endings to the csv module; without it an extra blank
# line can appear after every row on Windows.
with open(spectra_path.parent / Path("the matrix.dat"), 'w', newline='') as file:
    sc = csv.writer(file, delimiter='\t')
    sc.writerows(fida)