In [1]:
import pandas as pd
from docx import Document
import regex as re
import glob
from lxml import etree

from collections import Counter, defaultdict

In [2]:
fastaSequenceMirRE = re.compile(r'^[ATCGU]{15,30}$')
mirnaFasta = re.compile(r'>.*\n[ATCGU]{15,30}$')
fastaStartRE = re.compile(r'^>\S\S\S\S+')

In [3]:
print(fastaSequenceMirRE.search("GTGTGTTCTCTTATGTTGGTTG"))
print(fastaSequenceMirRE.search("Sequence: GTGTGTTCTCTTATGTTGGTTG"))

print(fastaSequenceMirRE.search("AUACAUGUGUGGCGUUGAUGGA"))


<regex.Match object; span=(0, 22), match='GTGTGTTCTCTTATGTTGGTTG'>
None
<regex.Match object; span=(0, 22), match='AUACAUGUGUGGCGUUGAUGGA'>


In [4]:
print(mirnaFasta.search("""
521_94_22_3p cand1

>seq_3430 22 bp
GAAAUACCAGUGGCUUACCGCA"""))

<regex.Match object; span=(21, 59), match='>seq_3430 22 bp\nGAAAUACCAGUGGCUUACCGCA'>


In [38]:
def is_empty(elem):
    if elem is None:
        return True

    if len(str(elem)) == 0:
        return True

    return False

def test_substrings_included(word, substrs):
    for x in substrs:
        if x in word:
            return True
    return False

def test_is_sequence( word):
    mirSequence = str(word).replace("Sequence: ", "").strip()
    match = fastaSequenceMirRE.search(mirSequence)

    if match:
        return True

    match = mirnaFasta.search(mirSequence)
    if match:
        return True

    return False
    
def column_contains_mirnacells(df, indexRow, xi):
    

    foundSeqs = 0
    for ri, row in df.loc[indexRow+1:min(10, df.shape[0]),].iterrows():
        
        cellContent = str(row[xi])

        #print('"'+cellContent+'"')
        #print(mirnaFasta.search(cellContent))

        if mirnaFasta.search(cellContent):
            foundSeqs += 1

    if foundSeqs > 0:
        return True

    print("[column_contains_mirnacells] Column", xi, "is not a valid miRNA Sequence", foundSeqs)
    return False

def column_contains_mirnas(df, indexRow, xi):

    foundSeqs = 0
    testedSeqs = 0
    for ri, row in df.loc[indexRow+1:min(10, df.shape[0]),].iterrows():
        
        word = str(row[xi])

        #print(word, test_is_sequence(word))
        if not is_empty(word):
            testedSeqs+= 1

        if test_is_sequence(word):
            foundSeqs += 1
        #else:
        #    print("F", word)

    if testedSeqs>0 and foundSeqs/testedSeqs > 0.8:
        return True

    print("[column_contains_mirnas] Column", xi, "is not a valid miRNA Sequence", foundSeqs, testedSeqs)
    return False

def column_contains_highlighted_mirnas(table, indexRow, xi):

    foundSeqs = 0
    testedSeqs = 0
    for ri, row in enumerate(table.rows):
        
        if ri <= indexRow:
            continue

        for par in row.cells[xi].paragraphs:
            cellWasFilled = False
            mirnaWasFound = False

            for run in par.runs:

                specialText = bool(run.underline) or bool(run.bold) or bool(run.italic)

                if not is_empty(run.text):
                    cellWasFilled = True
                
                if specialText and test_is_sequence(run.text):
                    mirnaWasFound = True

            if cellWasFilled:
                testedSeqs += 1

            if mirnaWasFound: 
                foundSeqs += 1
        
    if  testedSeqs>0 and foundSeqs/testedSeqs > 0.8:
        return True

    print("[column_contains_highlighted_mirnas] Column", xi, "is not a valid miRNA Sequence", foundSeqs)
    return False

def getNameSeqColumns(df, table=None):

    indexRow = 0
    for i in range(0, min(10, df.shape[0])):
        idxCnt = Counter(["str" if not pd.isna(x) and not x == "nan" else "NAN" for x in df.loc[i,]])
        #print(i,idxCnt)
        if idxCnt.most_common(1)[0][0] == "str":
            indexRow = i
            break

    #print(indexRow)
    potentialColumns = list(df.loc[indexRow,])
    potentialColumns = [str(x).strip() for x in potentialColumns]
    #print(potentialColumns)
    
    mirName = None
    mirSeq = None
    mirReason = None

    mirFounds = []

    for xi, x in enumerate(potentialColumns):
        if test_substrings_included(x.upper(), ["NAME","ACCESSION", "MATURE MIRNA"]) and not test_substrings_included(x.upper(), ["TARGET GENE", "PROTEIN", "GENE"]):
            mirName = xi
            print("set mirName", mirName, mirSeq, potentialColumns[xi])

        elif test_substrings_included(x.upper(), ["MIRNA SEQUENCE","MATURE SEQUENCE", "SEQUENCE OF MATURE", "SEQUENCE", "5P MIRNA", "3P MIRNA"]) and column_contains_mirnas(df, indexRow, xi) and mirSeq is None:

            mirSeq = xi
            mirReason = "SEQ_COLUMN"
            print("set mirSeq", mirName, mirSeq, potentialColumns[xi])
            
        elif test_substrings_included(x.upper(), ["MIRNA"]) and column_contains_mirnacells(df, indexRow, xi) and mirSeq is None:
            mirSeq = xi
            mirName = xi
            mirReason = "FASTA_COLUMN"

        elif not table is None and test_substrings_included(x.upper(), ["PRECURSOR SEQUENCE", "MIRNA SEQUENCE","MATURE SEQUENCE", "SEQUENCE OF MATURE", "SEQUENCE"]) and column_contains_highlighted_mirnas(table, indexRow, xi) and mirSeq is None:
            mirSeq = xi
            mirReason = "HIGHLIGHT_COLUMN"

        if mirSeq != None and mirName != None:

            mirFounds.append((mirName, mirSeq, mirReason))

            #mirName = None
            mirSeq = None
            mirReason = None


    if mirName is None and mirSeq is None:
        print(potentialColumns)

        for xi, x in enumerate(potentialColumns):

            if test_substrings_included(x.upper(), ["MIRNA", "MIRNA SEQUENCE","MATURE SEQUENCE", "SEQUENCE OF MATURE", "SEQUENCE"]) and column_contains_mirnas(df, indexRow, xi) and mirSeq is None:

                mirSeq = xi
                mirName = xi
                mirReason = "NAME_SEQ_COLUMN"

            if mirSeq != None and mirName != None:

                mirFounds.append((mirName, mirSeq, mirReason))
                mirName = None
                mirSeq = None
                mirReason = None


    print(mirFounds)
    for mirName, mirSeq, mirReason in mirFounds:
        print("Name:", mirName, potentialColumns[mirName] if mirName != None else "")
        print("Seq: ", mirSeq, potentialColumns[mirSeq] if mirSeq != None else "")
        print("Reason: ", mirReason)

    return indexRow, mirFounds

In [6]:
test_is_sequence("AUACAUGUGUGGCGUUGAUGGA")

True

In [7]:
def extract_mirnas_from_document(infile):

    document = Document(infile)
    
    for table in document.tables:
        data = [[cell.text for cell in row.cells] for row in table.rows]
        df = pd.DataFrame(data)
        
        extract_mirnas_from_pandas_tabledf( df, table )



In [27]:
def extract_mirnas_from_pandas_tabledf( df, table=None ):
    startRow, mirFounds = getNameSeqColumns(df, table)
    
    for mirName, mirSeq, mirReason in mirFounds:

        print(mirSeq, mirName, mirReason)
        if mirName == None or mirSeq == None:
            print("Skipping")
            print(startRow, mirName, mirSeq)
            continue

        for ri, row in df.loc[startRow:,].iterrows():

            mirSeqText = str(row[mirSeq])
            mirNameText = str(row[mirName])

            if not is_empty(mirSeqText) and not is_empty(mirSeqText):

                mirSequence = mirSeqText.replace("Sequence: ", "")

                if test_is_sequence(mirSeqText):
                    print(mirNameText, mirSeqText)


In [9]:
def extract_mirnas_from_excel(infile):
    sheetsDF = pd.read_excel(infile, header=None, sheet_name=None)
    print("Sheets", [x for x in sheetsDF])

    #columns is empty!

    extract_mirnas_sheetsdf(sheetsDF)

In [10]:
def extract_mirnas_from_xml(infile):
    tree = etree.parse(infile)
    tables = tree.findall("//table")

    sheetsDF = {}
    for ti, table in enumerate(tables):
        dfs = pd.read_html(etree.tostring(table,method='html'))

        for tii, tdf in enumerate(dfs):
            tname = "Table{}.{}".format(ti, tii)

            fracStrCols = sum([1 if type(x) == str else 0 for x in tdf.columns])/len(tdf.columns)
            
            if fracStrCols > 0 or isinstance(tdf.columns, pd.MultiIndex):
                #print("Removing Columns")
                #print(type(tdf.columns))
                if isinstance(tdf.columns, pd.MultiIndex):
                    #print("Fixing Columns")

                    lvlValues = [""] * len(tdf.columns.get_level_values(0))

                    for x in range(0, len(tdf.columns.levels)):

                        levelValues = tdf.columns.get_level_values(x)

                        if len(set(levelValues)) == 1:
                            continue

                        for vi, v in enumerate(levelValues):

                            if v in lvlValues[vi]:
                                continue

                            if len(lvlValues[vi]) > 0:
                                lvlValues[vi] += " "
                            lvlValues[vi] += v

                    print(lvlValues)

                    tdf.columns = list(tdf.columns.get_level_values(len(tdf.columns.levels)-1))
                    #print(tdf.columns)
                    
                tdf = tdf.T.reset_index().T.reset_index(drop=True)

            sheetsDF[tname] = tdf

            #print(tname)
            #print(tdf)

    print("Sheets", [x for x in sheetsDF])

    extract_mirnas_sheetsdf(sheetsDF)

    return sheetsDF

In [11]:
def extract_mirnas_sheetsdf(sheetsDF):

    if len(sheetsDF) == 0:
        return

    print("Testing FASTA")
    extract_mirnas_from_pandas_fasta(sheetsDF)
    print("Testing COLUMNS")
    extract_mirnas_from_pandas_table(sheetsDF)

In [12]:
def extract_mirnas_from_pandas_table(sheetsDF):

    for sheet in sheetsDF:

        df = sheetsDF[sheet]
        print(sheet, df.shape)
        if 0 in df.shape:
            print("Skipping sheet", sheet)
            continue

        extract_mirnas_from_pandas_tabledf( df )

In [13]:
def extract_mirnas_from_pandas_fasta(sheetsDF):

    for sheet in sheetsDF:

        df = sheetsDF[sheet]

        for column in df.columns:

            bestRowSequence = None
            rowSequence = []
            
            for rowIdx, row in df.iterrows():

                fastaStart = fastaStartRE.search(str(row[column]))
                fastaSequenceMir = fastaSequenceMirRE.search(str(row[column]))

                if fastaStart:
                    #print(rowIdx, "fasta start")
                    rowSequence.append("start")
                if fastaSequenceMir:
                    #print(rowIdx, "fasta seq")
                    rowSequence.append("seq")

                if not fastaStart and not fastaSequenceMir:
                    if len(rowSequence) > 0 and (bestRowSequence == None or len(bestRowSequence) > len(rowSequence)):
                        bestRowSequence = rowSequence

                if rowIdx > 20:
                    break
            if len(rowSequence) > 0 and (bestRowSequence == None or len(bestRowSequence) > len(rowSequence)):
                bestRowSequence = rowSequence

            if not bestRowSequence is None:

                count = 0
                bestCount = 0
                for i in range(1, len(bestRowSequence)):

                    if bestRowSequence[i-1] != bestRowSequence[i]:
                        count += 1

                    else:
                        if count > bestCount:
                            bestCount = count

                if count > bestCount:
                            bestCount = count

                if bestCount > 0:
                    print(sheet, column, bestRowSequence)
                    print(sheet, column, bestCount)
                    print(df[column])

In [14]:
allFiles = []
allFiles += glob.glob("../covidtexts/*.xml")
allFiles += glob.glob("*/*.xlsx")
allFiles += glob.glob("*/*.XLSX")
allFiles += glob.glob("*/*.docx")
allFiles += glob.glob("*/*.DOCX")
allFiles = sorted(allFiles)
allFiles

['../covidtexts/PMC7278893.xml',
 '../covidtexts/PMC7381279.xml',
 '../covidtexts/PMC7382400.xml',
 '../covidtexts/PMC7395633.xml',
 '../covidtexts/PMC7717134.xml',
 '../covidtexts/PMC7773562.xml',
 '../covidtexts/PMC7834301.xml',
 '../covidtexts/PMC8078050.xml',
 '../covidtexts/PMC8193712.xml',
 '../covidtexts/PMC8257610.xml',
 '../covidtexts/PMC8294073.xml',
 '../covidtexts/PMC8307234.xml',
 '../covidtexts/PMC8358877.xml',
 '../covidtexts/PMC8719879.xml',
 '../covidtexts/PMC8733928.xml',
 '../covidtexts/PMC8811647.xml',
 'PMC7278893/PMC7278893_peerj-08-9369-s001.xlsx',
 'PMC7381279/PMC7381279_Data_Sheet_1.XLSX',
 'PMC7381279/PMC7381279_Data_Sheet_2.XLSX',
 'PMC7381279/PMC7381279_Data_Sheet_3.XLSX',
 'PMC7381279/PMC7381279_Data_Sheet_4.XLSX',
 'PMC7381279/PMC7381279_Data_Sheet_5.XLSX',
 'PMC7381279/PMC7381279_Data_Sheet_6.DOCX',
 'PMC7395633/PMC7395633_mmc2.docx',
 'PMC7773562/PMC7773562_mmc1.xlsx',
 'PMC8078050/PMC8078050_mmc1.docx',
 'PMC8078050/PMC8078050_mmc2.docx',
 'PMC8078050/P

In [15]:
sorted(glob.glob("../covidtexts/*.xml"))

['../covidtexts/PMC7278893.xml',
 '../covidtexts/PMC7381279.xml',
 '../covidtexts/PMC7382400.xml',
 '../covidtexts/PMC7395633.xml',
 '../covidtexts/PMC7717134.xml',
 '../covidtexts/PMC7773562.xml',
 '../covidtexts/PMC7834301.xml',
 '../covidtexts/PMC8078050.xml',
 '../covidtexts/PMC8193712.xml',
 '../covidtexts/PMC8257610.xml',
 '../covidtexts/PMC8294073.xml',
 '../covidtexts/PMC8307234.xml',
 '../covidtexts/PMC8358877.xml',
 '../covidtexts/PMC8719879.xml',
 '../covidtexts/PMC8733928.xml',
 '../covidtexts/PMC8811647.xml']

In [46]:
for infile in [ '../covidtexts/PMC7773562.xml']:
    print()
    print()
    print()
    print(infile)
    if infile.upper().endswith(".XLSX"):
        extract_mirnas_from_excel(infile)

    elif infile.upper().endswith(".XML"):
        sdf = extract_mirnas_from_xml(infile)

    elif infile.upper().endswith(".DOCX"):
        extract_mirnas_from_document(infile)





../covidtexts/PMC7773562.xml
Sheets ['Table0.0', 'Table1.0', 'Table2.0']
Testing FASTA
Testing COLUMNS
Table0.0 (21, 9)
set mirName 0 None Name
[column_contains_mirnas] Column 2 is not a valid miRNA Sequence 0 10
[]
Table1.0 (21, 5)
set mirName 0 None Name
set mirSeq 0 2 Sequence
set mirSeq 0 4 Sequence
[(0, 2, 'SEQ_COLUMN'), (0, 4, 'SEQ_COLUMN')]
Name: 0 Name
Seq:  2 Sequence
Reason:  SEQ_COLUMN
Name: 0 Name
Seq:  4 Sequence
Reason:  SEQ_COLUMN
2 0 SEQ_COLUMN
SARS-CoV-2-pre-miR-R6 AAGAGUAGACUAUAUAUCGUAA
SARS-CoV-2-pre-miR-R5 AGAUGAAACAUCUGUUGUCACU
SARS-CoV-2-pre-miR-R4 AUCAACAAUUUUAUUGUAGAUG
SARS-CoV-2-pre-miR-R3 CAUUUGAGUUAUAGUAGGGAUG
SARS-CoV-2-pre-miR-R2 UUCUUAAAAGAGGGUGUGUAGU
SARS-CoV-2-pre-miR-R1 CACUUUUCUCAAAGCUUUCGCU
SARS-CoV-2-pre-miR-D14 AUAGUGUUUAUAACACUUUGCU
SARS-CoV-2-pre-miR-D13 ACUGUUGCUACAUCACGAACGC
SARS-CoV-2-pre-miR-D12 UGAUCCUUCGUGGACAUCUUCG
SARS-CoV-2-pre-miR-D11 UUGGAGGUUCCGUGGCUAUAAA
SARS-CoV-2-pre-miR-D10 UUCAUAACAGAUGCGCAAACAG
SARS-CoV-2-pre-miR-D9 UAUGUACCAC

In [17]:
[i for i in range(0, len(sdf["Table1.0"].columns.levels)-1)]

AttributeError: 'RangeIndex' object has no attribute 'levels'

In [None]:
isinstance(sdf["Table1.0"].columns, pd.MultiIndex)

True

In [None]:
sdf["Table1.0"]

Unnamed: 0_level_0,MatureBayes results,MatureBayes results,MatureBayes results,MatureBayes results,MatureBayes results
Unnamed: 0_level_1,Name,5′ stem,5′ stem,3′ stem,3′ stem
Unnamed: 0_level_2,Name,Position,Sequence,Position,Sequence
0,SARS-CoV-2-pre-miR-R6,15–36,AAGAGUAGACUAUAUAUCGUAA,54–75,UUUAUAUAGCCCAUCUGCCUUG
1,SARS-CoV-2-pre-miR-R5,34–55,AGAUGAAACAUCUGUUGUCACU,44–65,UCUGUUGUCACUUACUGUACAA
2,SARS-CoV-2-pre-miR-R4,32–53,AUCAACAAUUUUAUUGUAGAUG,54–75,AAGAAGGUAACAUGUUCAACAC
3,SARS-CoV-2-pre-miR-R3,15–36,CAUUUGAGUUAUAGUAGGGAUG,58–79,AAAAGUGCAUCUUGAUCCUCAU
4,SARS-CoV-2-pre-miR-R2,24–45,UUCUUAAAAGAGGGUGUGUAGU,61–82,CCACCACAUCACCAUUUAAGUC
5,SARS-CoV-2-pre-miR-R1,23–44,CACUUUUCUCAAAGCUUUCGCU,48–69,AUUUCAGUAGUGCCACCAGCCU
6,SARS-CoV-2-pre-miR-D14,46–67,AUAGUGUUUAUAACACUUUGCU,81–102,AAAGACAGAAUGAUUGAACUUU
7,SARS-CoV-2-pre-miR-D13,32–53,ACUGUUGCUACAUCACGAACGC,72–93,GAGCUUCGCAGCGUGUAGCAGG
8,SARS-CoV-2-pre-miR-D12,35–56,UGAUCCUUCGUGGACAUCUUCG,52–73,CUUCGUAUUGCUGGACACCAUC
9,SARS-CoV-2-pre-miR-D11,15–36,UUGGAGGUUCCGUGGCUAUAAA,61–82,UGAUCUUUAUAAGCUCAUGGGA


In [None]:
sdf["Table1.0"].columns.levels[len(sdf["Table1.0"].columns.levels)-1]

Index(['Name', 'Position', 'Sequence'], dtype='object')

In [None]:
sdf["Table1.0"]

Unnamed: 0_level_0,MatureBayes results,MatureBayes results,MatureBayes results,MatureBayes results,MatureBayes results
Unnamed: 0_level_1,Name,5′ stem,5′ stem,3′ stem,3′ stem
Unnamed: 0_level_2,Name,Position,Sequence,Position,Sequence
0,SARS-CoV-2-pre-miR-R6,15–36,AAGAGUAGACUAUAUAUCGUAA,54–75,UUUAUAUAGCCCAUCUGCCUUG
1,SARS-CoV-2-pre-miR-R5,34–55,AGAUGAAACAUCUGUUGUCACU,44–65,UCUGUUGUCACUUACUGUACAA
2,SARS-CoV-2-pre-miR-R4,32–53,AUCAACAAUUUUAUUGUAGAUG,54–75,AAGAAGGUAACAUGUUCAACAC
3,SARS-CoV-2-pre-miR-R3,15–36,CAUUUGAGUUAUAGUAGGGAUG,58–79,AAAAGUGCAUCUUGAUCCUCAU
4,SARS-CoV-2-pre-miR-R2,24–45,UUCUUAAAAGAGGGUGUGUAGU,61–82,CCACCACAUCACCAUUUAAGUC
5,SARS-CoV-2-pre-miR-R1,23–44,CACUUUUCUCAAAGCUUUCGCU,48–69,AUUUCAGUAGUGCCACCAGCCU
6,SARS-CoV-2-pre-miR-D14,46–67,AUAGUGUUUAUAACACUUUGCU,81–102,AAAGACAGAAUGAUUGAACUUU
7,SARS-CoV-2-pre-miR-D13,32–53,ACUGUUGCUACAUCACGAACGC,72–93,GAGCUUCGCAGCGUGUAGCAGG
8,SARS-CoV-2-pre-miR-D12,35–56,UGAUCCUUCGUGGACAUCUUCG,52–73,CUUCGUAUUGCUGGACACCAUC
9,SARS-CoV-2-pre-miR-D11,15–36,UUGGAGGUUCCGUGGCUAUAAA,61–82,UGAUCUUUAUAAGCUCAUGGGA
