In [1]:
from pathlib import Path
import pandas as pd

In [2]:
PATH = Path.cwd()
DATA_PATH = PATH/'datasets'
with open(PATH/'gene_synonyms.txt') as f:
    genes = [line.strip().split(',') for line in f.readlines()]

In [3]:
def read(path):
    try: return pd.read_excel(path).to_string().lower()
    except ValueError: return pd.read_csv(path).to_string().lower()

In [4]:
def output_found_genes(dir_path):
    for path in (dir_path).iterdir():
        print(path.name, ":")
        string = read(path)
        for gene in genes:
            for synonym in gene:
                if synonym.lower() in string:
                    print('\t', synonym)

In [5]:
def paths_to_excels(dir_path):
    excels, df_names = [], []
    for path in (dir_path).iterdir():
        df_names.append(path.name)
        df = pd.read_excel(path)
        out = df.head(9)
        for gene in genes:
            for synonym in gene:
                out = out.append(df[df['Unnamed: 1'].eq(synonym).fillna(False)])
        excels.append(out)
    return list(zip(df_names, excels))

In [None]:
pte = paths_to_excels(DATA_PATH)

In [None]:
pte[1][1]

Unnamed: 0,Eye,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39
0,,,,,,,,,,,...,,,,,,,,,,
1,a Identification of individual mice indicates ...,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,b Expression data are log2 transformed and nor...,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,Age (mos.),1,1,1,1,1,6,6,...,6,16,16,16,16,24,24,24,24,24
6,,,Sex,F,F,F,F,F,F,F,...,M,M,M,M,M,M,M,M,M,M
7,,,Name,EFA1aa,EFA1b,EFA1c,EFA1d,EFA1e,EFA2a,EFA2b,...,EMA2e,EMA3a,EMA3c,EMA3d,EMA3e,EMA4a,EMA4b,EMA4c,EMA4d,EMA4e
8,Mouse Unigene,Gene Name,Gene Ontology,Expressionb,Expression,Expression,Expression,Expression,Expression,Expression,...,Expression,Expression,Expression,Expression,Expression,Expression,Expression,Expression,Expression,Expression
4113,Mm.275071,Jun,molecular function|DNA binding|IDA|GO:0003677|...,-1.778904,-1.414037,-1.459372,-1.301792,-1.20942,-1.24114,-0.612054,...,-1.316657,-0.620569,-0.84007,-1.297206,-1.048314,-1.419538,-1.272131,-1.129167,-0.925015,-1.29533


In [None]:
for path, df in pte:
    df.to_excel(DATA_PATH.parent/("Selected"+path))

In [None]:
selected_dfs = [pd.read_excel(x) for x in DATA_PATH.iterdir()]

In [None]:
selected_dfs[0]

Unnamed: 0,Cerebrum,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42
0,,,,,,,,,,,...,,,,,,,,,,
1,a Identification of individual mice indicates ...,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,b Expression data are log2 transformed and nor...,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8936,Mm.998,Psen1,cellular component|Golgi apparatus|IEA|GO:0005...,1.185674,1.030636,1.316592,0.66527,1.191124,1.154757,1.136131,...,1.194129,1.415672,1.043214,0.955768,1.212382,1.133628,1.195023,1.031871,0.955965,1.273213
8937,Mm.99850,E130014J05Rik,,-0.310347,-0.553494,-0.517468,-1.619751,-0.500824,-1.68138,-1.559942,...,-0.154742,-0.661455,-1.92966,-1.69862,-0.237964,-0.388204,-0.480387,-1.676975,-1.987396,-0.308769
8938,Mm.999,Mpv17,cellular component|integral to membrane|IEA|GO...,-0.094848,0.092678,-0.03468,0.158257,0.072524,0.024129,0.044589,...,0.016343,-0.045402,0.155962,0.077914,-0.070326,0.038026,-0.064552,0.034767,0.13411,0.01402
8939,Mm.99953,Cbx8,cellular component|chromatin|IEA|GO:0000785|MG...,-1.109772,-1.061965,-0.975764,-0.557904,-1.020482,-0.875162,-0.733763,...,-0.679564,-0.890819,-0.687,-0.80389,-1.076681,-1.168147,-1.028043,-0.67194,-0.649782,-1.156515
