diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..e02b20b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,5 @@
+.vscode/
+examples/
+.github/
+.venv/
+.git/
\ No newline at end of file
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..77c64c7
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,57 @@
+name: Python tests
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Install Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.9
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install poetry
+          poetry config virtualenvs.create false
+          poetry install --no-dev
+      # Before running the tests we have to download the tags!
+      - name: Run tests
+        run: |
+          python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml
+          cd genestorian_module/test
+          python -m unittest
+
+  # Update docker image when committing to master branch if tests pass
+  # push_to_registry:
+  #   name: Push Docker image to Docker Hub
+  #   runs-on: ubuntu-latest
+  #   needs: test
+  #   if: github.ref == 'refs/heads/master'
+  #   steps:
+  #     - name: Check out the repo
+  #       uses: actions/checkout@v3
+
+  #     - name: Log in to Docker Hub
+  #       uses: docker/login-action@v2
+  #       with:
+  #         username: ${{ secrets.DOCKER_USERNAME }}
+  #         password: ${{ secrets.DOCKER_PASSWORD }}
+
+  #     - name: Extract metadata (tags, labels) for Docker
+  #       id: meta
+  #       uses: docker/metadata-action@v2
+  #       with:
+  #         images: genestorian_refinement_pipeline
+
+  #     - name: Build and push Docker images
+  #       uses: docker/build-push-action@v3.1.1
+
+  #       with:
+  #         context: .
+  #         push: true
+  #         tags: manulera/genestorian_refinement_pipeline:latest
+  #         labels: ${{ steps.meta.outputs.labels }}
diff --git a/.gitignore b/.gitignore
index f794b2e..0d31dbf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,10 @@
 /Lab_strains/**/*.json
 /allele_components/tags_fpbase.toml
+/grammar/*.txt
+
+/genestorian_module/test/alleles_pattern*
+/genestorian_module/test/common_pattern.json
+/genestorian_module/test/common_pattern_count.txt
+/genestorian_module/test/most_common_other_tag.txt
+/genestorian_module/test/nltk_trees_dataset/nltk_trees.json
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..13f643b
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.9
+
+WORKDIR /pipeline
+
+RUN pip install poetry
+
+COPY ./ /pipeline/
+
+RUN poetry config virtualenvs.create false
+RUN poetry install --without dev
diff --git a/Lab_strains/format_all.sh b/Lab_strains/format_all.sh
new file mode 100644
index 0000000..31282bb
--- /dev/null
+++ b/Lab_strains/format_all.sh
@@ -0,0 +1,9 @@
+for lab in *_lab
+do
+  cd "$lab"
+  if test -f "format.py"; then
+    echo "running in $lab"
+    python format.py
+  fi
+  cd ..
+done
\ No newline at end of file
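For labs working on a system without a POSIX shell, the same loop can be run from Python. This is a minimal sketch under the same assumption as the script above (each `*_lab` folder optionally contains a `format.py`); it is illustrative, not part of the pipeline:

```python
# Run each lab's format.py from inside its own directory, like format_all.sh
import subprocess
from pathlib import Path

for lab in sorted(Path('.').glob('*_lab')):
    if (lab / 'format.py').is_file():
        print(f'running in {lab.name}')
        subprocess.run(['python', 'format.py'], cwd=lab, check=True)
```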
diff --git a/example.py b/example.py
new file mode 100644
index 0000000..5e8be3d
--- /dev/null
+++ b/example.py
@@ -0,0 +1,41 @@
+# %%
+
+from nltk.chunk.regexp import RegexpChunkRule, ChunkString
+import re
+from nltk.tree import Tree
+from nltk.chunk import RegexpParser
+# %%
+grammar = """
+    GENE_DELETION|BLAH: {???}
+"""
+
+custom_tag_parser = RegexpParser(grammar, root_label='ROOT')
+
+input = Tree('ROOT', [
+    Tree('GENE', ['mph1']),
+    Tree('SPACER', ['::']),
+    Tree('other', ['hello']),
+    Tree('SPACER', ['::']),
+    Tree('MARKER', ['kanr'])
+])
+result: Tree = custom_tag_parser.parse_all(input)
+# custom_tag_parser
+
+# %%
+
+# match = re.match('(aa)aa', 'aaaa')
+# match.group()
+# %%
+cs = ChunkString(input)
+
+rule = RegexpChunkRule.fromstring(
+    '{???}')
+
+print(rule._regexp)
+
+match = re.match(rule._regexp, cs._str)
+print(rule._regexp)
+print(match.groups())
+# cs.xform(rule._regexp, '{\g}')
+rule._regexp.flags
+# print(cs._str)
diff --git a/genestorian_module/genestorian_module.egg-info/PKG-INFO b/genestorian_module/genestorian_module.egg-info/PKG-INFO
index f7202e3..53c35a6 100644
--- a/genestorian_module/genestorian_module.egg-info/PKG-INFO
+++ b/genestorian_module/genestorian_module.egg-info/PKG-INFO
@@ -1,9 +1,3 @@
 Metadata-Version: 2.1
 Name: genestorian-module
 Version: 0.0.0
-Summary: UNKNOWN
-License: UNKNOWN
-Platform: UNKNOWN
-
-UNKNOWN
-
diff --git a/genestorian_module/genestorian_module.egg-info/SOURCES.txt b/genestorian_module/genestorian_module.egg-info/SOURCES.txt
index 6dc9165..552fef9 100644
--- a/genestorian_module/genestorian_module.egg-info/SOURCES.txt
+++ b/genestorian_module/genestorian_module.egg-info/SOURCES.txt
@@ -1,10 +1,15 @@
 setup.py
 genestorian_module/__init__.py
-genestorian_module/converge.py
-genestorian_module/fourth_version_pipeline.py
+genestorian_module/build_nltk_tags.py
+genestorian_module/build_nltk_trees.py
 genestorian_module/replace_feature.py
+genestorian_module/summary_nltk_tags.py
 genestorian_module/third_version_pipeline.py
 genestorian_module.egg-info/PKG-INFO
 genestorian_module.egg-info/SOURCES.txt
 genestorian_module.egg-info/dependency_links.txt
-genestorian_module.egg-info/top_level.txt
\ No newline at end of file
+genestorian_module.egg-info/top_level.txt
+test/test_build_grammar.py
+test/test_build_nltk_tags.py
+test/test_nltk_trees.py
+test/test_summary_nltk_tags.py
\ No newline at end of file
diff --git a/genestorian_module/genestorian_module/__init__.py b/genestorian_module/genestorian_module/__init__.py
index 6bef8ca..2e49f25 100644
--- a/genestorian_module/genestorian_module/__init__.py
+++ b/genestorian_module/genestorian_module/__init__.py
@@ -2,6 +2,15 @@
 
 
 def excel_to_tsv(excel_file, read_cols, tsv_file):
+    '''Extracts the genotype and strain id from an excel file into a tsv file
+
+    Parameters:
+        excel_file(path): path to the excel file
+        read_cols(list): list of column names to be read
+        tsv_file(path): path to the output tsv file
+
+    Returns:
+        None'''
     #read_cols = ['strain_id/Sample Name', 'genotype']
     read_file = pd.read_excel(excel_file, usecols=read_cols, na_filter=False)
     read_file = read_file.rename(
@@ -23,6 +32,15 @@
 
 
 def read_strains_tsv(tsv_file):
+    '''
+    Reads the genotype and strain_id columns from a strains.tsv file
+
+    Parameters:
+        tsv_file(path): path to strains.tsv
+
+    Returns:
+        data(pandas dataframe): dataframe with the columns strain_id and genotype
+    '''
     data = pd.read_csv(tsv_file, sep='\t',
                        na_filter=False)
     data['genotype'] = data['genotype'].astype(str)
     data['strain_id'] = data['strain_id'].astype(str)
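A minimal usage sketch for the two helpers documented above. The excel file name is hypothetical, and the column names differ between labs (the commented-out line in `excel_to_tsv` shows one real-world example):

```python
from genestorian_module import excel_to_tsv, read_strains_tsv

# Extract the two columns of interest from a lab's excel file into a tsv
excel_to_tsv('strains.xlsx', ['strain_id/Sample Name', 'genotype'],
             'strains.tsv')

# Load the tsv back as a dataframe with string-typed columns
data = read_strains_tsv('strains.tsv')
print(data.head())
```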
diff --git a/genestorian_module/genestorian_module/build_nltk_tags.py b/genestorian_module/genestorian_module/build_nltk_tags.py
index bde7d16..06634e4 100644
--- a/genestorian_module/genestorian_module/build_nltk_tags.py
+++ b/genestorian_module/genestorian_module/build_nltk_tags.py
@@ -1,12 +1,21 @@
-from genestorian_module.replace_feature import build_feature_dict
-from genestorian_module.third_version_pipeline import build_strain_list
+from genestorian_module.replace_feature import (build_feature_dict,
+                                                build_strain_list)
 import re
 import json
 import sys
 import os
+ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__)))
 
 
 def build_separators_dict():
+    '''
+    Builds a dictionary whose keys are the separators read from the text file
+
+    Parameters:
+        None
+
+    Returns:
+        separators_dict(dict): dict of separators
+    '''
     separators_dict = {}
     with open("../../allele_components/separators.txt", "r") as fp:
         for x in fp:
@@ -16,6 +25,15 @@
 
 
 def add_other_tag(pattern_list):
+    '''
+    Tags the remaining unidentified elements of the allele as 'other'
+
+    Parameters:
+        pattern_list(list): list of tokenized and untokenized allele components
+
+    Returns:
+        pattern_list(list): list of tokenized allele components
+    '''
     for feature in pattern_list:
         if type(feature) != list:
             idx = pattern_list.index(feature)
@@ -23,17 +41,27 @@
     return pattern_list
 
 
-def replace_allele_features(feature_dict, pattern_list, feature_name, matches):
+def tokenize_allele_features(feature_dict, pattern_list, feature_name, matches):
+    '''Tokenizes the components of an allele according to the matches found in the feature_dict
+
+    Parameters:
+        feature_dict(dict): dictionary of features to be matched
+        pattern_list(list): list of features of an allele (tokenized and untokenized)
+        feature_name(str): name of the feature or token
+        matches(list): list of matches of an allele found in the feature_dict
+
+    Returns:
+        out_list(list): list of patterns (tokenized and untokenized)
+    '''
     out_list = list()
     for i in range(len(pattern_list)):
         if type(pattern_list[i]) != str:
             out_list.append(pattern_list[i])
             continue
-        if len(matches) == 0:
-            for feature in feature_dict.keys():
-                if feature.lower() in pattern_list[i]:
-                    matches.append(feature.lower())
-            matches.sort(key=len, reverse=True)
+        for feature in feature_dict.keys():
+            if feature.lower() in pattern_list[i]:
+                matches.append(feature.lower())
+        matches.sort(key=len, reverse=True)
         allele_substring = pattern_list[i]
         this_list = [allele_substring]
         for match in matches:
@@ -44,7 +72,7 @@
                 feature_name, [allele_substring[start:end]]], allele_substring[end:]]
             # Remove empty strings
             this_list = list(filter(lambda x: x != '', this_list))
-            this_list = replace_allele_features(
+            this_list = tokenize_allele_features(
                 feature_dict, this_list, feature_name, matches)
             break
         out_list += this_list
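To make the tokenization step concrete: for the allele `mph1δ::kanr` from the test dataset, the tagging stage produces the pattern below (taken verbatim from `alleles_pattern_nltk.json`; it assumes toml files that list `mph1` as a gene and `kanr` as a marker):

```python
# Each component becomes a [tag, [text]] pair: recognised features keep their
# feature name, separators get the '-' tag, unmatched text is tagged 'other'.
pattern = [
    ['GENE',   ['mph1']],
    ['other',  ['δ']],
    ['-',      ['::']],
    ['MARKER', ['kanr']],
]
```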
@@ -53,6 +81,15 @@
 
 
 def build_nltk_tag(allele_names, toml_files):
+    '''
+    Builds a dict of allele names and a list of tokens of the allele features
+
+    Parameters:
+        allele_names(list): list of alleles
+        toml_files(list): list of toml files in the allele components directory
+
+    Returns:
+        output_list(list): list of dictionaries of allele names and patterns
+    '''
     output_list = []
     for allele_name in allele_names:
         output_list.append({
@@ -60,15 +97,16 @@
             'pattern': [allele_name],
         })
     for toml_file in toml_files:
+        print('finding features using', toml_file.split('/')[-1])
         feature_dict, feature_name = build_feature_dict(toml_file)
         for allele_dict in output_list:
-            allele_dict['pattern'] = replace_allele_features(
+            allele_dict['pattern'] = tokenize_allele_features(
                 feature_dict, allele_dict['pattern'], feature_name, [])
 
     separators_dict = build_separators_dict()
     for allele_dict in output_list:
         # replace separators
-        allele_dict['pattern'] = replace_allele_features(
+        allele_dict['pattern'] = tokenize_allele_features(
             separators_dict, allele_dict['pattern'], '-', [])
         # add other tags to untagged elements:
         allele_dict['pattern'] = add_other_tag(allele_dict['pattern'])
@@ -76,6 +114,15 @@
 
 
 def prettier_json(input_dict):
+    '''
+    Formats the json output to make it more readable
+
+    Parameters:
+        input_dict(dict): dictionary of alleles
+
+    Returns:
+        output_str(str): formatted json string of input_dict
+    '''
     output_str = json.dumps(input_dict, indent=3, ensure_ascii=False)
     match = re.search(r'\[(?=\n)(\n|(?![{}]).)+\]', output_str)
diff --git a/genestorian_module/genestorian_module/build_nltk_trees.py b/genestorian_module/genestorian_module/build_nltk_trees.py
new file mode 100644
index 0000000..32bb8d0
--- /dev/null
+++ b/genestorian_module/genestorian_module/build_nltk_trees.py
@@ -0,0 +1,149 @@
+
+
+from copy import deepcopy
+from nltk.tree import ParentedTree
+import json
+import sys
+import re
+
+from nltk.chunk import RegexpParser
+
+
+def post_process_pseudo_grammar(pseudo_grammar: dict):
+    out_grammar = deepcopy(pseudo_grammar)
+    for rule in out_grammar:
+        rule['parser'] = RegexpParser(
+            f'{rule["feature_name"]}: ' + '{' + rule['pattern'] + '}',
+            root_label='ROOT')
+    return out_grammar
+
+
+def build_allele_name2tree_dict(in_file):
+    '''Builds the nltk input format to feed into the parser
+
+    Parameters:
+        in_file: alleles_pattern_nltk.json
+
+    Returns:
+        dict mapping allele names to their trees in nltk input format
+    '''
+    with open(in_file) as fp:
+        allele_list = json.load(fp)
+    allele_name2tree_dict = {}
+    for allele in allele_list:
+        tree_list = []
+        allele_name = allele['name']
+        patterns = allele['pattern']
+        for pattern in patterns:
+            tree_list.append(ParentedTree(pattern[0], pattern[1]))
+        # We create a root element called ROOT
+        allele_name2tree_dict[allele_name] = ParentedTree('ROOT', tree_list)
+    return allele_name2tree_dict
+
+
+def replace_parentedtree_node(tree: ParentedTree, target_node: ParentedTree, insert_nodes, inplace=True):
+    '''
+    Replace a given node of a ParentedTree by the nodes passed in insert_nodes,
+    in place if inplace is True, otherwise in a COPY of the tree
+    '''
+    insertion_index = target_node.parent_index()
+    if not inplace:
+        tree = tree.copy(deep=True)
+    tree.remove(target_node)
+    for ele in insert_nodes[::-1]:
+        tree.insert(insertion_index, ele)
+    return tree
+
+
+def check_regex_match(matched_subtree: ParentedTree, other_regex_patterns):
+    '''
+    Tests the regex in rule['other_regex']; three possible outcomes:
+     - 'no match': regex not satisfied, so keep going
+     - 'match': regex is matched, and matches the entire tag
+     - 'split parent': regex is found, but does not match the entire
+       tag, so the tag has to be split; the second return value is a
+       list of trees after the split, that should be used to replace the
+       matched_subtree
+    '''
+    # Get the trees from the 'other' tags.
+    other_trees = [sstree for sstree in matched_subtree
+                   if sstree.label() == 'other']
+
+    for regex, other_tree in zip(other_regex_patterns, other_trees):
+        other_tree: ParentedTree
+        match: re.Match[str] = re.search(regex, other_tree[0])
+        if not match:
+            return 'no match', []
+        if match.group() != other_tree[0]:
+            # Partition is a string method like split, but it preserves the delimiter; the `if substr` removes
+            # the empty strings: e.g., 'helloworld'.partition('hello') -> ['', 'hello', 'world']
+            trees4replacement = [ParentedTree(
+                'other', [substr]) for substr in other_tree[0].partition(match.group()) if substr]
+            # return a list of trees to replace matched_subtree in the parent,
+            # and restart the rule application from the current rule again
+            output_tree = replace_parentedtree_node(
+                matched_subtree, other_tree, trees4replacement)
+            return 'split parent', [t.copy(deep=True) for t in output_tree]
+
+    # No errors in matching
+    return 'match', []
+
+
+def apply_pseudo_grammar(allele_tree: ParentedTree, pseudo_grammar):
+    output_tree = allele_tree.copy(deep=True)
+
+    for rule_i, rule in enumerate(pseudo_grammar):
+        parser: RegexpParser = rule['parser']
+        updated_tree: ParentedTree = ParentedTree.convert(
+            parser.parse(output_tree))
+        # Match is found if the result is different
+        if updated_tree != output_tree:
+            other_regex_patterns = rule['other_regex']
+            if len(other_regex_patterns) == 0:
+                # No rules for <other> tags, we have a match
+                output_tree = updated_tree
+                continue
+
+            # we have to make sure that regex patterns are matched,
+            # and split if needed
+            for matched_subtree in updated_tree.subtrees(filter=lambda x: x.label() == rule['feature_name']):
+                outcome, matched_subtree_replacement = check_regex_match(
+                    matched_subtree, other_regex_patterns)
+                if outcome == 'no match':
+                    replace_parentedtree_node(updated_tree, matched_subtree, [
+                        t.copy(deep=True) for t in matched_subtree], True)
+                elif outcome == 'split parent':
+                    output_tree = replace_parentedtree_node(
+                        updated_tree, matched_subtree, matched_subtree_replacement)
+                    # We apply the same rule again (there might be further splits to do, or the same thing twice)
+                    return apply_pseudo_grammar(output_tree, pseudo_grammar[rule_i:])
+                else:
+                    output_tree = updated_tree
+
+    return output_tree
+
+
+def main(input_file, pseudo_grammar_file, output_file):
+
+    with open(pseudo_grammar_file) as f:
+        pseudo_grammar = json.load(f)
+    pseudo_grammar = post_process_pseudo_grammar(pseudo_grammar)
+    allele_name2tree_dict = build_allele_name2tree_dict(input_file)
+
+    trees_dict = {}
+    for allele in allele_name2tree_dict:
+
+        tree = apply_pseudo_grammar(
+            allele_name2tree_dict[allele], pseudo_grammar)
+        flat_tree = tree._pformat_flat("", "()", False)
+        trees_dict[allele] = flat_tree
+
+    with open(output_file, 'w', encoding="utf-8") as fp:
+        json.dump(trees_dict, fp, indent=3, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    input_file = sys.argv[1]
+    grammar_file = sys.argv[2]
+    output_file = sys.argv[3]
+    main(input_file, grammar_file, output_file)
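A minimal usage sketch of the entry points above, reproducing one case from the test suite. The grammar path assumes the repository layout (`grammar/pseudo_grammar.json`), and the expected output is the one asserted in `test_nltk_trees.py`:

```python
import json
from nltk.tree import ParentedTree
from genestorian_module.build_nltk_trees import (
    post_process_pseudo_grammar, apply_pseudo_grammar)

with open('grammar/pseudo_grammar.json') as f:
    pseudo_grammar = post_process_pseudo_grammar(json.load(f))

# Tagged tree for mph1δ::kanr, as produced by build_nltk_tags
allele_tree = ParentedTree('ROOT', [
    ParentedTree('GENE', ['mph1']),
    ParentedTree('other', ['δ']),
    ParentedTree('-', ['::']),
    ParentedTree('MARKER', ['kanr']),
])

tree = apply_pseudo_grammar(allele_tree, pseudo_grammar)
print(tree._pformat_flat("", "()", False))
# (ROOT (GENE_DELETION (GENE mph1) (other δ) (- ::) (MARKER kanr)))
```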
diff --git a/genestorian_module/genestorian_module/replace_feature.py b/genestorian_module/genestorian_module/replace_feature.py
index a7a6fb0..4ca1de1 100644
--- a/genestorian_module/genestorian_module/replace_feature.py
+++ b/genestorian_module/genestorian_module/replace_feature.py
@@ -1,7 +1,19 @@
 import toml
+from genestorian_module import read_strains_tsv
+import re
 
 
 def build_feature_dict(toml_file):
+    '''
+    Builds a dictionary from the input toml file
+
+    Parameters:
+        toml_file(toml): toml file of an allele feature (gene, allele, marker etc.)
+
+    Returns:
+        synonyms_2toml_key_dict(dict): dict whose keys are the names and synonyms of the features and whose values are the names
+        feature_type_name(str): name of the feature type (e.g. gene, allele, marker)
+    '''
     # dictionary in which the keys are name,synonyms,toml_keys and values are toml_keys
     synonyms_2toml_key_dict = {}
     feature_type_dict = toml.load(toml_file)
@@ -20,17 +32,35 @@
     return synonyms_2toml_key_dict, feature_type_name.upper()
 
 
-def replace_allele_features(toml_file, genotypes, replace_word):
-    features = build_feature_dict(toml_file)[0]
-    genotype_features_replaced = []
-    matches = []
-    for genotype in genotypes:
-        for feature in features.keys():
-            if feature.lower() in genotype.lower():
-                matches.append(feature.lower())
-        matches.sort(key=len, reverse=True)
-        for match in matches:
-            genotype = genotype.replace(match, replace_word)
-        genotype_features_replaced.append(genotype)
-
-    return genotype_features_replaced
+def build_strain_list(strain_tsv_file):
+    '''
+    Builds a dict for each strain, with the keys strain_id, genotype, mating_type
+    and alleles, and returns the dicts in a list
+
+    Parameters:
+        strain_tsv_file(tsv file): strains.tsv, which contains strain_id and genotype
+
+    Returns:
+        strain_list(list): list of dictionaries
+    '''
+    data = read_strains_tsv(strain_tsv_file)
+    strain_list = list()
+
+    # Iterate over rows
+    for row_index, strain in data.iterrows():
+        alleles = list()
+        mating_type = 'h?'  # use this as empty value
+        for allele in re.split(r"\s+", strain['genotype']):
+            # Sometimes people use h? to indicate that the mating type is unknown
+            if allele in ['h90', 'h-', 'h+', 'h?']:
+                mating_type = allele
+            else:
+                alleles.append(allele)
+
+        strain_list.append({
+            'id': strain['strain_id'],
+            'genotype': strain['genotype'],
+            'mating_type': mating_type,
+            'alleles': alleles
+        })
+    return strain_list
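For reference, this is what `build_strain_list` returns for the first row of the test file added further below (`test_strains.tsv`): `h90` is recognised as the mating type and the rest of the genotype becomes the allele list.

```python
from genestorian_module.replace_feature import build_strain_list

strains = build_strain_list(
    'genestorian_module/test/nltk_trees_dataset/test_strains.tsv')
print(strains[0])
# {'id': '1', 'genotype': 'h90 dummyase1-sad11other',
#  'mating_type': 'h90', 'alleles': ['dummyase1-sad11other']}
```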
diff --git a/genestorian_module/genestorian_module/summary_nltk_tags.py b/genestorian_module/genestorian_module/summary_nltk_tags.py
index 1255d92..db56557 100644
--- a/genestorian_module/genestorian_module/summary_nltk_tags.py
+++ b/genestorian_module/genestorian_module/summary_nltk_tags.py
@@ -5,6 +5,15 @@
 
 
 def build_common_pattern_dict(input_file):
+    '''
+    Builds a dictionary of the common patterns followed by the alleles
+
+    Parameters:
+        input_file(json): json file which has the allele list
+
+    Returns:
+        pattern_dict(dict): dictionary {pattern: [alleles following the pattern]}
+    '''
     with open(input_file) as f:
         alleles_list = json.load(f)
     pattern_dict = {}
@@ -25,6 +34,15 @@
 
 
 def json_common_pattern_dict(input_file):
+    '''
+    Saves the pattern_dict to a json file in the same directory as the input_file
+
+    Parameters:
+        input_file(json): json file which has the allele list
+
+    Returns:
+        None
+    '''
     common_pattern_dict = build_common_pattern_dict(input_file)
     output_file = os.path.join(os.path.dirname(
         input_file), 'common_pattern.json')
@@ -33,6 +51,16 @@
 
 
 def count_common_patterns(input_file):
+    '''
+    Counts the number of alleles following the same pattern and writes the counts to
+    a txt file in the same directory as the input_file.
+
+    Parameters:
+        input_file(json): json file which has the allele list
+
+    Returns:
+        None
+    '''
     occ_dict = build_common_pattern_dict(input_file)
     output_list = list()
     for key in occ_dict:
@@ -47,7 +75,14 @@
 
 
 def count_most_common_other_tag(input_file):
-    # A simpler version not building the pattern and then unbuilding it
+    '''Counts the frequency of the unidentified elements in the genotypes and writes
+    the counts to a txt file in the same directory as the input_file.
+
+    Parameters:
+        input_file(json): json file which has the allele list
+
+    Returns:
+        None'''
     with open(input_file) as f:
         alleles_list = json.load(f)
     all_other_tag_list = list()
diff --git a/genestorian_module/test/nltk_trees_dataset/alleles_pattern_nltk.json b/genestorian_module/test/nltk_trees_dataset/alleles_pattern_nltk.json
new file mode 100644
index 0000000..11a236f
--- /dev/null
+++ b/genestorian_module/test/nltk_trees_dataset/alleles_pattern_nltk.json
@@ -0,0 +1,26 @@
+[
+    {
+        "name": "dummyase1-sad11other",
+        "pattern": [ [ "other", [ "dummy" ] ], [ "GENE", [ "ase1" ] ], [ "-", [ "-" ] ], [ "GENE", [ "sad1" ] ], [ "other", [ "1other" ] ] ]
+    },
+    {
+        "name": "ase1i130a,a143p",
+        "pattern": [ [ "GENE", [ "ase1" ] ], [ "other", [ "i130a,a143p" ] ] ]
+    },
+    {
+        "name": "a-1pase1-ase1",
+        "pattern": [ [ "other", [ "a" ] ], [ "-", [ "-" ] ], [ "other", [ "1p" ] ], [ "GENE", [ "ase1" ] ], [ "-", [ "-" ] ], [ "GENE", [ "ase1" ] ] ]
+    },
+    {
+        "name": "mph1δ::kanr",
+        "pattern": [ [ "GENE", [ "mph1" ] ], [ "other", [ "δ" ] ], [ "-", [ "::" ] ], [ "MARKER", [ "kanr" ] ] ]
+    },
+    {
+        "name": "ase1(i130a,a143pap)",
+        "pattern": [ [ "GENE", [ "ase1" ] ], [ "other", [ "(i130a,a143pap)" ] ] ]
+    },
+    {
+        "name": "pkj41x",
+        "pattern": [ [ "other", [ "pkj41x" ] ] ]
+    }
+]
\ No newline at end of file
diff --git a/genestorian_module/test/nltk_trees_dataset/pseudo_grammar.json b/genestorian_module/test/nltk_trees_dataset/pseudo_grammar.json
new file mode 100644
index 0000000..f999d59
--- /dev/null
+++ b/genestorian_module/test/nltk_trees_dataset/pseudo_grammar.json
@@ -0,0 +1,35 @@
+[
+  {
+    "feature_name": "GENE_DELETION",
+    "pattern": "<GENE><->?<other><->?<MARKER>",
+    "other_regex": ["^(delta|δ|del|d)$"]
+  },
+  {
+    "feature_name": "GENE_DELETION",
+    "pattern": "<GENE><->?<MARKER>",
+    "other_regex": []
+  },
+  {
+    "feature_name": "dummy_matching_PROMOTER_GENE",
+    "pattern": "<other><GENE>",
+    "other_regex": ["^(dummy)$"]
+  },
+  {
+    "feature_name": "PROMOTER_GENE",
+    "pattern": "<other><GENE>",
+    "other_regex": ["(?<![a-z])p$"]
+  },
+  {
+    "feature_name": "ALLELE_AA_SUBSTITUTION",
+    "pattern": "<GENE><other>+",
+    "other_regex": [
+      "\\(?([gpavlimcfywhkrqnedst]\\d+[gpavlimcfywhkrqnedst]\\,?)+\\,?\\)?"
+    ]
+  },
+  {
+    "feature_name": "C_Terminal_Tagging",
+    "pattern": "<GENE><->?<TAG><->?<MARKER>",
+    "other_regex": []
+  }
+]
+ 
\ No newline at end of file
diff --git a/genestorian_module/test/nltk_trees_dataset/test_strains.tsv b/genestorian_module/test/nltk_trees_dataset/test_strains.tsv
new file mode 100644
index 0000000..c7de430
--- /dev/null
+++ b/genestorian_module/test/nltk_trees_dataset/test_strains.tsv
@@ -0,0 +1,7 @@
+strain_id	genotype
+1	h90 dummyase1-sad11other
+2	h90 mph1δ::kanr
+3	pkj41x
+4	a-1pase1-ase1
+5	ase1i130a,a143p
+6	ase1(i130a,a143pap)
diff --git a/genestorian_module/test/test_build_grammar.py b/genestorian_module/test/test_build_grammar.py
new file mode 100644
index 0000000..acd4f20
--- /dev/null
+++ b/genestorian_module/test/test_build_grammar.py
@@ -0,0 +1,8 @@
+from os.path import isfile
+import unittest
+
+
+class TestBuildGrammar(unittest.TestCase):
+    def test_input_file_is_present(self):
+        self.assertTrue(isfile('./nltk_trees_dataset/pseudo_grammar.json'),
+                        'File pseudo_grammar.json not found')
diff --git a/genestorian_module/test/test_build_nltk_tags.py b/genestorian_module/test/test_build_nltk_tags.py
index e9654b0..25d97a2 100644
--- a/genestorian_module/test/test_build_nltk_tags.py
+++ b/genestorian_module/test/test_build_nltk_tags.py
@@ -5,17 +5,10 @@
 
 
 class TestBuildNltkTags(unittest.TestCase):
-    def test_all_input_files_are_present(self):
-        parent_dir = '../../Lab_strains'
-        path_list_strain_file = []
-        # path_list_allele_json_file = []
-        for dir in os.listdir(parent_dir):
-            path = os.path.join(parent_dir, dir)
-            files_strain = os.path.join(path + '/strains.tsv')
-            path_list_strain_file.append(files_strain)
-        for strain_file_path in path_list_strain_file:
-            self.assertTrue(os.path.isfile(strain_file_path),
-                            'The file strains.tsv not found')
+    def test_input_file_is_present(self):
+
+        self.assertTrue(os.path.isfile('./test_strains.tsv'),
+                        'The file test_strains.tsv not found')
 
     def test_build_nltk_tag(self):
         try:
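The `test_main` test below generates `nltk_trees.json`, in which each tree is serialized as a flat string (via `_pformat_flat`). Once that file exists, the trees can be loaded back with nltk for inspection; a small sketch, run from the `genestorian_module/test` directory:

```python
import json
from nltk.tree import Tree

with open('./nltk_trees_dataset/nltk_trees.json') as f:
    trees = json.load(f)

# Parse the flat string back into a Tree and display it
Tree.fromstring(trees['mph1δ::kanr']).pretty_print()
```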
diff --git a/genestorian_module/test/test_nltk_trees.py b/genestorian_module/test/test_nltk_trees.py
new file mode 100644
index 0000000..528d5f9
--- /dev/null
+++ b/genestorian_module/test/test_nltk_trees.py
@@ -0,0 +1,48 @@
+import unittest
+import os
+import json
+
+
+class TestBuildNltkTrees(unittest.TestCase):
+    def test_that_test_files_are_there(self):
+        self.assertTrue(os.path.isfile('./nltk_trees_dataset/alleles_pattern_nltk.json'),
+                        'The file alleles_pattern_nltk.json could not be found')
+
+    def test_build_tag_from_pattern(self):
+        try:
+            from genestorian_module.build_nltk_trees import build_allele_name2tree_dict
+        except ImportError:
+            raise Exception(
+                'build_allele_name2tree_dict not imported from build_nltk_trees')
+
+        output = build_allele_name2tree_dict(
+            './nltk_trees_dataset/alleles_pattern_nltk.json')
+
+        self.assertEqual(type(output), dict,
+                         'build_allele_name2tree_dict should return a dict')
+
+    def test_main(self):
+        try:
+            from genestorian_module.build_nltk_trees import main
+        except ImportError:
+            raise Exception(
+                'main not imported from build_nltk_trees')
+
+        main('./nltk_trees_dataset/alleles_pattern_nltk.json',
+             './nltk_trees_dataset/pseudo_grammar.json',
+             './nltk_trees_dataset/nltk_trees.json')
+        self.assertTrue(os.path.isfile('./nltk_trees_dataset/nltk_trees.json'),
+                        'nltk_trees.json not found')
+
+        expected_output = {
+            "ase1i130a,a143p": "(ROOT (ALLELE_AA_SUBSTITUTION (GENE ase1) (other i130a,a143p)))",
+            "pkj41x": "(ROOT (other pkj41x))",
+            "dummyase1-sad11other": "(ROOT (dummy_matching_PROMOTER_GENE (other dummy) (GENE ase1)) (- -) (GENE sad1) (other 1other))",
+            "mph1δ::kanr": "(ROOT (GENE_DELETION (GENE mph1) (other δ) (- ::) (MARKER kanr)))",
+            "ase1(i130a,a143pap)": "(ROOT (ALLELE_AA_SUBSTITUTION (GENE ase1) (other (i130a,a143p)) (other ap)))",
+            "a-1pase1-ase1": "(ROOT (other a) (- -) (other 1) (PROMOTER_GENE (other p) (GENE ase1)) (- -) (GENE ase1))"
+        }
+        with open('./nltk_trees_dataset/nltk_trees.json') as f:
+            test_output = json.load(f)
+
+        expected_output_sorted = sorted(expected_output.items())
+        test_output_sorted = sorted(test_output.items())
+        self.assertEqual(expected_output_sorted, test_output_sorted,
+                         'tree created by nltk chunker is not as expected')
diff --git a/get_data/get_fpbase_data.py b/get_data/get_fpbase_data.py
index fa24f4a..a125174 100644
--- a/get_data/get_fpbase_data.py
+++ b/get_data/get_fpbase_data.py
@@ -1,61 +1,65 @@
-# %%
+
 import toml
 import requests
 import json
+import sys
 
-query = """{
-    allProteins(name_Icontains:""){
-        edges{
-        node{
-            name,
-            aliases,
-            primaryReference{
-            doi
+
+def main(output_file):
+    query = """{
+        allProteins(name_Icontains:""){
+            edges{
+            node{
+                name,
+                aliases,
+                primaryReference{
+                doi
+                }
             }
-        }
-        }
-    }
-}"""
-url = 'https://www.fpbase.org/graphql/'
-r = requests.post(url, json={'query': query})
-json_data = json.loads(r.text)
-
-
-node_list = json_data['data']['allProteins']['edges']
-tags_list = []
-for node in node_list:
-    tags_list.append(node['node'])
-for tag in tags_list:
-    if tag['primaryReference'] is not None:
-        tag["reference"] = tag['primaryReference']["doi"]
-        tag = tag.pop('primaryReference')
-    else:
-        tag['reference'] = ''
-        tag = tag.pop('primaryReference')
-
-toml_dict = {'tag': dict()}
-
-for tag in tags_list:
-    if tag['aliases'] is not None:
-        toml_dict['tag'][tag['name']] = {
-            'name': tag['name'],
-            'reference': tag['reference'],
-            'synonyms': tag['aliases']
+            }
         }
-        if len(toml_dict['tag'][tag['name']]['synonyms']) == 0:
-            toml_dict['tag'][tag['name']].pop('synonyms')
+    }"""
+    url = 'https://www.fpbase.org/graphql/'
+    r = requests.post(url, json={'query': query})
+    json_data = json.loads(r.text)
 
-    else:
-        toml_dict['tag'][tag['name']] = {
-            'name': tag['name'],
-            'reference': tag['reference']
-        }
+    node_list = json_data['data']['allProteins']['edges']
+    tags_list = []
+    for node in node_list:
+        tags_list.append(node['node'])
+    for tag in tags_list:
+        if tag['primaryReference'] is not None:
+            tag["reference"] = tag['primaryReference']["doi"]
+            tag = tag.pop('primaryReference')
+        else:
+            tag['reference'] = ''
+            tag = tag.pop('primaryReference')
+
+    toml_dict = {'tag': dict()}
+
+    for tag in tags_list:
+        if tag['aliases'] is not None:
+            toml_dict['tag'][tag['name']] = {
+                'name': tag['name'],
+                'reference': tag['reference'],
+                'synonyms': tag['aliases']
+            }
+            if len(toml_dict['tag'][tag['name']]['synonyms']) == 0:
+                toml_dict['tag'][tag['name']].pop('synonyms')
+
+        else:
+            toml_dict['tag'][tag['name']] = {
+                'name': tag['name'],
+                'reference': tag['reference']
+            }
+
+        if len(toml_dict['tag'][tag['name']]['reference']) == 0:
+            toml_dict['tag'][tag['name']].pop('reference')
 
-    if len(toml_dict['tag'][tag['name']]['reference']) == 0:
-        toml_dict['tag'][tag['name']].pop('reference')
+    with open(output_file, "w") as toml_file:
+        toml.dump(toml_dict, toml_file)
 
-with open('../allele_components/tags_fpbase.toml', "w") as toml_file:
-    toml.dump(toml_dict, toml_file)
-# %%
+
+if __name__ == "__main__":
+    output_file = sys.argv[1]
+    main(output_file)
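After running the script (e.g. `python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml`, as the CI does), the output can be sanity-checked with the `toml` package. A sketch; `mEGFP` is a hypothetical key, since the entries depend on what FPbase returns:

```python
import toml

tags = toml.load('allele_components/tags_fpbase.toml')
print(len(tags['tag']))          # number of fluorescent protein entries
print(tags['tag'].get('mEGFP'))  # hypothetical entry; may be absent
```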
diff --git a/grammar/pseudo_grammar.json b/grammar/pseudo_grammar.json
new file mode 100644
index 0000000..cb35a83
--- /dev/null
+++ b/grammar/pseudo_grammar.json
@@ -0,0 +1,34 @@
+[
+  {
+    "feature_name": "GENE_DELETION",
+    "pattern": "<GENE><->?<other><->?<MARKER>",
+    "other_regex": ["^(delta|δ|del|d)$"]
+  },
+  {
+    "feature_name": "GENE_DELETION",
+    "pattern": "<GENE><->?<MARKER>",
+    "other_regex": []
+  },
+  {
+    "feature_name": "dummy_matching_PROMOTER_GENE",
+    "pattern": "<other><GENE>",
+    "other_regex": ["^(dummy)$"]
+  },
+  {
+    "feature_name": "PROMOTER_GENE",
+    "pattern": "<other><GENE>",
+    "other_regex": ["(?<![a-z])p$"]
+  },
+  {
+    "feature_name": "ALLELE_AA_SUBSTITUTION",
+    "pattern": "<GENE><other>+",
+    "other_regex": [
+      "\\(?([gpavlimcfywhkrqnedst]\\d+[gpavlimcfywhkrqnedst]\\,?)+\\,?\\)?"
+    ]
+  },
+  {
+    "feature_name": "C_Terminal_Tagging",
+    "pattern": "<GENE><->?<TAG><->?<MARKER>",
+    "other_regex": []
+  }
+]
diff --git a/poetry.lock b/poetry.lock
index b60253b..2945f87 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -146,11 +146,22 @@ python-versions = ">=3.5.0"
 
 [package.extras]
 unicode_backport = ["unicodedata2"]
 
+[[package]]
+name = "click"
+version = "8.1.3"
+description = "Composable command line interface toolkit"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
 [[package]]
 name = "colorama"
 version = "0.4.5"
 description = "Cross-platform colored terminal text."
-category = "dev"
+category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 
@@ -362,6 +373,14 @@ MarkupSafe = ">=2.0"
 
 [package.extras]
 i18n = ["Babel (>=2.7)"]
 
+[[package]]
+name = "joblib"
+version = "1.1.0"
+description = "Lightweight pipelining with Python functions"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
 [[package]]
 name = "jsonschema"
 version = "4.6.0"
@@ -575,6 +594,28 @@ category = "dev"
 optional = false
 python-versions = ">=3.5"
 
+[[package]]
+name = "nltk"
+version = "3.7"
+description = "Natural Language Toolkit"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+click = "*"
+joblib = "*"
+regex = ">=2021.8.3"
+tqdm = "*"
+
+[package.extras]
+all = ["numpy", "pyparsing", "scipy", "matplotlib", "twython", "requests", "scikit-learn", "python-crfsuite"]
+corenlp = ["requests"]
+machine_learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
+plot = ["matplotlib"]
+tgrep = ["pyparsing"]
+twitter = ["twython"]
+
 [[package]]
 name = "notebook"
 version = "6.4.12"
@@ -889,6 +930,14 @@ packaging = "*"
 
 [package.extras]
 test = ["pytest (>=6,!=7.0.0,!=7.0.1)", "pytest-cov (>=3.0.0)", "pytest-qt"]
 
+[[package]]
+name = "regex"
+version = "2022.9.13"
+description = "Alternative regular expression module, to replace re."
+category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "requests" version = "2.28.0" @@ -987,7 +1036,7 @@ test = ["pytest", "pytest-cov", "pytest-flake8", "pytest-isort", "coverage"] name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" -category = "dev" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" @@ -999,6 +1048,23 @@ category = "dev" optional = false python-versions = ">= 3.5" +[[package]] +name = "tqdm" +version = "4.64.1" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.3.0" @@ -1053,7 +1119,7 @@ notebook = ">=4.4.1" [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "13b0b5c3156cb8f6c54266a1369ed3e236531d0d19377735ca60e7acb00c7cf6" +content-hash = "61c60f897ba9da3e67ae4bc8224a080528e1ad8783ca5f3bc3d698de7629c834" [metadata.files] appnope = [ @@ -1171,6 +1237,10 @@ charset-normalizer = [ {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, ] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] colorama = [ {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, @@ -1252,6 +1322,7 @@ jinja2 = [ {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, ] +joblib = [] jsonschema = [ {file = "jsonschema-4.6.0-py3-none-any.whl", hash = "sha256:1c92d2db1900b668201f1797887d66453ab1fbfea51df8e4b46236689c427baf"}, {file = "jsonschema-4.6.0.tar.gz", hash = "sha256:9d6397ba4a6c0bf0300736057f649e3e12ecbc07d3e81a0dacb72de4e9801957"}, @@ -1351,6 +1422,7 @@ nest-asyncio = [ {file = "nest_asyncio-1.5.5-py3-none-any.whl", hash = "sha256:b98e3ec1b246135e4642eceffa5a6c23a3ab12c82ff816a92c612d68205813b2"}, {file = "nest_asyncio-1.5.5.tar.gz", hash = "sha256:e442291cd942698be619823a17a86a5759eabe1f8613084790de189fe9e16d65"}, ] +nltk = [] notebook = [ {file = "notebook-6.4.12-py3-none-any.whl", hash = "sha256:8c07a3bb7640e371f8a609bdbb2366a1976c6a2589da8ef917f761a61e3ad8b1"}, {file = "notebook-6.4.12.tar.gz", hash = "sha256:6268c9ec9048cff7a45405c990c29ac9ca40b0bc3ec29263d218c5e01f2b4e86"}, @@ -1621,6 +1693,7 @@ qtpy = [ {file = "QtPy-2.1.0-py3-none-any.whl", hash = "sha256:aee0586081f943029312becece9f63977b0a9e3788f77a6ac8cc74802bb173d6"}, {file = "QtPy-2.1.0.tar.gz", hash = "sha256:ca8cd4217175186344299ee4c0f7e7adcf362c70852ba35b255a534077025c06"}, ] +regex = [] requests = [ {file = "requests-2.28.0-py3-none-any.whl", 
hash = "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f"}, {file = "requests-2.28.0.tar.gz", hash = "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"}, @@ -1696,6 +1769,7 @@ tornado = [ {file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"}, {file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"}, ] +tqdm = [] traitlets = [ {file = "traitlets-5.3.0-py3-none-any.whl", hash = "sha256:65fa18961659635933100db8ca120ef6220555286949774b9cfc106f941d1c7a"}, {file = "traitlets-5.3.0.tar.gz", hash = "sha256:0bb9f1f9f017aa8ec187d8b1b2a7a6626a2a1d877116baba52a129bfa124f8e2"}, diff --git a/pyproject.toml b/pyproject.toml index 70db754..43fe8ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,8 @@ openpyxl = "^3.0.10" pandas = "^1.4.2" python = "^3.9" requests = "^2.27.1" +nltk = "^3.7" +toml = "^0.10.2" [tool.poetry.dev-dependencies] autopep8 = "^1.6.0" diff --git a/readme.md b/readme.md index ee24069..6daaaa0 100644 --- a/readme.md +++ b/readme.md @@ -28,6 +28,16 @@ You can add the virtual environment that you created to the jupyter kernel by ru ``` poetry run python -m ipykernel install --user ``` +### Working with Docker + +To build from the dockerfile available in the repo: +``` +$ docker build -t genestorian . +$ docker run --name genestorian_c -d genestorian:latest sleep inifinity +$ docker exec -it genestorian_c /bin/sh + +``` +To stop the docker: `$ docker stop genestorian_c` ## Getting the data @@ -66,10 +76,10 @@ It has 5 columns: 3. Allele name (if we are lucky we find it in the `genotype` column in `data/strains.tsv`) 4. Description (some info about the allele sequence). For now we won't use it. 5. Expression (expression level in the experiment. In general reflects a change in the promoter.). For now we won't use it. - + ### Other features The folder `alleles_components` contains a bunch of toml files. Each toml file corresponds to one feature type. -`markers.toml`, `promoters.toml`, `tags.tom`, `sequence_features.tom` contains common markers, promoters, tags and sequence features used in S Pombe labs. The format of the toml file is as: +`markers.toml`, `promoters.toml`, `tags.toml`, `sequence_features.toml` contains common markers, promoters, tags and sequence features used in S Pombe labs. The format of the toml file is: ```toml [feature_type.] @@ -95,15 +105,15 @@ synonyms = [ "wtGFP", "GFP", "gfp10", "Green Fluorescent Protein",] You can generate the file `allele_components/tags_fpbase.toml`, which contains many of the known fluorescent protein tags in the above format from fp_base(https://www.fpbase.org/). To do this go to the folder `get_data` and run: ```bash -python get_fpbase_data.py +python get_fpbase_data.py ../allele_components/tags_fpbase.toml ``` This script retrieves the data from fb_base graphql API(https://www.fpbase.org/graphql/). ## Running the Pipeline -This pipeline is refinement pipeline for genotype. The goal of the pipeline is to be able to extract the alleles from genotype, identify the pattern followed by the allele and structure it to follow a standard format. -At present, the pipeline extracts alleles from the genotype to a list then identifies different features of alleles and add a tag to each identified feature. The input must be a tsv file, typically named `strains.tsv` with column names 'strain_id' and 'genotype' which contain strain id and genotype of a strain. 
+The goal of this pipeline is to extract the alleles from the genotypes, identify the patterns followed by the alleles and structure the data in a way that it can be migrated to a database.
+At present, the pipeline extracts the alleles from each genotype into a list, then identifies the different features of the alleles to tokenize and tag them. The tagged tokens are then parsed by the NLTK RegexpParser using the rules defined by us. The output of the parser is a tree with the identified patterns as subtrees. The input of the pipeline must be a tsv file, typically named `strains.tsv`, with the columns 'strain_id' and 'genotype', which contain the strain id and genotype of a strain.
 
 ```tsv
 strain_id	genotype
@@ -129,7 +139,7 @@ read_file.to_csv('strains.tsv', sep='\t', index=False)
 
 ### Build nltk tags
 
-We are using nltk library to process tha data. Before using the nltk library, it's important to have data structured in a format which can be input to nltk APIs.
+We are using the nltk library to process the data. Before using the nltk library, it's important to have the data structured in a format which can be input to the nltk parser.
 The script `build_nltk_tags` in `genestorian_module` takes `strains.tsv` as an input and creates a file named `alleles_pattern_nltk.json` in the same directory of `strains.tsv`. To run this script:
@@ -233,4 +243,72 @@ laci 1
 laco 1
 9 1
-```
\ No newline at end of file
+```
+
+### Grammar for NLTK Regex Chunk Parser
+
+We use the NLTK Regexp chunk parser to parse the allele names. The grammar is the set of chunk rules defined to parse the allele names. Because the data we work with is much more complicated than the text usually parsed with nltk, we have defined a pseudo-grammar that is first used to build the chunk rules and is later used to further validate the chunked patterns.
+
+To build your own grammar you need a json file containing a list of rules. Each rule is a dictionary with a rule name (`feature_name`), a chunk `pattern` written in nltk tag syntax, and a list `other_regex` of regular expressions. Each regex must match the value of the corresponding 'other' tag in the pattern for the pattern to be correctly identified, as demonstrated in the example below.
+
+```
+[
+    {
+        "feature_name": "GENE_DELETION",
+        "pattern": "<GENE><->?<other>?<MARKER><->?",
+        "other_regex": [
+            "^(delta|δ|del)$"
+        ]
+    },
+    {
+        "feature_name": "PROMOTER_GENE",
+        "pattern": "<other><GENE>",
+        "other_regex": [
+            "(?<![a-z])p$"
+        ]
+    },
+    {
+        "feature_name": "C_Terminal_Tagging",
+        "pattern": "<GENE><->?<TAG><->?<MARKER>",
+        "other_regex": []
+    }
+]
+```
+
+Save this list, e.g. in `grammar/pseudo_grammar.json`.
+
+Then, call `python genestorian_module/genestorian_module/build_grammar.py grammar/pseudo_grammar.json grammar/grammar.txt` on that file, and specify an output text file (in this case `grammar/grammar.txt`).
+
+This creates the `grammar.txt` file at the output path you specified. The text file from the above example would look like:
+
+```
+GENE_DELETION: {<GENE><->?<other>?<MARKER><->?}
+PROMOTER_GENE: {<other><GENE>}
+C_Terminal_Tagging: {<GENE><->?<TAG><->?<MARKER>}
+```
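+
+Internally, each rule's `pattern` is compiled into an nltk chunk rule (this is what `post_process_pseudo_grammar` in `build_nltk_trees.py` does); for example, the PROMOTER_GENE rule above becomes:
+
+```python
+from nltk.chunk import RegexpParser
+
+rule = {'feature_name': 'PROMOTER_GENE', 'pattern': '<other><GENE>'}
+parser = RegexpParser(
+    f"{rule['feature_name']}: " + '{' + rule['pattern'] + '}',
+    root_label='ROOT')
+print(parser)
+```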
+### Identify patterns using NLTK RegexChunker
+We use the NLTK Regex Chunker along with the regexes defined in the pseudo-grammar to identify patterns in allele names. The chunk parser first identifies the patterns defined in `grammar.txt` and builds a tree. Then each `other_regex` in the pseudo-grammar is matched against the value of the corresponding 'other' token in the subtree (the identified pattern tree within the tree) to validate it. Only if the value of the 'other' tag matches is the pattern identified by the chunker labelled; otherwise the identified pattern tree is discarded. In some cases only a part of the 'other' token value is matched; in such cases the value is split, only the matched part is kept inside the identified pattern tree, and the remaining part is added outside of it.
+
+To identify patterns in your alleles run `python build_nltk_trees.py /path/to/alleles_pattern_nltk.json /path/to/pseudo_grammar.json /path/to/nltk_trees.json`
+in `genestorian_module/genestorian_module/`. This creates the file `nltk_trees.json` at the output path you give (the tests write it next to `alleles_pattern_nltk.json`). The file contains a dictionary in which the keys are the allele names and the values are the trees,
+
+for example alleles:
+```
+pht1kanmx6
+ade6-m210<