Addresses issue #29 (#35)
* WIP: nltk trees

* readme and doc strings

* more additions to readme

* alleles_pattern_nltk.json

* tests updated

* fixed the failing tests

* updating pseudo grammar

* more to tests

* readme updated

* bug fix in nltk_trees

* Nltk trees manu (#39)

* fix fpbase things

* intermediate fix

* simplified version, new grammar, does not handle split

* half way

* added poetry dependencies

* simple version working

* fix tests

* update gitignore

* Ci workflow (#38)

* ci_yaml and docker

* updating ci.yaml

* updating ci.yaml

* dockerfile updated

* fixing ci

Co-authored-by: Anamika Yadav <anamika310.yadav@gmail.com>

* fix ci line

* remove docker action

* make action run at each push

* download tags in CI

* silly mistake CI fixed

* fixed error

Co-authored-by: Anamika Yadav <anamika310.yadav@gmail.com>

Co-authored-by: Manuel Lera Ramirez <manulera14@gmail.com>
anamika-yadav99 and manulera committed Sep 15, 2022
1 parent f35eb72 commit 50df31f
Showing 24 changed files with 829 additions and 106 deletions.
5 changes: 5 additions & 0 deletions .dockerignore
@@ -0,0 +1,5 @@
.vscode/
examples/
.github/
.venv/
.git/
57 changes: 57 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,57 @@
name: Python tests
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-20.04
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry config virtualenvs.create false
          poetry install --no-dev
      # Before running the test you have to download the tags!
      - name: Run tests
        run: |
          python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml
          cd genestorian_module/test
          python -m unittest
  # Update docker image when committing to master branch if tests pass
  # push_to_registry:
  #   name: Push Docker image to Docker Hub
  #   runs-on: ubuntu-latest
  #   needs: test
  #   if: github.ref == 'refs/heads/master'
  #   steps:
  #     - name: Check out the repo
  #       uses: actions/checkout@v3

  #     - name: Log in to Docker Hub
  #       uses: docker/login-action@v2
  #       with:
  #         username: ${{ secrets.DOCKER_USERNAME }}
  #         password: ${{ secrets.DOCKER_PASSWORD }}

  #     - name: Extract metadata (tags, labels) for Docker
  #       id: meta
  #       uses: docker/metadata-action@v2
  #       with:
  #         images: genestorian_refinement_pipeline

  #     - name: Build and push Docker images
  #       uses: docker/build-push-action@v3.1.1

  #       with:
  #         context: .
  #         push: true
  #         tags: manulera/genestorian_refinement_pipeline:latest
  #         labels: ${{ steps.meta.outputs.labels }}
7 changes: 7 additions & 0 deletions .gitignore
@@ -14,3 +14,10 @@
/Lab_strains/**/*.json

/allele_components/tags_fpbase.toml
/grammar/*.txt

/genestorian_module/test/alleles_pattern*
/genestorian_module/test/common_pattern.json
/genestorian_module/test/common_pattern_count.txt
/genestorian_module/test/most_common_other_tag.txt
/genestorian_module/test/nltk_trees_dataset/nltk_trees.json
16 changes: 16 additions & 0 deletions Dockerfile
@@ -0,0 +1,16 @@
FROM python:3.9

WORKDIR /pipeline

RUN pip install poetry
RUN pip install nltk
RUN pip install toml

COPY ./ /pipeline/

RUN poetry config virtualenvs.create false
RUN poetry install --without dev
RUN poetry shell

COPY . /pipeline

9 changes: 9 additions & 0 deletions Lab_strains/format_all.sh
@@ -0,0 +1,9 @@
for lab in *_lab
do
    cd $lab
    if test -f "format.py"; then
        echo "running in $lab"
        python format.py
    fi
    cd ..
done
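For reference, a rough Python equivalent of this loop, as a sketch only: like the shell script, it assumes it is run from the directory containing the *_lab folders (Lab_strains/).

# Sketch: Python equivalent of format_all.sh (illustrative helper, not part of the repo)
import os
import subprocess

for lab in sorted(os.listdir('.')):
    if lab.endswith('_lab') and os.path.isfile(os.path.join(lab, 'format.py')):
        print(f'running in {lab}')
        subprocess.run(['python', 'format.py'], cwd=lab, check=True)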
41 changes: 41 additions & 0 deletions example.py
@@ -0,0 +1,41 @@
# %%

from nltk.chunk.regexp import RegexpChunkRule, ChunkString
import re
from nltk.tree import Tree
from nltk.chunk import RegexpParser
# %%
grammar = """
GENE_DELETION|BLAH: {<GENE><SPACER>?<other>?<SPACER>?<MARKER>}
"""

custom_tag_parser = RegexpParser(grammar, root_label='ROOT')

input = Tree('ROOT', [
Tree('GENE', ['mph1']),
Tree('SPACER', ['::']),
Tree('other', ['hello']),
Tree('SPACER', ['::']),
Tree('MARKER', ['kanr'])
])
result: Tree = custom_tag_parser.parse_all(input)
# custom_tag_parser

# %%

# match = re.match('(aa)aa', 'aaaa')
# match.group()
# %%
cs = ChunkString(input)

rule = RegexpChunkRule.fromstring(
'{<GENE><SPACER>?<other>?<SPACER>?<MARKER>}')

print(rule._regexp)

match = re.match(rule._regexp, cs._str)
print(rule._regexp)
print(match.groups())
# cs.xform(rule._regexp, '{\g<chunk>}')
rule._regexp.flags
# print(cs._str)
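For context, a minimal sketch of the same chunking idea using NLTK's standard tagged-token input instead of nested Trees; the rule name and the example tokens below are illustrative only, not part of the pipeline.

from nltk.chunk import RegexpParser

# One chunk rule: a GENE, optional SPACER/other tokens, then a MARKER
grammar = "GENE_DELETION: {<GENE><SPACER>?<other>?<SPACER>?<MARKER>}"
parser = RegexpParser(grammar, root_label='ROOT')

tagged = [('mph1', 'GENE'), ('::', 'SPACER'), ('hello', 'other'),
          ('::', 'SPACER'), ('kanr', 'MARKER')]
print(parser.parse(tagged))
# Expected shape: (ROOT (GENE_DELETION mph1/GENE ::/SPACER hello/other ::/SPACER kanr/MARKER))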
6 changes: 0 additions & 6 deletions genestorian_module/genestorian_module.egg-info/PKG-INFO
@@ -1,9 +1,3 @@
Metadata-Version: 2.1
Name: genestorian-module
Version: 0.0.0
Summary: UNKNOWN
License: UNKNOWN
Platform: UNKNOWN

UNKNOWN

12 changes: 9 additions & 3 deletions genestorian_module/genestorian_module.egg-info/SOURCES.txt
@@ -1,10 +1,16 @@
setup.py
genestorian_module/__init__.py
genestorian_module/converge.py
genestorian_module/fourth_version_pipeline.py
genestorian_module/build_nltk_tags.py
genestorian_module/build_nltk_trees.py
genestorian_module/replace_feature.py
genestorian_module/summary_nltk_tags.py
genestorian_module/third_version_pipeline.py
genestorian_module.egg-info/PKG-INFO
genestorian_module.egg-info/SOURCES.txt
genestorian_module.egg-info/dependency_links.txt
genestorian_module.egg-info/top_level.txt
genestorian_module.egg-info/top_level.txt
test/test_build_nltk_tags.py
test/test_build_grammar.py
test/test_build_nltk_tags.py
test/test_nltk_trees.py
test/test_summary_nltk_tags.py
18 changes: 18 additions & 0 deletions genestorian_module/genestorian_module/__init__.py
@@ -2,6 +2,15 @@


def excel_to_tsv(excel_file, read_cols, tsv_file):
'''Extracts genotype and strain id from excel file to tsv file
Parameter:
excel_file(path to file): path to the excel file
read_cols(list): list of column names to be read
tsv_file(path): path to tsv file
Returns:
None'''
#read_cols = ['strain_id/Sample Name', 'genotype']
read_file = pd.read_excel(excel_file, usecols=read_cols, na_filter=False)
read_file = read_file.rename(
@@ -23,6 +32,15 @@ def excel_to_tsv(excel_file, read_cols, tsv_file):


def read_strains_tsv(tsv_file):
'''
Reads the genotype and strain_id columns from the strains.tsv file
Parameter:
tsv_file(path): path to strains.tsv
Return:
data(pandas dataframe): pandas dataframe where columns are strain_id and genotype
'''
data = pd.read_csv(tsv_file, sep='\t', na_filter=False)
data['genotype'] = data['genotype'].astype(str)
data['strain_id'] = data['strain_id'].astype(str)
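A hedged usage sketch of these helpers; the file path below is illustrative, and the column names follow the docstrings above.

from genestorian_module import read_strains_tsv

# Read the strain_id and genotype columns into a pandas DataFrame
strains = read_strains_tsv('Lab_strains/example_lab/strains.tsv')
for strain_id, genotype in zip(strains['strain_id'], strains['genotype']):
    print(strain_id, genotype)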
69 changes: 58 additions & 11 deletions genestorian_module/genestorian_module/build_nltk_tags.py
@@ -1,12 +1,21 @@
from genestorian_module.replace_feature import build_feature_dict
from genestorian_module.third_version_pipeline import build_strain_list
from genestorian_module.replace_feature import (build_feature_dict,
build_strain_list)
import re
import json
import sys
import os
ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__)))


def build_separators_dict():
'''
Builds a dictionary whose keys are the separators read from the separators text file
Parameter:
None
Return:
separators_dict(dict): dict of separators
'''
separators_dict = {}
with open("../../allele_components/separators.txt", "r") as fp:
for x in fp:
@@ -16,24 +25,43 @@ def build_separators_dict():


def add_other_tag(pattern_list):
'''
Tags the remaining unidentified elements of the allele as 'other'
Parameter:
pattern_list(list): list of tokenized allele components along with untokenised components
Return"
pattern_list(list): list of tokenized allele components
'''
for feature in pattern_list:
if type(feature) != list:
idx = pattern_list.index(feature)
pattern_list[idx] = ['other', [feature]]
return pattern_list


def replace_allele_features(feature_dict, pattern_list, feature_name, matches):
def tokenize_allele_features(feature_dict, pattern_list, feature_name, matches):
'''Tokenizes the components of alleles according to the matches found in feature_dict
Parameters:
feature_dict(dict): dictionary of features to be matched
pattern_list(list): list of features of an allele (tokenised and untokenised)
feature_name(str): name of the feature or token
matches(list): list of matches of an allele found in feature_dict
Returns:
out_list(list): list of patterns (tokenized and untokenized)
'''
out_list = list()
for i in range(len(pattern_list)):
if type(pattern_list[i]) != str:
out_list.append(pattern_list[i])
continue
if len(matches) == 0:
for feature in feature_dict.keys():
if feature.lower() in pattern_list[i]:
matches.append(feature.lower())
matches.sort(key=len, reverse=True)
for feature in feature_dict.keys():
if feature.lower() in pattern_list[i]:
matches.append(feature.lower())
matches.sort(key=len, reverse=True)
allele_substring = pattern_list[i]
this_list = [allele_substring]
for match in matches:
@@ -44,7 +72,7 @@ def replace_allele_features(feature_dict, pattern_list, feature_name, matches):
feature_name, [allele_substring[start:end]]], allele_substring[end:]]
# Remove empty strings
this_list = list(filter(lambda x: x != '', this_list))
this_list = replace_allele_features(
this_list = tokenize_allele_features(
feature_dict, this_list, feature_name, matches)
break
out_list += this_list
@@ -53,29 +81,48 @@ def replace_allele_features(feature_dict, pattern_list, feature_name, matches):


def build_nltk_tag(allele_names, toml_files):
'''
Builds a list of dicts, each with an allele name and a list of tokens of the allele features
Parameter:
allele_names(list): list of alleles
toml_files(list): list of toml files in the allele_components directory
Return:
output_list(list): list of dicts of allele names and patterns'''
output_list = []
for allele_name in allele_names:
output_list.append({
'name': allele_name,
'pattern': [allele_name],
})
for toml_file in toml_files:
print('finding features using', toml_file.split('/')[-1])
feature_dict, feature_name = build_feature_dict(toml_file)
for allele_dict in output_list:
allele_dict['pattern'] = replace_allele_features(
allele_dict['pattern'] = tokenize_allele_features(
feature_dict, allele_dict['pattern'], feature_name, [])

separators_dict = build_separators_dict()
for allele_dict in output_list:
# replace separators
allele_dict['pattern'] = replace_allele_features(
allele_dict['pattern'] = tokenize_allele_features(
separators_dict, allele_dict['pattern'], '-', [])
# add other tags to untagged elements:
allele_dict['pattern'] = add_other_tag(allele_dict['pattern'])
return output_list


def prettier_json(input_dict):
'''
Formats the dict as a JSON string to make it more readable
Parameter:
input_dict(dict): dictionary of alleles
Returns:
output_str(str): formatted input_dict
'''
output_str = json.dumps(input_dict, indent=3, ensure_ascii=False)

match = re.search(r'\[(?=\n)(\n|(?![{}]).)+\]', output_str)
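To illustrate the pattern structure these functions build, a small sketch using add_other_tag; the input list is made up, and in the real pipeline the tag names come from the toml feature files.

from genestorian_module.build_nltk_tags import add_other_tag

# A partially tokenized allele: one recognised feature plus a leftover substring
pattern = [['GENE', ['mph1']], '::blah']
print(add_other_tag(pattern))
# -> [['GENE', ['mph1']], ['other', ['::blah']]]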
