Addresses issue #29 (#35)
* WIP: nltk trees

* readme and doc strings

* more additions to readme

* alleles_pattern_nltk.json

* tests updated

* fixed the failing tests

* updating pseudo grammar

* more to tests

* readme updated

* bug fix in nltk_trees

* Nltk trees manu (#39)

* fix fpbase things

* intermediate fix

* simplified version, new grammar, does not handle split

* half way

* added poetry dependencies

* simple version working

* fix tests

* update gitignore

* Ci workflow (#38)

* ci_yaml and docker

* updating ci.yaml

* updating ci.yaml

* dockerfile updated

* fixing ci

Co-authored-by: Anamika Yadav <anamika310.yadav@gmail.com>

* fix ci line

* remove docker action

* make action run at each push

* download tags in CI

* silly mistake CI fixed

* fixed error

Co-authored-by: Anamika Yadav <anamika310.yadav@gmail.com>

Co-authored-by: Manuel Lera Ramirez <manulera14@gmail.com>
anamika-yadav99 and manulera committed Sep 15, 2022
1 parent f35eb72 commit 50df31f
Showing 24 changed files with 829 additions and 106 deletions.
5 changes: 5 additions & 0 deletions .dockerignore
@@ -0,0 +1,5 @@
.vscode/
examples/
.github/
.venv/
.git/
57 changes: 57 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,57 @@
name: Python tests
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-20.04
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry config virtualenvs.create false
          poetry install --no-dev
      # Before running the test you have to download the tags!
      - name: Run tests
        run: |
          python get_data/get_fpbase_data.py allele_components/tags_fpbase.toml
          cd genestorian_module/test
          python -m unittest
  # Update docker image when committing to master branch if tests pass
  # push_to_registry:
  #   name: Push Docker image to Docker Hub
  #   runs-on: ubuntu-latest
  #   needs: test
  #   if: github.ref == 'refs/heads/master'
  #   steps:
  #     - name: Check out the repo
  #       uses: actions/checkout@v3

  #     - name: Log in to Docker Hub
  #       uses: docker/login-action@v2
  #       with:
  #         username: ${{ secrets.DOCKER_USERNAME }}
  #         password: ${{ secrets.DOCKER_PASSWORD }}

  #     - name: Extract metadata (tags, labels) for Docker
  #       id: meta
  #       uses: docker/metadata-action@v2
  #       with:
  #         images: genestorian_refinement_pipeline

  #     - name: Build and push Docker images
  #       uses: docker/build-push-action@v3.1.1

  #       with:
  #         context: .
  #         push: true
  #         tags: manulera/genestorian_refinement_pipeline:latest
  #         labels: ${{ steps.meta.outputs.labels }}
7 changes: 7 additions & 0 deletions .gitignore
@@ -14,3 +14,10 @@
/Lab_strains/**/*.json

/allele_components/tags_fpbase.toml
/grammar/*.txt

/genestorian_module/test/alleles_pattern*
/genestorian_module/test/common_pattern.json
/genestorian_module/test/common_pattern_count.txt
/genestorian_module/test/most_common_other_tag.txt
/genestorian_module/test/nltk_trees_dataset/nltk_trees.json
16 changes: 16 additions & 0 deletions Dockerfile
@@ -0,0 +1,16 @@
FROM python:3.9

WORKDIR /pipeline

RUN pip install poetry
RUN pip install nltk
RUN pip install toml

COPY ./ /pipeline/

RUN poetry config virtualenvs.create false
RUN poetry install --without dev
RUN poetry shell

COPY . /pipeline

9 changes: 9 additions & 0 deletions Lab_strains/format_all.sh
@@ -0,0 +1,9 @@
for lab in *_lab
do
    cd $lab
    if test -f "format.py"; then
        echo "running in $lab"
        python format.py
    fi
    cd ..
done
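For reference, a rough Python equivalent of this loop, as a sketch only: like the shell script, it assumes it is run from the directory containing the *_lab folders (Lab_strains/).

# Sketch: Python equivalent of format_all.sh (illustrative helper, not part of the repo)
import os
import subprocess

for lab in sorted(os.listdir('.')):
    if lab.endswith('_lab') and os.path.isfile(os.path.join(lab, 'format.py')):
        print(f'running in {lab}')
        subprocess.run(['python', 'format.py'], cwd=lab, check=True)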
41 changes: 41 additions & 0 deletions example.py
@@ -0,0 +1,41 @@
# %%

from nltk.chunk.regexp import RegexpChunkRule, ChunkString
import re
from nltk.tree import Tree
from nltk.chunk import RegexpParser
# %%
grammar = """
GENE_DELETION|BLAH: {<GENE><SPACER>?<other>?<SPACER>?<MARKER>}
"""

custom_tag_parser = RegexpParser(grammar, root_label='ROOT')

input = Tree('ROOT', [
Tree('GENE', ['mph1']),
Tree('SPACER', ['::']),
Tree('other', ['hello']),
Tree('SPACER', ['::']),
Tree('MARKER', ['kanr'])
])
result: Tree = custom_tag_parser.parse_all(input)
# custom_tag_parser

# %%

# match = re.match('(aa)aa', 'aaaa')
# match.group()
# %%
cs = ChunkString(input)

rule = RegexpChunkRule.fromstring(
'{<GENE><SPACER>?<other>?<SPACER>?<MARKER>}')

print(rule._regexp)

match = re.match(rule._regexp, cs._str)
print(rule._regexp)
print(match.groups())
# cs.xform(rule._regexp, '{\g<chunk>}')
rule._regexp.flags
# print(cs._str)
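For context, a minimal sketch of the same chunking idea using NLTK's standard tagged-token input instead of nested Trees; the rule name and the example tokens below are illustrative only, not part of the pipeline.

from nltk.chunk import RegexpParser

# One chunk rule: a GENE, optional SPACER/other tokens, then a MARKER
grammar = "GENE_DELETION: {<GENE><SPACER>?<other>?<SPACER>?<MARKER>}"
parser = RegexpParser(grammar, root_label='ROOT')

tagged = [('mph1', 'GENE'), ('::', 'SPACER'), ('hello', 'other'),
          ('::', 'SPACER'), ('kanr', 'MARKER')]
print(parser.parse(tagged))
# Expected shape: (ROOT (GENE_DELETION mph1/GENE ::/SPACER hello/other ::/SPACER kanr/MARKER))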
6 changes: 0 additions & 6 deletions genestorian_module/genestorian_module.egg-info/PKG-INFO
@@ -1,9 +1,3 @@
Metadata-Version: 2.1
Name: genestorian-module
Version: 0.0.0
Summary: UNKNOWN
License: UNKNOWN
Platform: UNKNOWN

UNKNOWN

12 changes: 9 additions & 3 deletions genestorian_module/genestorian_module.egg-info/SOURCES.txt
@@ -1,10 +1,16 @@
setup.py
genestorian_module/__init__.py
genestorian_module/converge.py
genestorian_module/fourth_version_pipeline.py
genestorian_module/build_nltk_tags.py
genestorian_module/build_nltk_trees.py
genestorian_module/replace_feature.py
genestorian_module/summary_nltk_tags.py
genestorian_module/third_version_pipeline.py
genestorian_module.egg-info/PKG-INFO
genestorian_module.egg-info/SOURCES.txt
genestorian_module.egg-info/dependency_links.txt
genestorian_module.egg-info/top_level.txt
genestorian_module.egg-info/top_level.txt
test/test_build_nltk_tags.py
test/test_build_grammar.py
test/test_build_nltk_tags.py
test/test_nltk_trees.py
test/test_summary_nltk_tags.py
18 changes: 18 additions & 0 deletions genestorian_module/genestorian_module/__init__.py
@@ -2,6 +2,15 @@


def excel_to_tsv(excel_file, read_cols, tsv_file):
'''Extracts genotype and strain id from excel file to tsv file
Parameter:
excel_file(path to file): path to the excel file
read_cols(list): list of column names to be read
tsv_file(path): path to tsv file
Returns:
None'''
#read_cols = ['strain_id/Sample Name', 'genotype']
read_file = pd.read_excel(excel_file, usecols=read_cols, na_filter=False)
read_file = read_file.rename(
@@ -23,6 +32,15 @@ def excel_to_tsv(excel_file, read_cols, tsv_file):


def read_strains_tsv(tsv_file):
'''
Reads the genotype and strain_id columns from the strains.tsv file
Parameter:
tsv_file(path): path to strains.tsv
Return:
data(pandas dataframe): pandas dataframe where columns are strain_id and genotype
'''
data = pd.read_csv(tsv_file, sep='\t', na_filter=False)
data['genotype'] = data['genotype'].astype(str)
data['strain_id'] = data['strain_id'].astype(str)
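A hedged usage sketch of these helpers; the file path below is illustrative, and the column names follow the docstrings above.

from genestorian_module import read_strains_tsv

# Read the strain_id and genotype columns into a pandas DataFrame
strains = read_strains_tsv('Lab_strains/example_lab/strains.tsv')
for strain_id, genotype in zip(strains['strain_id'], strains['genotype']):
    print(strain_id, genotype)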
69 changes: 58 additions & 11 deletions genestorian_module/genestorian_module/build_nltk_tags.py
@@ -1,12 +1,21 @@
from genestorian_module.replace_feature import build_feature_dict
from genestorian_module.third_version_pipeline import build_strain_list
from genestorian_module.replace_feature import (build_feature_dict,
build_strain_list)
import re
import json
import sys
import os
ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__)))


def build_separators_dict():
'''
Builds a dictionary whose keys are the separators read from the separators text file
Parameter:
None
Return:
separators_dict(dict): dict of separators
'''
separators_dict = {}
with open("../../allele_components/separators.txt", "r") as fp:
for x in fp:
@@ -16,24 +25,43 @@ def build_separators_dict():


def add_other_tag(pattern_list):
'''
Tags the remaining unidentified elements of the allele as 'other'
Parameter:
pattern_list(list): list of tokenized allele components along with untokenised components
Return"
pattern_list(list): list of tokenized allele components
'''
for feature in pattern_list:
if type(feature) != list:
idx = pattern_list.index(feature)
pattern_list[idx] = ['other', [feature]]
return pattern_list


def replace_allele_features(feature_dict, pattern_list, feature_name, matches):
def tokenize_allele_features(feature_dict, pattern_list, feature_name, matches):
'''Tokenizes the components of alleles according to the matches found in feature_dict
Parameters:
feature_dict(dict): dictionary of features to be matched
pattern_list(list): list of features of an allele (tokenised and untokenised)
feature_name(str): name of the feature or token
matches(list): list of matches of an allele found in feature_dict
Returns:
out_list(list): list of patterns (tokenized and untokenized)
'''
out_list = list()
for i in range(len(pattern_list)):
if type(pattern_list[i]) != str:
out_list.append(pattern_list[i])
continue
if len(matches) == 0:
for feature in feature_dict.keys():
if feature.lower() in pattern_list[i]:
matches.append(feature.lower())
matches.sort(key=len, reverse=True)
for feature in feature_dict.keys():
if feature.lower() in pattern_list[i]:
matches.append(feature.lower())
matches.sort(key=len, reverse=True)
allele_substring = pattern_list[i]
this_list = [allele_substring]
for match in matches:
@@ -44,7 +72,7 @@ def replace_allele_features(feature_dict, pattern_list, feature_name, matches):
feature_name, [allele_substring[start:end]]], allele_substring[end:]]
# Remove empty strings
this_list = list(filter(lambda x: x != '', this_list))
this_list = replace_allele_features(
this_list = tokenize_allele_features(
feature_dict, this_list, feature_name, matches)
break
out_list += this_list
@@ -53,29 +81,48 @@ def replace_allele_features(feature_dict, pattern_list, feature_name, matches):


def build_nltk_tag(allele_names, toml_files):
'''
Builds a list of dicts, each with an allele name and a list of tokens of the allele features
Parameter:
allele_names(list): list of alleles
toml_files(list): list of toml files in the allele_components directory
Return:
output_list(list): list of dicts of allele names and patterns'''
output_list = []
for allele_name in allele_names:
output_list.append({
'name': allele_name,
'pattern': [allele_name],
})
for toml_file in toml_files:
print('finding features using', toml_file.split('/')[-1])
feature_dict, feature_name = build_feature_dict(toml_file)
for allele_dict in output_list:
allele_dict['pattern'] = replace_allele_features(
allele_dict['pattern'] = tokenize_allele_features(
feature_dict, allele_dict['pattern'], feature_name, [])

separators_dict = build_separators_dict()
for allele_dict in output_list:
# replace separators
allele_dict['pattern'] = replace_allele_features(
allele_dict['pattern'] = tokenize_allele_features(
separators_dict, allele_dict['pattern'], '-', [])
# add other tags to untagged elements:
allele_dict['pattern'] = add_other_tag(allele_dict['pattern'])
return output_list


def prettier_json(input_dict):
'''
Formats the dict as a JSON string to make it more readable
Parameter:
input_dict(dict): dictionary of alleles
Returns:
output_str(str): formatted input_dict
'''
output_str = json.dumps(input_dict, indent=3, ensure_ascii=False)

match = re.search(r'\[(?=\n)(\n|(?![{}]).)+\]', output_str)
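To illustrate the pattern structure these functions build, a small sketch using add_other_tag; the input list is made up, and in the real pipeline the tag names come from the toml feature files.

from genestorian_module.build_nltk_tags import add_other_tag

# A partially tokenized allele: one recognised feature plus a leftover substring
pattern = [['GENE', ['mph1']], '::blah']
print(add_other_tag(pattern))
# -> [['GENE', ['mph1']], ['other', ['::blah']]]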
