Merge pull request #38 from leobeeson/refactor-mwe-module
Refactor mwe module
meghdadFar committed May 27, 2023
2 parents 283995f + 2af5c78 commit f5bf642
Showing 10 changed files with 1,301 additions and 1,123 deletions.
12 changes: 12 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,12 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Pytest",
"type": "python",
"request": "launch",
"module": "pytest",
"console": "integratedTerminal"
}
]
}
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,8 @@
{
"esbonio.sphinx.confDir": ""
"esbonio.sphinx.confDir": "",
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
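
These VS Code settings wire the editor's test explorer and debugger to pytest. For reference, a minimal Python sketch of the equivalent programmatic invocation, assuming pytest is installed in the project environment:

# Programmatic equivalent of the "module": "pytest" launch configuration,
# pointed at the "tests" directory from python.testing.pytestArgs.
import sys

import pytest

if __name__ == "__main__":
    # pytest.main returns an exit code (0 on success).
    sys.exit(pytest.main(["tests"]))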
2,031 changes: 1,006 additions & 1,025 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ pytest = ">=7.1"
pytest-cov = ">=3.0.0"
ipython = ">=8.4.0"
sphinx = ">=v6.1.3"
sphinx-rtd-theme= "1.2.0"
sphinx-rtd-theme= "1.2.1"

[tool.poetry.scripts]
script_download = "wordview.bin.downloads:download_nltk_req"
File renamed without changes.
86 changes: 86 additions & 0 deletions tests/mwe/test_mwe.py
@@ -0,0 +1,86 @@
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd
from wordview.mwes.mwe import MWE


@pytest.fixture
def dummy_text_pandas():
text = [
"new york is the capital of new york state",
"new york is called the big apple",
"i am going to the big apple",
]
return pd.DataFrame(data = text, columns = ["text"])


@pytest.fixture
def dummy_text_pandas_with_a_noun_compund():
text = [
"this sentence has a noun compound",
]
return pd.DataFrame(data = text, columns = ["text"])


dummy_pos_tags_with_noun_compund = [("this", "XXX"),("sentence", "XXX"),("has", "XXX"),("a", "XXX"),("noun", "NN"),("compound", "NN")]


@pytest.fixture
def dummy_text_pandas_with_no_noun_compund():
text = [
"no sequence of nouns in this one",
]
return pd.DataFrame(data = text, columns = ["text"])


dummy_pos_tags_without_noun_compund = [("no", "XXX"),("sequence", "XXX"),("of", "XXX"),("nouns", "XXX"),("in", "XXX"),("this", "XXX"),("one", "XXX")]


class TestMweInitialisation:

def test_mwe_does_not_tokenize_text_with_multiple_whitespaces(self, dummy_text_pandas):
mwe = MWE(df = dummy_text_pandas, text_column = "text", tokenize=False)
assert mwe.df["text"][0] == "new york is the capital of new york state"


def test_mwe_tokenizes_text_with_multiple_whitespaces(self, dummy_text_pandas):
mwe = MWE(df = dummy_text_pandas, text_column = "text", tokenize=True)
assert mwe.df["text"][0] == "new york is the capital of new york state"


def test_mwe_with_wrong_mwe_type_raises_value_error(self, dummy_text_pandas):
with pytest.raises(ValueError):
mwe = MWE(df = dummy_text_pandas, text_column = "text", mwe_types = ["XXX"])


def test_mwe_with_empty_mwe_type_raises_value_error(self, dummy_text_pandas):
with pytest.raises(ValueError):
mwe = MWE(df = dummy_text_pandas, text_column = "text", mwe_types = [])


def test_mwe_with_non_list_mwe_type_raises_type_error(self, dummy_text_pandas):
with pytest.raises(TypeError):
mwe = MWE(df = dummy_text_pandas, text_column = "text", mwe_types = "NC")


class TestMweCounter:

@patch("wordview.mwes.mwe.get_pos_tags", MagicMock(return_value = dummy_pos_tags_with_noun_compund))
def test_mwe_if_nc_present_returns_counts_with_nc(self, dummy_text_pandas_with_a_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_a_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
counts = mwe.get_counts()
assert counts["NC"] == {"noun compound": 1}


@patch("wordview.mwes.mwe.get_pos_tags", MagicMock(return_value=dummy_pos_tags_without_noun_compund))
def test_mwe_if_no_nc_returns_empty_mwe_counts(self, dummy_text_pandas_with_no_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_no_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
counts = mwe.get_counts()
assert counts["NC"] == {}


@pytest.mark.xfail
def test_mwe_build_counts(dummy_text_pandas_with_no_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_no_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
counts = mwe.get_counts()
assert False
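
For orientation, a minimal usage sketch of the MWE API these tests exercise. The sample sentences and the counts shown in the comments are illustrative assumptions, not repository fixtures, and the NLTK resources that wordview's POS tagging relies on are assumed to be installed.

import pandas as pd

from wordview.mwes.mwe import MWE

corpus = pd.DataFrame(
    data=[
        "climate change is a pressing issue",
        "brain drain affects many countries",
    ],
    columns=["text"],
)

# Instantiate with tokenization enabled and noun-compound extraction ("NC").
mwe = MWE(df=corpus, text_column="text", tokenize=True, mwe_types=["NC"])

# get_counts() returns a dict keyed by MWE type plus a "WORDS" entry with
# per-token counts, e.g. {"NC": {"climate change": 1, ...}, "WORDS": {...}}.
counts = mwe.get_counts()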
84 changes: 83 additions & 1 deletion tests/mwe_utils/test_mwe_utils.py
@@ -1,5 +1,5 @@
import pytest
from wordview.mwes.mwe_utils import get_ngrams
from wordview.mwes.mwe_utils import get_ngrams, is_alphanumeric_latinscript_multigram


def test_get_ngrams_int_input():
@@ -15,3 +15,85 @@ def test_get_ngrams_int_list_input():
def test_get_ngrams_mixed_list_input():
with pytest.raises(TypeError):
get_ngrams(sentence=[1, 2, 'test'], n=2)


class TestIsAlphanumericLatinscriptMultigram:

def test_is_alphanumeric_latinscript_multigram_matches_alphabetic_bigram(self):
match = is_alphanumeric_latinscript_multigram("ab")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_numeric_bigram(self):
match = is_alphanumeric_latinscript_multigram("01")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_alphanumeric_bigram(self):
match = is_alphanumeric_latinscript_multigram("a0")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_numeralphabetic_bigram(self):
match = is_alphanumeric_latinscript_multigram("0a")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_largest_alphanumeric_unique_letter_ngram(self):
match = is_alphanumeric_latinscript_multigram("abcdefghijklmnopqrstuvwxyz0123456789")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_does_not_match_alphabetical_unigram(self):
match = is_alphanumeric_latinscript_multigram("a")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_numeric_unigram(self):
match = is_alphanumeric_latinscript_multigram("0")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_symbol_unigram(self):
match = is_alphanumeric_latinscript_multigram("%")
assert match is None


def test_is_alphanumeric_latinscript_multigram_matches_ngram_with_symbol_after_second_char(self):
match = is_alphanumeric_latinscript_multigram("abcd$efg")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_symbol_before_third_char(self):
match = is_alphanumeric_latinscript_multigram("a$bcdefg")
assert match is None


def test_is_alphanumeric_latinscript_multigram_matches_ngram_with_hyphen_after_second_char(self):
match = is_alphanumeric_latinscript_multigram("abcd-efg")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_hyphen_before_third_char(self):
match = is_alphanumeric_latinscript_multigram("a-bcdefg")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_greek_characters(self):
match = is_alphanumeric_latinscript_multigram("αβγ")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_hebrew_characters(self):
match = is_alphanumeric_latinscript_multigram("אָלֶף")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_arabic_characters(self):
match = is_alphanumeric_latinscript_multigram("أَلِف")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_russian_cyrillic_characters(self):
match = is_alphanumeric_latinscript_multigram("азъ")
assert match is None
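
Taken together, these tests pin down the contract of is_alphanumeric_latinscript_multigram: the first two characters must be Latin alphanumerics, a small set of symbols such as '$' and '-' is tolerated only from the third character onwards, and non-Latin scripts never match. A hypothetical reconstruction consistent with the tests (the actual pattern in wordview.mwes.mwe_utils may differ) could look like:

import re
from typing import Optional

# Assumed pattern: at least two leading Latin alphanumerics, then Latin
# alphanumerics or a few symbols such as '$' and '-'. Illustrative only.
_MULTIGRAM_PATTERN = re.compile(r"^[a-zA-Z0-9]{2}[a-zA-Z0-9$\-]*$")


def is_alphanumeric_latinscript_multigram(token: str) -> Optional[re.Match]:
    """Return a match for Latin-script alphanumeric n-grams (n >= 2), else None."""
    return _MULTIGRAM_PATTERN.match(token)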
2 changes: 1 addition & 1 deletion wordview/mwes/am.py
@@ -1,5 +1,5 @@
from typing import Dict, List
import math
from typing import Dict, List


def calculate_pmi(
110 changes: 98 additions & 12 deletions wordview/mwes/mwe.py
@@ -1,13 +1,16 @@
import json
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Union
from typing import Dict, List, Optional, Union

import pandas
import tqdm
from nltk import word_tokenize

from wordview import logger
from wordview.mwes.am import calculate_am
from wordview.mwes.mwe_utils import get_counts
from wordview.mwes.mwe_utils import get_pos_tags, is_alphanumeric_latinscript_multigram


class MWE(object):
@@ -19,7 +22,7 @@ def __init__(
self,
df: pandas.DataFrame,
text_column: str,
mwe_types: List[str] = ["NC"],
mwe_types: list[str] = ["NC"],
tokenize=False,
) -> None:
"""Initialize a new MWE object with the given df, text_column and mwe_types.
@@ -35,6 +38,10 @@ def __init__(
"""
self.df = df
self.text_column = text_column
if not mwe_types:
raise ValueError("mwe_types is empty.")
if not isinstance(mwe_types, list):
raise TypeError("mwe_types is not a list.")
for mt in mwe_types:
if mt not in ["NC", "JNC"]:
raise ValueError(f"{mt} type is not recognized.")
@@ -84,7 +91,7 @@ def _check_tokenized(self) -> None:
f"It seems that the content of {self.text_column} in the input data frame is not (fully) tokenized.\nThis can lead to poor results. Consider re-instantiating your MWE instance with 'tokenize' flag set to True.\nNote that this might lead to a slower instantiation."
)

def build_counts(self, counts_filename: str = "") -> Union[None, Dict]: # type: ignore
def build_counts(self, counts_filename: Optional[str] = None) -> Optional[Dict]:
"""Create various count files to be used by downstream methods
by calling wordview.mwes.mwe_utils.
@@ -95,9 +102,7 @@ def build_counts(self, counts_filename: str = "") -> Union[None, Dict]: # type:
None when no counts_filename is provided, otherwise res which is a dictionary of counts.
"""
logger.info("Creating counts...")
res = get_counts(
df=self.df, text_column=self.text_column, mwe_types=self.mwe_types
)
res = self.get_counts()
if not counts_filename:
return res
else:
@@ -107,14 +112,15 @@ def build_counts(self, counts_filename: str = "") -> Union[None, Dict]: # type:
except Exception as e:
logger.error(e)
raise e
return None

def extract_mwes(
self,
am: str = "pmi",
mwes_filename: str = "",
counts_filename: str = "",
counts: Dict = {},
) -> Dict:
mwes_filename: Optional[str] = None,
counts_filename: Optional[str] = None,
counts: Optional[dict] = None,
) -> dict:
"""
Extract MWEs from counts_filename with respect to the association measure specified by `am`.
@@ -128,7 +134,7 @@
"""
if counts:
count_data = counts
else:
elif counts_filename is not None:
try:
with open(counts_filename, "r") as file:
count_data = json.load(file)
@@ -138,6 +144,8 @@
"Counts must be provided either via input argument `counts` or `counts_filename`. Argument `counts` is not specified and it seems like there was an error reading the counts from `counts_filename`."
)
raise e
else:
raise ValueError("Either 'counts' or 'counts_filename' must be provided.")

logger.info(f"Extracting {self.mwe_types} based on {am}")
mwe_am_dict = calculate_am(
@@ -154,3 +162,81 @@
return mwe_am_dict
else:
return mwe_am_dict

def get_counts(self) -> Dict:
"""Read a corpus in pandas.DataFrame format and generates all counts necessary for calculating AMs.
Args:
None
Returns:
res: Dictionary of mwe_types to dictionary of individual mwe within that type and their count.
E.g. {'NC':{'climate change': 10, 'brain drain': 3}, 'JNC': {'black sheep': 3, 'red flag': 2}}
"""
res: Dict = {}
for mt in self.mwe_types:
res[mt] = {}
res["WORDS"] = {}
for sent in tqdm.tqdm(self.df[self.text_column]):
tokens = sent.split(" ")
word_count_dict = Counter(tokens)
for k, v in word_count_dict.items():
if k in res["WORDS"]:
res["WORDS"][k] += v
else:
res["WORDS"][k] = v
for mt in self.mwe_types:
mwes_count_dic = self.extract_mwes_from_sent(tokens, mwe_type=mt)
for k, v in mwes_count_dic.items():
if k in res[mt]:
res[mt][k] += v
else:
res[mt][k] = v
return res

def extract_mwes_from_sent(self, tokens: list[str], mwe_type: str) -> Dict:
"""Extract two-word noun compounds from tokenized input.
Args:
tokens: A tokenized sentence, i.e. list of tokens.
mwe_type: Type of MWE. Any of ['NC', 'JNC'].
Returns:
mwes_count_dic: Dictionary of compounds to their count.
"""
if not isinstance(tokens, list):
raise TypeError(
f'Input argument "tokens" must be a list of strings. Currently it is of type {type(tokens)} \
with a value of: {tokens}.'
)
if len(tokens) == 0:
return {}
mwes = []
postag_tokens: list[tuple[str, str]] = get_pos_tags(tokens)
w1_pos_tags = []
w2_pos_tags = []
if mwe_type == "NC":
w1_pos_tags = ["NN", "NNS"]
w2_pos_tags = ["NN", "NNS"]
elif mwe_type == "JNC":
w1_pos_tags = ["JJ"]
w2_pos_tags = ["NN", "NNS"]
for i in range(len(postag_tokens) - 1):
w1 = postag_tokens[i]
if w1[1] not in w1_pos_tags:
continue
else:
w2 = postag_tokens[i + 1]
if not is_alphanumeric_latinscript_multigram(
w1[0]
) or not is_alphanumeric_latinscript_multigram(w2[0]):
continue
if w2[1] in w2_pos_tags:
if i + 2 < len(postag_tokens):
w3 = postag_tokens[i + 2]
if w3[1] not in ["NN", "NNS"]:
mwes.append(w1[0] + " " + w2[0])
else:
mwes.append(w1[0] + " " + w2[0])
mwes_count_dic = Counter(mwes)
return mwes_count_dic
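
Putting the refactored pieces together, a sketch of the end-to-end workflow based on the signatures above; file names are illustrative, and the required NLTK resources are assumed to be available.

import pandas as pd

from wordview.mwes.mwe import MWE

corpus = pd.DataFrame(
    data=["climate change is a pressing issue"],
    columns=["text"],
)
mwe = MWE(df=corpus, text_column="text", tokenize=True, mwe_types=["NC"])

# Either keep the counts in memory...
counts = mwe.build_counts()
mwes_by_pmi = mwe.extract_mwes(am="pmi", counts=counts)

# ...or persist them to disk and read them back later.
mwe.build_counts(counts_filename="counts.json")
mwes_by_pmi = mwe.extract_mwes(am="pmi", counts_filename="counts.json")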