Merge pull request #38 from leobeeson/refactor-mwe-module
Refactor mwe module
meghdadFar committed May 27, 2023
2 parents 283995f + 2af5c78 commit f5bf642
Showing 10 changed files with 1,301 additions and 1,123 deletions.
12 changes: 12 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,12 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Pytest",
"type": "python",
"request": "launch",
"module": "pytest",
"console": "integratedTerminal"
}
]
}
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,8 @@
{
"esbonio.sphinx.confDir": ""
"esbonio.sphinx.confDir": "",
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
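
These VS Code settings wire the editor's test explorer and debugger to pytest. For reference, a minimal Python sketch of the equivalent programmatic invocation, assuming pytest is installed in the project environment:

# Programmatic equivalent of the "module": "pytest" launch configuration,
# pointed at the "tests" directory from python.testing.pytestArgs.
import sys

import pytest

if __name__ == "__main__":
    # pytest.main returns an exit code (0 on success).
    sys.exit(pytest.main(["tests"]))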
2,031 changes: 1,006 additions & 1,025 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ pytest = ">=7.1"
pytest-cov = ">=3.0.0"
ipython = ">=8.4.0"
sphinx = ">=v6.1.3"
sphinx-rtd-theme= "1.2.0"
sphinx-rtd-theme= "1.2.1"

[tool.poetry.scripts]
script_download = "wordview.bin.downloads:download_nltk_req"
File renamed without changes.
86 changes: 86 additions & 0 deletions tests/mwe/test_mwe.py
@@ -0,0 +1,86 @@
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd
from wordview.mwes.mwe import MWE


@pytest.fixture
def dummy_text_pandas():
text = [
"new york is the capital of new york state",
"new york is called the big apple",
"i am going to the big apple",
]
return pd.DataFrame(data = text, columns = ["text"])


@pytest.fixture
def dummy_text_pandas_with_a_noun_compund():
text = [
"this sentence has a noun compound",
]
return pd.DataFrame(data = text, columns = ["text"])


dummy_pos_tags_with_noun_compund = [("this", "XXX"),("sentence", "XXX"),("has", "XXX"),("a", "XXX"),("noun", "NN"),("compound", "NN")]


@pytest.fixture
def dummy_text_pandas_with_no_noun_compund():
text = [
"no sequence of nouns in this one",
]
return pd.DataFrame(data = text, columns = ["text"])


dummy_pos_tags_without_noun_compund = [("no", "XXX"),("sequence", "XXX"),("of", "XXX"),("nouns", "XXX"),("in", "XXX"),("this", "XXX"),("one", "XXX")]


class TestMweInitialisation:

def test_mwe_does_not_tokenize_text_with_multiple_whitespaces(self, dummy_text_pandas):
mwe = MWE(df = dummy_text_pandas, text_column = "text", tokenize=False)
assert mwe.df["text"][0] == "new york is the capital of new york state"


def test_mwe_tokenizes_text_with_multiple_whitespaces(self, dummy_text_pandas):
mwe = MWE(df = dummy_text_pandas, text_column = "text", tokenize=True)
assert mwe.df["text"][0] == "new york is the capital of new york state"


def test_mwe_with_wrong_mwe_type_raises_value_error(self, dummy_text_pandas):
with pytest.raises(ValueError):
mwe = MWE(df = dummy_text_pandas, text_column = "text", mwe_types = ["XXX"])


def test_mwe_with_empty_mwe_type_raises_value_error(self, dummy_text_pandas):
with pytest.raises(ValueError):
mwe = MWE(df = dummy_text_pandas, text_column = "text", mwe_types = [])


def test_mwe_with_non_list_mwe_type_raises_type_error(self, dummy_text_pandas):
with pytest.raises(TypeError):
mwe = MWE(df = dummy_text_pandas, text_column = "text", mwe_types = "NC")


class TestMweCounter:

@patch("wordview.mwes.mwe.get_pos_tags", MagicMock(return_value = dummy_pos_tags_with_noun_compund))
def test_mwe_if_nc_present_returns_counts_with_nc(self, dummy_text_pandas_with_a_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_a_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
counts = mwe.get_counts()
assert counts["NC"] == {"noun compound": 1}


@patch("wordview.mwes.mwe.get_pos_tags", MagicMock(return_value=dummy_pos_tags_without_noun_compund))
def test_mwe_if_no_nc_returns_empty_mwe_counts(self, dummy_text_pandas_with_no_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_no_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
counts = mwe.get_counts()
assert counts["NC"] == {}


@pytest.mark.xfail
def test_mwe_build_counts(dummy_text_pandas_with_no_noun_compund):
mwe = MWE(df = dummy_text_pandas_with_no_noun_compund, text_column = "text", tokenize=True, mwe_types = ["NC"])
counts = mwe.get_counts()
assert False
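
For orientation, a minimal usage sketch of the MWE API these tests exercise. The sample sentences and the counts shown in the comments are illustrative assumptions, not repository fixtures, and the NLTK resources that wordview's POS tagging relies on are assumed to be installed.

import pandas as pd

from wordview.mwes.mwe import MWE

corpus = pd.DataFrame(
    data=[
        "climate change is a pressing issue",
        "brain drain affects many countries",
    ],
    columns=["text"],
)

# Instantiate with tokenization enabled and noun-compound extraction ("NC").
mwe = MWE(df=corpus, text_column="text", tokenize=True, mwe_types=["NC"])

# get_counts() returns a dict keyed by MWE type plus a "WORDS" entry with
# per-token counts, e.g. {"NC": {"climate change": 1, ...}, "WORDS": {...}}.
counts = mwe.get_counts()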
84 changes: 83 additions & 1 deletion tests/mwe_utils/test_mwe_utils.py
@@ -1,5 +1,5 @@
import pytest
from wordview.mwes.mwe_utils import get_ngrams
from wordview.mwes.mwe_utils import get_ngrams, is_alphanumeric_latinscript_multigram


def test_get_ngrams_int_input():
@@ -15,3 +15,85 @@ def test_get_ngrams_int_list_input():
def test_get_ngrams_mixed_list_input():
with pytest.raises(TypeError):
get_ngrams(sentence=[1, 2, 'test'], n=2)


class TestIsAlphanumericLatinscriptMultigram:

def test_is_alphanumeric_latinscript_multigram_matches_alphabetic_bigram(self):
match = is_alphanumeric_latinscript_multigram("ab")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_numeric_bigram(self):
match = is_alphanumeric_latinscript_multigram("01")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_alphanumeric_bigram(self):
match = is_alphanumeric_latinscript_multigram("a0")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_numeralphabetic_bigram(self):
match = is_alphanumeric_latinscript_multigram("0a")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_matches_largest_alphanumeric_unique_letter_ngram(self):
match = is_alphanumeric_latinscript_multigram("abcdefghijklmnopqrstuvwxyz0123456789")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_does_not_match_alphabetical_unigram(self):
match = is_alphanumeric_latinscript_multigram("a")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_numeric_unigram(self):
match = is_alphanumeric_latinscript_multigram("0")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_symbol_unigram(self):
match = is_alphanumeric_latinscript_multigram("%")
assert match is None


def test_is_alphanumeric_latinscript_multigram_matches_ngram_with_symbol_after_second_char(self):
match = is_alphanumeric_latinscript_multigram("abcd$efg")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_symbol_before_third_char(self):
match = is_alphanumeric_latinscript_multigram("a$bcdefg")
assert match is None


def test_is_alphanumeric_latinscript_multigram_matches_ngram_with_hyphen_after_second_char(self):
match = is_alphanumeric_latinscript_multigram("abcd-efg")
assert match is not None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_hyphen_before_third_char(self):
match = is_alphanumeric_latinscript_multigram("a-bcdefg")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_greek_characters(self):
match = is_alphanumeric_latinscript_multigram("αβγ")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_hebrew_characters(self):
match = is_alphanumeric_latinscript_multigram("אָלֶף")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_arabic_characters(self):
match = is_alphanumeric_latinscript_multigram("أَلِف")
assert match is None


def test_is_alphanumeric_latinscript_multigram_does_not_match_ngram_with_russian_cyrillic_characters(self):
match = is_alphanumeric_latinscript_multigram("азъ")
assert match is None
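
Taken together, these tests pin down the contract of is_alphanumeric_latinscript_multigram: the first two characters must be Latin alphanumerics, a small set of symbols such as '$' and '-' is tolerated only from the third character onwards, and non-Latin scripts never match. A hypothetical reconstruction consistent with the tests (the actual pattern in wordview.mwes.mwe_utils may differ) could look like:

import re
from typing import Optional

# Assumed pattern: at least two leading Latin alphanumerics, then Latin
# alphanumerics or a few symbols such as '$' and '-'. Illustrative only.
_MULTIGRAM_PATTERN = re.compile(r"^[a-zA-Z0-9]{2}[a-zA-Z0-9$\-]*$")


def is_alphanumeric_latinscript_multigram(token: str) -> Optional[re.Match]:
    """Return a match for Latin-script alphanumeric n-grams (n >= 2), else None."""
    return _MULTIGRAM_PATTERN.match(token)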
2 changes: 1 addition & 1 deletion wordview/mwes/am.py
@@ -1,5 +1,5 @@
from typing import Dict, List
import math
from typing import Dict, List


def calculate_pmi(
110 changes: 98 additions & 12 deletions wordview/mwes/mwe.py
@@ -1,13 +1,16 @@
import json
import re
from collections import Counter
from pathlib import Path
from typing import Dict, List, Union
from typing import Dict, List, Optional, Union

import pandas
import tqdm
from nltk import word_tokenize

from wordview import logger
from wordview.mwes.am import calculate_am
from wordview.mwes.mwe_utils import get_counts
from wordview.mwes.mwe_utils import get_pos_tags, is_alphanumeric_latinscript_multigram


class MWE(object):
@@ -19,7 +22,7 @@ def __init__(
self,
df: pandas.DataFrame,
text_column: str,
mwe_types: List[str] = ["NC"],
mwe_types: list[str] = ["NC"],
tokenize=False,
) -> None:
"""Initialize a new MWE object with the given df, text_column and mwe_types.
@@ -35,6 +38,10 @@ def __init__(
"""
self.df = df
self.text_column = text_column
if not mwe_types:
raise ValueError("mwe_types is empty.")
if not isinstance(mwe_types, list):
raise TypeError("mwe_types is not a list.")
for mt in mwe_types:
if mt not in ["NC", "JNC"]:
raise ValueError(f"{mt} type is not recognized.")
@@ -84,7 +91,7 @@ def _check_tokenized(self) -> None:
f"It seems that the content of {self.text_column} in the input data frame is not (fully) tokenized.\nThis can lead to poor results. Consider re-instantiating your MWE instance with 'tokenize' flag set to True.\nNote that this might lead to a slower instantiation."
)

def build_counts(self, counts_filename: str = "") -> Union[None, Dict]: # type: ignore
def build_counts(self, counts_filename: Optional[str] = None) -> Optional[Dict]:
"""Create various count files to be used by downstream methods
by calling wordview.mwes.mwe_utils.
@@ -95,9 +102,7 @@ def build_counts(self, counts_filename: str = "") -> Union[None, Dict]: # type:
None when no counts_filename is provided, otherwise res which is a dictionary of counts.
"""
logger.info("Creating counts...")
res = get_counts(
df=self.df, text_column=self.text_column, mwe_types=self.mwe_types
)
res = self.get_counts()
if not counts_filename:
return res
else:
@@ -107,14 +112,15 @@ def build_counts(self, counts_filename: str = "") -> Union[None, Dict]: # type:
except Exception as e:
logger.error(e)
raise e
return None

def extract_mwes(
self,
am: str = "pmi",
mwes_filename: str = "",
counts_filename: str = "",
counts: Dict = {},
) -> Dict:
mwes_filename: Optional[str] = None,
counts_filename: Optional[str] = None,
counts: Optional[dict] = None,
) -> dict:
"""
Extract MWEs from counts_filename with respect to the association measure specified by `am`.
@@ -128,7 +134,7 @@
"""
if counts:
count_data = counts
else:
elif counts_filename is not None:
try:
with open(counts_filename, "r") as file:
count_data = json.load(file)
@@ -138,6 +144,8 @@
"Counts must be provided either via input argument `counts` or `counts_filename`. Argument `counts` is not specified and it seems like there was an error reading the counts from `counts_filename`."
)
raise e
else:
raise ValueError("Either 'counts' or 'counts_filename' must be provided.")

logger.info(f"Extracting {self.mwe_types} based on {am}")
mwe_am_dict = calculate_am(
@@ -154,3 +162,81 @@
return mwe_am_dict
else:
return mwe_am_dict

def get_counts(self) -> Dict:
"""Read a corpus in pandas.DataFrame format and generates all counts necessary for calculating AMs.
Args:
None
Returns:
res: Dictionary of mwe_types to dictionary of individual mwe within that type and their count.
E.g. {'NC':{'climate change': 10, 'brain drain': 3}, 'JNC': {'black sheep': 3, 'red flag': 2}}
"""
res: Dict = {}
for mt in self.mwe_types:
res[mt] = {}
res["WORDS"] = {}
for sent in tqdm.tqdm(self.df[self.text_column]):
tokens = sent.split(" ")
word_count_dict = Counter(tokens)
for k, v in word_count_dict.items():
if k in res["WORDS"]:
res["WORDS"][k] += v
else:
res["WORDS"][k] = v
for mt in self.mwe_types:
mwes_count_dic = self.extract_mwes_from_sent(tokens, mwe_type=mt)
for k, v in mwes_count_dic.items():
if k in res[mt]:
res[mt][k] += v
else:
res[mt][k] = v
return res

def extract_mwes_from_sent(self, tokens: list[str], mwe_type: str) -> Dict:
"""Extract two-word noun compounds from tokenized input.
Args:
tokens: A tokenized sentence, i.e. list of tokens.
mwe_type: Type of MWE. Any of ['NC', 'JNC'].
Returns:
mwes_count_dic: Dictionary of compounds to their count.
"""
if not isinstance(tokens, list):
raise TypeError(
f'Input argument "tokens" must be a list of strings. Currently it is of type {type(tokens)} \
with a value of: {tokens}.'
)
if len(tokens) == 0:
return {}
mwes = []
postag_tokens: list[tuple[str, str]] = get_pos_tags(tokens)
w1_pos_tags = []
w2_pos_tags = []
if mwe_type == "NC":
w1_pos_tags = ["NN", "NNS"]
w2_pos_tags = ["NN", "NNS"]
elif mwe_type == "JNC":
w1_pos_tags = ["JJ"]
w2_pos_tags = ["NN", "NNS"]
for i in range(len(postag_tokens) - 1):
w1 = postag_tokens[i]
if w1[1] not in w1_pos_tags:
continue
else:
w2 = postag_tokens[i + 1]
if not is_alphanumeric_latinscript_multigram(
w1[0]
) or not is_alphanumeric_latinscript_multigram(w2[0]):
continue
if w2[1] in w2_pos_tags:
if i + 2 < len(postag_tokens):
w3 = postag_tokens[i + 2]
if w3[1] not in ["NN", "NNS"]:
mwes.append(w1[0] + " " + w2[0])
else:
mwes.append(w1[0] + " " + w2[0])
mwes_count_dic = Counter(mwes)
return mwes_count_dic
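
Putting the refactored pieces together, a sketch of the end-to-end workflow based on the signatures above; file names are illustrative, and the required NLTK resources are assumed to be available.

import pandas as pd

from wordview.mwes.mwe import MWE

corpus = pd.DataFrame(
    data=["climate change is a pressing issue"],
    columns=["text"],
)
mwe = MWE(df=corpus, text_column="text", tokenize=True, mwe_types=["NC"])

# Either keep the counts in memory...
counts = mwe.build_counts()
mwes_by_pmi = mwe.extract_mwes(am="pmi", counts=counts)

# ...or persist them to disk and read them back later.
mwe.build_counts(counts_filename="counts.json")
mwes_by_pmi = mwe.extract_mwes(am="pmi", counts_filename="counts.json")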