Import libraries

In [1]:
import json
import os
import re
import zipfile

import pandas as pd

Define Regex patterns

In [2]:
# Pattern 1: an "in" tag followed by one or more "nn" tags.
# Pattern 2: a "jj" tag followed by one or more "nn" tags.
#
# Earlier attempt: r"\bin\b\s+(\bnn\b\s+)+" / r"\bjj\b\s+(\bnn\b\s+)+".
# That failed because a leading '\b' (word boundary) also matches right after a
# hyphen, so a compound tag such as "in-jj nn" was matched although it should
# not be. The negative lookbehind below only lets the first tag start at the
# beginning of the string or directly after whitespace.
#
# NOTE: every "nn" must be followed by whitespace (the trailing `\s+`), so a
# run of "nn" tags that ends the tag string without trailing whitespace is not
# matched.
PATTERN_1 = r"(?<!\S|-)in\b\s+(\bnn\b\s+)+"
PATTERN_2 = r"(?<!\S|-)jj\b\s+(\bnn\b\s+)+"

# Used to build the names of the generated output files.
EMAIL = "omkarkabde@gmail.com"

# Pattern name -> regex; the names are also used in the output file names.
patterns = dict(pattern_1=PATTERN_1, pattern_2=PATTERN_2)

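We match the regex against the tags only, and each match gives us `match.span()`, i.e. character offsets into the tag string. To turn those into offsets in the sentence, we compute the start position of every tag and of every word separately and store them in a mapping dictionary (tag offset → word offset).  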

For example -   
Sentence = "She/pps looked/vbd at/in me/ppo provocatively/rb ."  
Words = "She looked at me provocatively"  
Tags = "pps vbd in ppo rb"  

```python
{
    0: 0,    # "pps" -> "She"
    4: 4,    # "vbd" -> "looked"
    8: 11,   # "in"  -> "at"
    11: 14,  # "ppo" -> "me"
    15: 17,  # "rb"  -> "provocatively"
    18: 31   # final entry: length + 1 of the tag string and of the word string
}
```

This helps us get the `begin` and `end` of the text, not just tags

In [3]:
def create_match_dict(words, tags):
    """Map tag offsets to word offsets.

    Each key is the running character offset of a tag and each value is the
    running character offset of the word paired with it, where every token is
    counted as its length plus one separator character.

    Args:
        words: Sequence of word strings.
        tags: Sequence of tag strings, paired with ``words`` by position.

    Returns:
        dict[int, int]: ``{tag_offset: word_offset}``. It always contains
        ``0: 0`` and one further entry per (word, tag) pair; the last entry is
        the total length plus one of each side.
    """
    offsets = {0: 0}
    tag_pos = word_pos = 0

    for word, tag in zip(words, tags):
        # "+ 1" accounts for the single separator character after each token.
        tag_pos, word_pos = tag_pos + len(tag) + 1, word_pos + len(word) + 1
        offsets[tag_pos] = word_pos

    return offsets

This function creates the entry for a single row of the dataset.  
First it splits the sentence into words and tags and joins each list back into a words text and a tags text.  
Then it builds the `match_dict` and finds the matches of the given regex pattern in the tags text.  


In [4]:
def create_entry(row, pattern):
    """Create the output entry for one dataset row and one tag pattern.

    Args:
        row: A row object (e.g. a pandas Series) that has a ``"raw_text"``
            field of space-separated ``word/tag`` tokens and supports
            ``to_dict()``.
        pattern: Regular expression searched for in the space-joined tag
            string.

    Returns:
        dict: The row's fields without ``"raw_text"``, plus ``"sent_text"``
        (the space-joined words) and ``"phrases"``, a list with one dict per
        match holding ``"begin"``, ``"end"``, ``"text"`` and
        ``"phrase_type"``. ``begin``/``end`` are offsets into ``sent_text``
        such that ``sent_text[begin:end] == text``.

    Raises:
        ValueError: If a token contains no ``/`` separator.
        KeyError: If a match does not start and end on a tag boundary.
    """
    text = row["raw_text"]

    # Separate every token into its word and its tag.
    words, tags = [], []
    for token in text.split(" "):
        # The tag follows the LAST "/", so a word that itself contains "/"
        # (e.g. "1/2/cd") is kept whole instead of being cut at the first "/".
        word, sep, tag = token.rpartition("/")
        if not sep:
            raise ValueError(f"token {token!r} is not in word/tag form")
        words.append(word)
        tags.append(tag)

    # Create separate text sentences.
    tags_text, words_text = " ".join(tags), " ".join(words)

    # Initialize the entry with the row data.
    entry = row.to_dict()
    entry.pop("raw_text")
    entry["sent_text"] = words_text

    match_dict = create_match_dict(words, tags)

    phrases = []
    for match in re.finditer(pattern, tags_text):
        # Drop the whitespace the pattern consumed after the last tag.
        matched_tags = match.group().rstrip()
        tag_begin = match.start()
        # Offset of the tag that follows the match. Deriving it from the
        # stripped match keeps it a valid key whether or not the pattern
        # consumed trailing whitespace.
        next_tag_begin = tag_begin + len(matched_tags) + 1

        # Map the tag positions to word positions.
        begin = match_dict[tag_begin]
        # match_dict points at the start of the next word; step back over the
        # separating space so that `end` is the exclusive end of the phrase.
        end = match_dict[next_tag_begin] - 1

        phrases.append(
            {
                "begin": begin,
                "end": end,
                "text": words_text[begin:end],
                "phrase_type": matched_tags,
            }
        )
    entry["phrases"] = phrases

    return entry

This is just a basic function to create the jsons and zips from the dicts

In [5]:
def create_zip(pattern_dict, filename):
    """Write ``pattern_dict`` to ``<filename>.json`` and zip that file.

    Args:
        pattern_dict: JSON-serializable object to dump.
        filename: Output path without extension; ``.json`` and ``.zip`` are
            appended to it.
    """
    json_path = f"{filename}.json"
    zip_path = f"{filename}.zip"

    # Keep non-ASCII characters readable in the file instead of \u-escaping.
    serialized = json.dumps(pattern_dict, indent=4, ensure_ascii=False)
    with open(json_path, "w", encoding="utf-8") as json_file:
        json_file.write(serialized)

    with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        # Store the JSON under its bare file name, without directory parts.
        archive.write(json_path, arcname=os.path.basename(json_path))

    print(f"Created {zip_path} containing {json_path}")


def create_script_zip():
    """Zip the notebook file ``task_b.ipynb`` into ``<EMAIL>.zip``.

    The notebook file itself is left in place.
    """
    notebook = "task_b.ipynb"
    archive_name = f"{EMAIL}.zip"

    with zipfile.ZipFile(archive_name, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        # Store the notebook under its bare file name.
        archive.write(notebook, arcname=os.path.basename(notebook))

    print(f"Created {archive_name} containing {notebook}")

This is the main function: it keeps every entry that has at least one matching phrase, collects those entries into the JSON structure for the pattern, and then creates the zip.

In [6]:
def check_pattern(pattern_name, dataset_path="dataset_B.csv"):
    """Collect all sentences matching a pattern and write them to a zip file.

    Args:
        pattern_name: Key into the module-level ``patterns`` dict.
        dataset_path: CSV file to read; each row is passed to
            ``create_entry``. Defaults to ``"dataset_B.csv"``.

    Returns:
        dict: ``{"pattern": <name with "_" replaced by " ">, "sents": [...]}``
        where ``sents`` holds the entries that have at least one phrase. The
        same dict is written to ``<pattern_name>_<EMAIL>.json`` / ``.zip``.

    Raises:
        KeyError: If ``pattern_name`` is not a key of ``patterns``.
    """
    # Resolve the regex first so an unknown name fails before any file I/O.
    pattern = patterns[pattern_name]
    df = pd.read_csv(dataset_path)

    sents = []
    for idx, row in df.iterrows():
        try:
            entry = create_entry(row, pattern)
        except Exception as e:
            # Best-effort: report the offending row and carry on with the rest.
            print(f"ERROR --- {idx} --- {e}")
            continue
        # Only sentences with at least one matching phrase are kept.
        if entry["phrases"]:
            sents.append(entry)

    pattern_dict = {
        "pattern": pattern_name.replace("_", " "),
        "sents": sents,
    }

    filename = f"{pattern_name}_{EMAIL}"
    create_zip(pattern_dict, filename)
    return pattern_dict

Run this function to get the json files.

In [7]:
# Produce one JSON + zip per configured pattern (iterates the keys of `patterns`).
for pattern_name in patterns:
    check_pattern(pattern_name)

# Finally, bundle the notebook file itself into its own archive.
create_script_zip()
print("All patterns processed.")


Created pattern_1_omkarkabde@gmail.com.zip containing pattern_1_omkarkabde@gmail.com.json
Created pattern_2_omkarkabde@gmail.com.zip containing pattern_2_omkarkabde@gmail.com.json
Created omkarkabde@gmail.com.zip containing task_b.ipynb
All patterns processed.
