In [5]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import re

import spacy

In [2]:
# ! python -m spacy download en_core_web_sm

In [3]:
nlp = spacy.load("en_core_web_sm")

#Create an EntityRuler for some of the names
nameRuler = nlp.add_pipe("entity_ruler")
patterns = [
    {"label": "PERSON", "pattern": [{"lower": 'delainey'}]},
    {"label": "PERSON", "pattern": [{"lower": 'jmf'}]},
    {"label": "PERSON", "pattern": [{"lower": 'dave'}]},
    {"label": "PERSON", "pattern": [{"lower": 'forney'}]},
    {"label": "PERSON", "pattern": [{"lower": 'lloyd'}]},
    {"label": "PERSON", "pattern": [{"lower": 'phillip'}]},
    {"label": "PERSON", "pattern": [{"lower": 'tj'}]},
    {"label": "ORG", "pattern": [{"lower": 'ercot'}]},
]
nameRuler.add_patterns(patterns)

In [4]:
ent_replacements = {
    'PERSON': "Steve",
    'ORG': "Apple",
    'GPE': "Cupertino",
}

def change_ents(doc, ent_replacements):
    # find all the ents
    regex_ent_replacements = {key: "" for key in ent_replacements}
    for ent in doc.ents:
        if ent.label_ in ent_replacements:
            text = re.sub("\(.*|\).*|\+.*", "", ent.text)
            regex_ent_replacements[ent.label_] = f"{regex_ent_replacements[ent.label_]}|{text}"
    regex_ent_replacements = {key: regex_ent_replacements[key][1:] for key in regex_ent_replacements}

    # replace all the ends
    new_text = doc.text
    for ent_label in regex_ent_replacements:
        if regex_ent_replacements[ent_label] != "":
            try:
                new_text = re.sub(regex_ent_replacements[ent_label], ent_replacements[ent_label], new_text)
            except:
                continue
    new_text = re.sub("[ \n\t]+", " ", new_text)
    
    return new_text

In [6]:
email_file = "processed_emails.csv"

df = pd.read_csv(f"../../data/{email_file}", index_col='Original Index')
# token_list = [token for token in doc if " " not in token.text]

replaced_emails = [change_ents(doc, ent_replacements) for doc in nlp.pipe(tqdm(df['Email']))]

df['Classify Email'] = replaced_emails

df.to_csv(email_file)

  0%|          | 0/46777 [00:00<?, ?it/s]

ValueError: [E088] Text of length 1621936 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.

In [None]:
email_file = "exec_emails.csv"

df = pd.read_csv(f"../data/{email_file}", index_col='Original Index')
# token_list = [token for token in doc if " " not in token.text]

replaced_emails = [change_ents(doc, ent_replacements) for doc in nlp.pipe(df['Email'])]

df['Classify Email'] = replaced_emails

df.to_csv(email_file)

In [None]:
email_file = "poi_emails.csv"

df = pd.read_csv(f"../data/{email_file}", index_col='Original Index')
# token_list = [token for token in doc if " " not in token.text]

replaced_emails = [change_ents(doc, ent_replacements) for doc in nlp.pipe(df['Email'])]

df['Classify Email'] = replaced_emails

df.to_csv(email_file)

In [None]:
replaced_emails = [change_ents(doc, ent_replacements) for doc in nlp.pipe(df['Email'])]

In [None]:
df['Classify Email'] = replaced_emails

In [None]:
df

In [None]:
# name = 'Skilling'
# for i in df[df['Sender'] == name]['Classify Email'].sample(10):
#     print(i)
#     print()

In [None]:
df.to_csv(email_file)