In [3]:
import sys
import os
import re
import pprint

In [14]:
# For Part 1, enhance the email and phone regex patterns to capture
# more examples of obscured formats from the text.
# For Part 3, you might need to introduce new patterns for specific cases.

# Regular expressions for .edu email addresses.
#
# Every entry must expose exactly two capturing groups:
#   group 1 -> the "name" part (text before the @)
#   group 2 -> the domain part (text after the @, without the trailing .edu)
email_patterns = [
    # Plain address: name@domain.edu
    r'([A-Za-z.]+)@([A-Za-z.]+)\.edu',
    # Exactly one whitespace character on each side of the @
    r'([A-Za-z.]+)\s@\s([A-Za-z.]+)\.edu',
    # "@" or the word "at", then "." or the word "dot" before "edu" in any
    # letter case; whitespace around each separator is optional
    r'([A-Za-z.]+)\s*(?:@|at)\s*([A-Za-z.]+)\s*(?:\.|dot)\s*[Ee][Dd][Uu]',
    # Optional literal "<del>" directly after the name
    r'([A-Za-z.]+)(?:<del>)?\s*(?:@|at)\s*([A-Za-z.]+)\.edu',
    # Additionally accepts the HTML entity "&#x40;" in place of the @
    r'([A-Za-z.]+)\s*(?:@|at|&#x40;)\s*([A-Za-z.]+)\.edu',
]

# Regular expressions for phone numbers.
#
# Every entry exposes three capturing groups of 3, 3 and 4 digits.
phone_patterns = [
    # Dash-separated digits: ddd-ddd-dddd
    r'(\d{3})-(\d{3})-(\d{4})',
    # Optional parentheses around the first group; each separator is an
    # optional single space, dash or dot
    r'\(?(\d{3})\)?[ \-\.]?(\d{3})[ \-\.]?(\d{4})',
]


In [5]:
def extract_contacts(filename, file_object, email_pats=None, phone_pats=None):
    """Scan text line by line and collect every email and phone match.

    Args:
        filename: Label stored as the first element of every result tuple.
        file_object: Iterable yielding text lines (for example an open file).
        email_pats: Optional iterable of regexes with two capturing groups
            (name, domain). Defaults to the module-level ``email_patterns``.
        phone_pats: Optional iterable of regexes with three capturing groups.
            Defaults to the module-level ``phone_patterns``.

    Returns:
        A list of ``(filename, 'e', 'name@domain.edu')`` and
        ``(filename, 'p', 'ddd-ddd-dddd')`` tuples, in the order they were
        found. The same contact can appear more than once when several
        patterns match the same text.
    """
    # Fall back to the notebook-level pattern lists; materialise as lists so a
    # one-shot iterable is not exhausted after the first line.
    email_pats = list(email_patterns if email_pats is None else email_pats)
    phone_pats = list(phone_patterns if phone_pats is None else phone_pats)

    results = []
    for line in file_object:
        # Emails first, then phones, preserving the per-line ordering.
        for email_pat in email_pats:
            for match in re.findall(email_pat, line):
                email = '{}@{}.edu'.format(match[0], match[1])
                results.append((filename, 'e', email))

        for phone_pat in phone_pats:
            for match in re.findall(phone_pat, line):
                phone = '{}-{}-{}'.format(match[0], match[1], match[2])
                results.append((filename, 'p', phone))
    return results

In [6]:
#Processes all files in a given directory and extracts email and phone numbers.
#Returns a list of guesses and the filenames processed.

def process_directory(data_directory):
    """Extract contacts from every regular, non-hidden file in a directory.

    Args:
        data_directory: Path of the directory whose files are scanned.

    Returns:
        A ``(guesses, filenames)`` pair: the concatenated results of
        ``extract_contacts`` for each processed file, and the names of the
        files that were processed (sorted by name).
    """
    guesses = []
    filenames = []

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(data_directory)):
        if filename.startswith('.'):
            continue
        file_path = os.path.join(data_directory, filename)
        # Sub-directories and other non-regular entries cannot be opened as
        # text files, so skip them instead of crashing in open().
        if not os.path.isfile(file_path):
            continue
        filenames.append(filename)
        with open(file_path, 'r', encoding='latin-1') as file:
            guesses.extend(extract_contacts(filename, file))
    return guesses, filenames

In [7]:
def load_gold(gold_path):
    """Read the tab-separated gold-standard file.

    Args:
        gold_path: Path of the gold file, decoded as latin-1.

    Returns:
        A list with one tuple per non-blank line, holding that line's
        tab-separated fields after surrounding whitespace is stripped.
    """
    gold_data = []
    with open(gold_path, 'r', encoding='latin-1') as gold_file:
        for line in gold_file:
            stripped = line.strip()
            # A blank line would otherwise become ('',) and break consumers
            # that unpack each entry into three fields.
            if not stripped:
                continue
            gold_data.append(tuple(stripped.split('\t')))
    return gold_data

In [8]:

#Compares the extracted guesses with the gold standard, printing true positives,
#false positives, and false negatives.

def evaluate(guesses, gold_data, filenames):
    """Print true positives, false positives and false negatives.

    Values are compared case-insensitively (both sides are lower-cased) and
    duplicates are ignored because the comparison is done on sets.

    Args:
        guesses: Iterable of ``(filename, type, value)`` tuples that were
            extracted.
        gold_data: Iterable of ``(filename, type, value)`` gold tuples.
        filenames: Names of the files that were processed.

    Returns:
        None. The report is written to standard output.
    """
    guesses = [(filename, t, value.lower()) for (filename, t, value) in guesses]
    gold_data = [(filename, t, value.lower()) for (filename, t, value) in gold_data]

    guess_set = set(guesses)
    gold_set = set(gold_data)

    # Group the gold entries by file in a single pass instead of rescanning
    # the whole gold list once per filename.
    gold_dict = {fname: [] for fname in filenames}
    for gold in gold_data:
        gold_dict.setdefault(gold[0], []).append(gold)

    tp = guess_set.intersection(gold_set)
    fp = guess_set - gold_set
    fn = gold_set - guess_set

    pp = pprint.PrettyPrinter()

    print(f'True Positives ({len(tp)}): ')
    pp.pprint(tp)

    print(f'False Positives ({len(fp)}): ')
    # Sorted so the report is reproducible between runs (set order is not).
    for item in sorted(fp):
        print(item)
        # .get: a guess may refer to a file with no gold entries at all.
        for gold in gold_dict.get(item[0], []):
            print(f'   gold: {gold}')

    print(f'False Negatives ({len(fn)}): ')
    pp.pprint(fn)

    print(f'Summary: tp={len(tp)}, fp={len(fp)}, fn={len(fn)}')

In [15]:

#Main function to process data and evaluate against the gold standard.

def main(data_directory, gold_file):
    """Extract contacts from `data_directory` and score them against `gold_file`."""
    extracted, processed_files = process_directory(data_directory)
    evaluate(extracted, load_gold(gold_file), processed_files)

# Entry point: when this code runs as __main__, announce the run and evaluate
# the extractor on the dev set. Both paths are relative, so they resolve
# against the current working directory.
if __name__ == '__main__':
    print('Running Contact Finder in the current directory...')
    main('data/dev', 'data/devGOLD')

Running Contact Finder in the current directory...
True Positives (100): 
{('ashishg', 'e', 'ashishg@stanford.edu'),
 ('ashishg', 'e', 'rozm@stanford.edu'),
 ('ashishg', 'p', '650-723-1614'),
 ('ashishg', 'p', '650-723-4173'),
 ('ashishg', 'p', '650-814-1478'),
 ('balaji', 'e', 'balaji@stanford.edu'),
 ('bgirod', 'p', '650-723-4539'),
 ('bgirod', 'p', '650-724-3648'),
 ('bgirod', 'p', '650-724-6354'),
 ('cheriton', 'e', 'cheriton@cs.stanford.edu'),
 ('cheriton', 'e', 'uma@cs.stanford.edu'),
 ('cheriton', 'p', '650-723-1131'),
 ('cheriton', 'p', '650-725-3726'),
 ('dabo', 'e', 'dabo@cs.stanford.edu'),
 ('dabo', 'p', '650-725-3897'),
 ('dabo', 'p', '650-725-4671'),
 ('engler', 'e', 'engler@lcs.mit.edu'),
 ('eroberts', 'e', 'eroberts@cs.stanford.edu'),
 ('eroberts', 'p', '650-723-3642'),
 ('eroberts', 'p', '650-723-6092'),
 ('fedkiw', 'e', 'fedkiw@cs.stanford.edu'),
 ('hager', 'p', '410-516-5521'),
 ('hager', 'p', '410-516-5553'),
 ('hager', 'p', '410-516-8000'),
 ('hanrahan', 'e', 'hanra