# **My Tokenizer**

In [69]:
def my_tokenizer(corpus_raw):
    '''
    Split a raw corpus string into a list of lowercase word tokens.

    type corpus_raw: string
    param corpus_raw: The raw output of the corpus to be tokenized
    rtype: list
    return: a list of tokens extracted from the corpus_raw, in order of
            appearance; lone non-alphanumeric characters are not included
    '''

    # A candidate is either a run of word characters, optionally extended by
    # hyphen- or apostrophe-joined parts (e.g. "well-known", "don't"), or a
    # single punctuation mark out of . , ! ? ; :
    pattern = r"\w+(?:[-']\w+)*|[.,!?;:]"

    # Case-fold the text, scan it for every candidate and trim each match.
    candidates = (
        match.strip() for match in re.findall(pattern, corpus_raw.lower())
    )

    # Keep a candidate only if it is non-empty and is not a lone
    # non-alphanumeric character (a punctuation mark or a bare underscore).
    return [
        candidate
        for candidate in candidates
        if candidate and (len(candidate) > 1 or candidate.isalnum())
    ]

In [70]:
import nltk
import re

corpus_name = 'webtext'

# Download the corpus data through NLTK, then import its reader.
# NOTE: corpus_name only drives the download; the import below names webtext
# explicitly, so both must be changed together to switch corpora.
nltk.download(corpus_name)
from nltk.corpus import webtext

# Read the raw text output of the corpus into the corpus_raw variable.
corpus_raw = webtext.raw()

# Tokenize the raw text with the custom tokenizer (my_tokenizer).
my_tokenized_list = my_tokenizer(corpus_raw)



[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


## Evaluate the tokenizer against the NLTK word tokenizer

In [71]:
def similarity_score(set_a, set_b):
    '''
    Compute the Jaccard similarity of two sets: the size of their
    intersection divided by the size of their union.

    type set_a: set
    param set_a: The first set to be compared
    type set_b: set
    param set_b: The second set to be compared
    rtype: float
    return: similarity score with two sets using Jaccard similarity,
            between 0.0 (disjoint) and 1.0 (identical); 1.0 when both
            sets are empty.
    '''

    union_size = len(set_a.union(set_b))
    if union_size == 0:
        # Both sets are empty, so the ratio is 0/0. Two empty sets are
        # identical, so report full similarity instead of raising
        # ZeroDivisionError.
        return 1.0

    # True division already yields a float.
    return len(set_a.intersection(set_b)) / union_size

In [72]:
from nltk import word_tokenize
nltk.download('punkt')
from nltk import punkt

def evaluation(corpus_raw, token_list):
    '''
    Score a token list against the NLTK word tokenizer run on the same text.

    type corpus_raw: string
    param corpus_raw: The raw output of the corpus
    type token_list: list
    param token_list: The tokens extracted from the corpus_raw
    rtype: float
    return: comparison score with the given token list and the nltk tokenizer.
    '''

    # token_list is assumed to be case-folded already, so the reference
    # tokens are produced from the lower-cased text to match.
    reference_tokens = word_tokenize(corpus_raw.lower(), language='english')

    # Only the distinct tokens are compared; their frequencies are ignored.
    candidate_vocab = set(token_list)
    reference_vocab = set(reference_tokens)

    return similarity_score(candidate_vocab, reference_vocab)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
# Evaluation: score my tokenizer's output against the NLTK word tokenizer
# on the same corpus (distinct tokens only; frequencies are ignored).

eval_score = evaluation(corpus_raw, my_tokenized_list)

# Report the score formatted to two decimal places.
print('The similarity score is {:.2f}'.format(eval_score))

The similarity score is 0.86
