In [5]:
import time
import spacy
from textblob import TextBlob
import nltk
import string
import re
import pandas as pd

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


from nltk.tokenize import word_tokenize



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paramatephuengtrakul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/paramatephuengtrakul/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paramatephuengtrakul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
def clean_text(text, remove_punctuation: bool):

    text = re.sub(r'[^\x00-\x7F]+', ' ', text) # Remove non-ASCII characters
    
    text = re.sub(r'\s+', ' ', text) # replace multiple white space with sigle white space
    
    text = re.sub(r'[^\w\s.,!?]', '', text) # remove specific char, `¬`, or quotes around words

    if remove_punctuation:
        text = re.sub(r'[^\w\s]', '', text) # Remove all punctuation
    
    text = text.strip()
    
    text = text.lower() # convert to lower 
    
    return text

def tokenized_sentences_words(text):
    
    text = clean_text(text, remove_punctuation=False)

    sentences = re.split(r'[.!?]', text)

    # remove white space if the particular sentence has white space
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    
    
    tokenized_sentences = [sentence.split() for sentence in sentences] # split by white space
    
    return tokenized_sentences

def count_word_frequency(text):

    # tokenize
    words = text.split(" ")
    
    word_freq = {}
    

    for word in words:
        if word in word_freq:
            word_freq[word] += 1 
        else:
            word_freq[word] = 1 

    return word_freq

def get_top_k_words(text, k: int):

    word_freq = count_word_frequency(text)
    
    df_top_k = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])

    # sort by descending order
    df_top_k_sorted = df_top_k.sort_values(by='Frequency', ascending=False).head(k)
    
    return df_top_k_sorted


# tokenization across nltk, textblob and spacy
def tokenize_nltk(text):
    token_list= word_tokenize(text)
    return token_list

def tokenize_textblob(text):
    blob = TextBlob(text)
    token_list = blob.words
    return token_list

def tokenize_spacy(text):
    doc = nlp(text)
    token_list = [token.text for token in doc]
    return token_list


In [8]:
file_path = "aliced29.txt"
with open(file_path, 'r') as file:
    texts = file.read()

# print(texts)

## A cleaned version of text

In [9]:
cleaned_text = clean_text(texts, remove_punctuation=True)
cleaned_text

'or longitude ive got to alice had no idea what latitude was or longitude either but thought they were nice grand words to say presently she began again i wonder if i shall fall right through the earth how funny itll seem to come out among the people that walk with their heads downward the antipathies i think she was rather glad there was no one listening this time as it didnt sound at all the right word but i shall have to ask them what the name of the country is you know please maam is this new zealand or australia and she tried to curtsey as she spokefancy curtseying as youre falling through the air do you think you could manage it and what an ignorant little girl shell think me for asking no itll never do to ask perhaps i shall see it written up somewhere down down down there was nothing else to do so alice soon began talking again dinahll miss me very much tonight i should think dinah was the cat i hope theyll remember her saucer of milk at teatime dinah my dear i wish you were'

## List of tokenized sentences and words

In [10]:
sentences_token = tokenized_sentences_words(texts)
sentences_token[:2] # show first 2 elements result

[['or', 'longitude', 'ive', 'got', 'to'],
 ['alice',
  'had',
  'no',
  'idea',
  'what',
  'latitude',
  'was,',
  'or',
  'longitude',
  'either,',
  'but',
  'thought',
  'they',
  'were',
  'nice',
  'grand',
  'words',
  'to',
  'say']]

## A printed table of the top 10 most frequent words and their counts

In [11]:
get_top_k_words(cleaned_text, 10)

Unnamed: 0,Word,Frequency
25,i,8
32,the,8
4,to,7
11,was,5
71,you,4
22,she,4
49,think,4
103,down,3
57,as,3
28,shall,3


## Compare performance

In [14]:
result_dict = {}

start_time = time.time()
token_spc = tokenize_spacy(cleaned_text)
end_time = time.time()
result_dict["Spacy"] = {
    'execution_time_sec': end_time - start_time,
    'tokens':token_spc
}

start_time = time.time()
token_txtblob = tokenize_textblob(cleaned_text)
end_time = time.time()
result_dict["TextBlob"] = {
    'execution_time_sec': end_time - start_time,
    'tokens':token_txtblob
}

start_time = time.time()
token_nltk = tokenize_nltk(cleaned_text)
end_time = time.time()
result_dict["nltk"] = {
    'execution_time_sec': end_time - start_time,
    'tokens':token_nltk
}

all_results = {
    'method': [],
    'execution_time_sec': [],
    'tokens': []
}

for method, result in result_dict.items():
    all_results['method'].append(method)
    all_results['execution_time_sec'].append(result['execution_time_sec'])
    all_results['tokens'].append(result['tokens'])

In [16]:
results = pd.DataFrame(all_results)
results

Unnamed: 0,method,execution_time_sec,tokens
0,Spacy,0.0183,"[or, longitude, i, ve, got, to, alice, had, no..."
1,TextBlob,0.000854,"[or, longitude, ive, got, to, alice, had, no, ..."
2,nltk,0.000369,"[or, longitude, ive, got, to, alice, had, no, ..."
