<img align="left" src="https://upload.wikimedia.org/wikipedia/commons/e/e1/CC_BY_icon.svg"><br />

Created by Lorenzo Babini and released under the [Creative Commons CC BY License](https://creativecommons.org/licenses/by/4.0/)<br />
For questions/comments/improvements, email lorenzo.babini@unicatt.it.<br />

# HOW TO TOKENIZE A TEXT AND COUNT BIGRAMS AND TRIGRAMS

In [None]:
# Assign the path of the text file to a variable. The path must be written as 'folder_name/file_name'.
text_name = 'data/apocalisse.txt'

In [None]:
# Open the file in read mode and load its whole content into the string `text`.
# The encoding is passed explicitly: without it, open() uses the platform default,
# which can garble accented characters or raise UnicodeDecodeError.
# NOTE: this assumes the file is saved as UTF-8 -- change the encoding argument if it is not.
## If the text is very long, it may be better not to print it, because the notebook might crash.
### If you are using Constellate, the right environment is all inside the folder 'constellate-notebooks'.
with open(text_name, 'r', encoding='utf-8') as f:
    text = f.read()

# The file is already closed at this point; printing only needs the string.
print(text)

In [None]:
# Break the text string into a list of strings.
# Several delimiters are needed here (blank spaces, apostrophes and newlines), so a
# regular expression is used: each alternative separated by | is one delimiter, and
# the newline character \n has to be listed explicitly among them.
## With a single kind of delimiter (e.g. whitespace) the simpler .split method is enough:
##tokenized_list = text.split()
##list(tokenized_list)

import re
delimiters = re.compile(" |'|\n")
tokenized_list = delimiters.split(text)
list(tokenized_list)

In [None]:
# Remove the empty items from the list.
# re.split produces an empty string wherever two delimiters are adjacent
# (for example a space followed by a newline) or where the text starts or ends with a delimiter.
# One filtering pass drops all of them at once; calling .remove('') in a loop
# rescans the list from the beginning for every empty item, which is slow on long texts.
tokenized_list = [token for token in tokenized_list if token != '']

list(tokenized_list)

In [None]:
# Cleaning up the tokens: lowercase each one and strip punctuation from it.
## In some cases capital letters are a distinctive mark (e.g. brown is a color and Brown is a surname);
## if that matters, rename one of the two cases before lowercasing (e.g. brown and name-brown).

# Translation table that deletes every listed punctuation character in a single pass
# (same characters the chained .replace() calls removed: ( ) . ; , : ! ? ").
punctuation_to_strip = str.maketrans('', '', '().;,:!?"')

unigrams_list = []

for token in tokenized_list:
    token = token.lower().translate(punctuation_to_strip)
    # A token made only of punctuation is empty after cleaning; keeping it would
    # create n-grams containing an empty word, so it is skipped.
    if token:
        unigrams_list.append(token)

print(unigrams_list)

In [None]:
# Importing the Natural Language Toolkit (NLTK) library and its automatic word tokenizer
import nltk
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Creating our bigrams and trigrams from the cleaned unigrams.
# nltk.bigrams / nltk.trigrams are wrapped in list() so the results can be printed and reused.
bigrams = list(nltk.bigrams(unigrams_list))
trigrams = list(nltk.trigrams(unigrams_list))

print('Bigrams: \n ', bigrams, '\n')

# Header formatted like the bigram one (the stray comma after the newline is removed).
print('Trigrams: \n ', trigrams)

In [None]:
# Function definitions for Converting NLTK tuples into strings

from collections import Counter

def convert_tuple_bigrams(tuples_to_convert):
    """Return every bigram tuple as one space-separated string.

    Only the first two items of each tuple are read. The result is a new
    list holding one 'first second' string per input tuple, in input order.
    """
    return [f'{pair[0]} {pair[1]}' for pair in tuples_to_convert]

def convert_tuple_trigrams(tuples_to_convert):
    """Return every trigram tuple as one space-separated string.

    Only the first three items of each tuple are read. The result is a new
    list holding one 'first second third' string per input tuple, in input order.
    """
    return [
        f'{triple[0]} {triple[1]} {triple[2]}'
        for triple in tuples_to_convert
    ]

def convert_strings_to_counts(string_grams):
    """Count how many times each n-gram string occurs.

    The strings are tallied with a Counter, which is then returned as a
    plain dictionary mapping each distinct string to its number of occurrences.
    """
    return dict(Counter(string_grams))

In [None]:
# Turn the n-gram tuples into strings, then count how often each string occurs
string_bigrams = convert_tuple_bigrams(bigrams)
string_trigrams = convert_tuple_trigrams(trigrams)

bigramCount = convert_strings_to_counts(string_bigrams)
trigramCount = convert_strings_to_counts(string_trigrams)

# Show both dictionaries, bigrams first
print('Bigrams as a dictionary of counts')
print(bigramCount, '\n')

print('Trigrams as a dictionary of counts')
print(trigramCount)

In [None]:
# Rank the bigrams by their count, highest first, to look at the most common ones.
sort_bigramCount = sorted(
    bigramCount.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Each bigram is padded to a width of 20 characters before its count is printed
for gram, count in sort_bigramCount:
    print(gram.ljust(20), count)

In [None]:
# Rank the trigrams by their count, highest first, to look at the most common ones.
sort_trigramCount = sorted(
    trigramCount.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Each trigram is padded to a width of 27 characters before its count is printed
for gram, count in sort_trigramCount:
    print(gram.ljust(27), count)