# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [1]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Download the required NLTK resources (only needed once per environment)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

[nltk_data] Downloading package stopwords to /Users/loic/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/loic/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/loic/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/loic/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/loic/nltk_data...


Current working directory: /Users/loic/Documents/data_analytics/Week_11


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Defining documents

In [18]:
# Define the documents (one sentence each)
d1, d2, d3 = (
    'The train drives on tracks.',
    'The bus drives on roads.',
    'The gondola drives on ropes.',
)

# Concatenate the documents into one raw text, separated by single spaces
corpus_01 = ' '.join([d1, d2, d3])
corpus_01

'The train drives on tracks. The bus drives on roads. The gondola drives on ropes.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuation
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [19]:
# Text to lowercase function
def text_lowercase(text):
    """Return a copy of ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the train drives on tracks. the bus drives on roads. the gondola drives on ropes.'

### Removing punctuation

In [20]:
# Remove punctuation function
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    All other characters, including whitespace, are kept unchanged.
    """
    # Map each punctuation code point to None so that str.translate drops it
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the train drives on tracks the bus drives on roads the gondola drives on ropes'

### Tokenize text & removal of stopwords

In [21]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{"that'll", 'did', 'shouldn', 'ourselves', 'here', 'haven', 'on', 'but', 'don', 'down', 'few', 'that', 'having', 'into', 'all', 'isn', 'until', "you've", 'between', 'some', 'once', 'they', 'the', 'you', 'when', 'with', 'of', 'weren', 't', 'then', 'your', 'won', 'whom', 'aren', 'do', 'where', 'will', 'up', 'she', "didn't", 'needn', "shan't", 'does', 'he', "she's", 'shan', 'are', 'what', 'me', 'himself', 'those', 'o', 'can', 'her', "weren't", "doesn't", 'has', 'after', 'other', 'as', 'our', 'both', 'been', 'not', "wouldn't", 'too', 'theirs', 'm', 'such', "you'll", 'most', 're', "isn't", "shouldn't", 'have', 'about', 'had', 'who', 'll', 'at', 'which', 'ours', "should've", 'should', 'if', 'am', 'itself', 'i', 's', "mustn't", 'while', 'under', 'a', 'them', 'how', 'from', "hasn't", 'yours', 'it', 'why', 'same', 'an', "you'd", 'ain', 'to', 'before', 'any', 'these', 'and', 'didn', 'further', 'ma', 'each', 'yourself', 'or', 'yourselves', 'being', 'more', 'myself', 'by

In [22]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text, language="english"):
    """Tokenize ``text`` and drop all stop words.

    Parameters
    ----------
    text : str
        The text to tokenize and filter.
    language : str, default "english"
        Name of the NLTK stop-word list to use.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing.
    """
    stop_words = set(stopwords.words(language))
    word_tokens = word_tokenize(text)
    # The English stop-word list is all lowercase, so compare the lowercased
    # token; otherwise capitalised stop words such as 'The' would slip through
    # whenever the input has not been lowercased beforehand.
    return [word for word in word_tokens if word.lower() not in stop_words]
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['train', 'drives', 'tracks', 'bus', 'drives', 'roads', 'gondola', 'drives', 'ropes']

### Lemmatization

In [23]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text, pos='v'):
    """Tokenize ``text`` and lemmatize every token with the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        The text (a single word or a longer string) to lemmatize.
    pos : str, default 'v'
        WordNet part-of-speech tag applied to every token (e.g. 'n' for
        nouns, 'v' for verbs, 'a' for adjectives, 'r' for adverbs).
        The default treats each token as a verb, so a token without a verb
        reading (for example 'roads') is returned unchanged; pass ``pos='n'``
        to lemmatize nouns.

    Returns
    -------
    list of str
        One lemma per token, in token order.
    """
    # Relies on the module-level `lemmatizer` instance created above
    return [lemmatizer.lemmatize(word, pos=pos) for word in word_tokenize(text)]

# Lemmatize every token; lemmatize_word returns a list for each input token
lem = [lemmatize_word(token) for token in corpus_04]

# Collapse each per-token list back into a single string (nested list -> flat list)
corpus_05 = [' '.join(map(str, parts)) for parts in lem]

# Show the tokens before and after lemmatization
print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['train', 'drives', 'tracks', 'bus', 'drives', 'roads', 'gondola', 'drives', 'ropes'] 

After lemmatization:
['train', 'drive', 'track', 'bus', 'drive', 'roads', 'gondola', 'drive', 'rope']

## Redefine the text corpus (pre-processed)

In [27]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['train drive track', 
          'bus drive roads', 
          'gondola drive rope']

## Document-term matrix with ngram_range=(1,1)

In [28]:
# Count vectorizer on single words (unigrams)
vectorizer = CountVectorizer(ngram_range=(1,1), min_df=0.0)

# Learn the vocabulary and count its terms in every document
count = vectorizer.fit_transform(corpus)

# Table with one row per document and one column per learned term
feature_names = vectorizer.get_feature_names_out()
df_count = pd.DataFrame(data=count.toarray(), columns=feature_names)

print('Document-term matrix')
print(df_count)

Document-term matrix
   bus  drive  gondola  roads  rope  track  train
0    0      1        0      0     0      1      1
1    1      1        0      1     0      0      0
2    0      1        1      0     1      0      0


## Document-term matrix with ngram_range=(2,2)

In [29]:
# Count vectorizer on pairs of adjacent words (bigrams)
vectorizer = CountVectorizer(ngram_range=(2,2), min_df=0.0)

# Learn the bigram vocabulary and count its terms in every document
count = vectorizer.fit_transform(corpus)

# Table with one row per document and one column per learned bigram
feature_names = vectorizer.get_feature_names_out()
df_count = pd.DataFrame(data=count.toarray(), columns=feature_names)

print('Document-term matrix')
print(df_count)

Document-term matrix
   bus drive  drive roads  drive rope  drive track  gondola drive  train drive
0          0            0           0            1              0            1
1          1            1           0            0              0            0
2          0            0           1            0              1            0


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [30]:
# Compute Term Frequency (TF)

# Vocabulary: the set of unique words over all documents.
# split() without an argument splits on any run of whitespace, so repeated
# spaces cannot produce an empty-string "word".
words_set = set()
for doc in corpus:
    words_set.update(doc.split())

print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus
n_words_set = len(words_set)

# One row per document, one column per vocabulary word, initialised to zero
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)),
                     columns=list(words_set))


print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split()
    for w in words:
        # Each occurrence contributes 1 / (number of words in the document).
        # Assign through .loc in a single step: chained indexing
        # (df_tf[w][i] = ...) may write to a temporary copy and has no effect
        # when pandas Copy-on-Write is active.
        df_tf.loc[i, w] += 1 / len(words)

print(df_tf.round(4))

Number of words in the corpus: 7 

The words in the corpus: 
 {'bus', 'track', 'roads', 'rope', 'drive', 'train', 'gondola'}

Term Frequency (TF):
      bus   track   roads    rope   drive   train  gondola
0  0.0000  0.3333  0.0000  0.0000  0.3333  0.3333   0.0000
1  0.3333  0.0000  0.3333  0.0000  0.3333  0.0000   0.0000
2  0.0000  0.0000  0.0000  0.3333  0.3333  0.0000   0.3333


### Inverse Document Frequency (IDF)

In [31]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

# Split every document into its words once, up front
docs_as_words = [doc.split() for doc in corpus]

idf = {}

for w in words_set:

    # k = number of documents that contain this word
    k = sum(1 for doc_words in docs_as_words if w in doc_words)

    # IDF = log10(number of documents / k), rounded to four decimals
    idf[w] = np.log10(n_docs / k).round(4)

    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
            bus:     0.4771
          track:     0.4771
          roads:     0.4771
           rope:     0.4771
          drive:        0.0
          train:     0.4771
        gondola:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [32]:
# Computing TF-IDF
# Multiply every TF column by the IDF weight of its word. This is done as one
# vectorised operation rather than df_tf_idf[w][i] = ..., because that chained
# assignment may write to a temporary copy and has no effect when pandas
# Copy-on-Write is active.
# .loc[df_tf.columns] puts the weights in the column order of df_tf and raises
# a KeyError if a word has no IDF value.
idf_weights = pd.Series(idf).loc[df_tf.columns]
df_tf_idf = df_tf.mul(idf_weights, axis='columns')

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
     bus  track  roads   rope  drive  train  gondola
0  0.000  0.159  0.000  0.000    0.0  0.159    0.000
1  0.159  0.000  0.159  0.000    0.0  0.000    0.000
2  0.000  0.000  0.000  0.159    0.0  0.000    0.159


## Part-of-Speech (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [34]:
text = '''Airlines Race Toward a Future of Powering
Their Jets With Corn. Carriers want to replace jet
fuel with ethanol to fight global warming. That 
would require lots of corn, and lots of water.'''

def preprocess(sent):
    """Tokenize ``sent`` and tag every token with its part of speech.

    Returns the tagger's output for the token list of ``sent``.
    """
    tokens = word_tokenize(sent)
    return pos_tag(tokens)

# Chunk grammar: a noun phrase (NP) is an optional determiner (DT), any number
# of adjectives (JJ), and then one singular noun (NN)
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)

# Tokenize and POS-tag the text, then group the tagged tokens into NP chunks
sent = preprocess(text)
cs = cp.parse(sent)

# Convert the chunk tree into (word, POS-tag, IOB-chunk-tag) triples
iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Airlines', 'NNS', 'O'),
 ('Race', 'NNP', 'O'),
 ('Toward', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('Future', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('Powering', 'VBG', 'O'),
 ('Their', 'PRP$', 'O'),
 ('Jets', 'NNS', 'O'),
 ('With', 'IN', 'O'),
 ('Corn', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('Carriers', 'NNP', 'O'),
 ('want', 'VBP', 'O'),
 ('to', 'TO', 'O'),
 ('replace', 'VB', 'O'),
 ('jet', 'NN', 'B-NP'),
 ('fuel', 'NN', 'B-NP'),
 ('with', 'IN', 'O'),
 ('ethanol', 'NN', 'B-NP'),
 ('to', 'TO', 'O'),
 ('fight', 'VB', 'O'),
 ('global', 'JJ', 'B-NP'),
 ('warming', 'NN', 'I-NP'),
 ('.', '.', 'O'),
 ('That', 'WDT', 'O'),
 ('would', 'MD', 'O'),
 ('require', 'VB', 'O'),
 ('lots', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('corn', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('lots', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('water', 'NN', 'B-NP'),
 ('.', '.', 'O')]


# Task 1 f)
<br>
NN: Noun in the singular form
<br>
IN: Preposition or subordinating conjunction
<br>
JJ: Adjective
<br>
DT: Determiner
<br>
VB: Verb (base form)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [35]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS family, platform and release, current timestamp, Python version
footer_rule = '-----------------------------------'
print(footer_rule)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print(footer_rule)

-----------------------------------
POSIX
Darwin | 23.1.0
Datetime: 2023-11-30 12:24:43
Python Version: 3.9.13
-----------------------------------
