# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [298]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /Users/sauternicolas/git/data_analytics/Week_11


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sauternicolas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sauternicolas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sauternicolas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sauternicolas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sauternicolas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [299]:
# Defining documents (=sentenses)
d1 = 'The cat knocked over everything.'
d2 = 'Snow tomorrow, boots are ready.'
d3 = 'Candlelit dinner due to blackout.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'The cat knocked over everything. Snow tomorrow, boots are ready. Candlelit dinner due to blackout.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [300]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the cat knocked over everything. snow tomorrow, boots are ready. candlelit dinner due to blackout.'

### Removing punctuation

In [301]:
# Remove punctuation function (Satzzeichen entfernen)
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the cat knocked over everything snow tomorrow boots are ready candlelit dinner due to blackout'

### Tokenize text & removal of stopwords

In [302]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'ours', 'through', 'under', 'these', 'out', 've', "you're", 'hers', 'up', 'to', 'there', 'nor', 'should', 'be', "you've", 's', 'its', 'just', "mightn't", 'only', "wasn't", 'above', 'we', 'about', 'own', 'didn', 'his', "shouldn't", 'how', 'did', 'can', 'as', "it's", 'd', 'into', 'myself', 'it', "mustn't", "wouldn't", 'an', 'of', "haven't", 'themselves', 'won', 'too', 'will', 'had', 'i', 'theirs', 'with', 'been', 'o', 'not', 'have', 'aren', 'yours', 'were', "hadn't", 'few', 'shouldn', 'me', 'am', 'their', 'again', 'over', "won't", 'because', 'until', 'than', 'himself', 'do', 'where', 'does', "should've", 'then', 'herself', 't', 'why', 'between', "she's", 'on', 'off', "don't", 'once', 'him', 'has', 'mustn', 'this', 'here', 'couldn', 'mightn', 'he', 'weren', 'which', 'whom', "weren't", 'shan', 'those', "you'd", 'but', 'very', 'you', 'the', "isn't", 'needn', 'ain', 'against', 'what', "shan't", 'y', 'are', 'is', 'they', 'when', "doesn't", 'and', 'your', 'wouldn', 

In [303]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['cat', 'knocked', 'everything', 'snow', 'tomorrow', 'boots', 'ready', 'candlelit', 'dinner', 'due', 'blackout']

### Lemmatization

In [304]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['cat', 'knocked', 'everything', 'snow', 'tomorrow', 'boots', 'ready', 'candlelit', 'dinner', 'due', 'blackout'] 

After lemmatization:
['cat', 'knock', 'everything', 'snow', 'tomorrow', 'boot', 'ready', 'candlelit', 'dinner', 'due', 'blackout']

## Redefine the text corpus (pre-processed)

In [305]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['car drive road', 
          'truck drive highway', 
          'bicycle drive bicycle path']

## Document-term matrix with ngram_range=(1,1)

In [306]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   bicycle  car  drive  highway  path  road  truck
0        0    1      1        0     0     1      0
1        0    0      1        1     0     0      1
2        2    0      1        0     1     0      0


## Document-term matrix with ngram_range=(2,2)

In [307]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   bicycle drive  bicycle path  car drive  drive bicycle  drive highway  \
0              0             0          1              0              0   
1              0             0          0              0              1   
2              1             1          0              1              0   

   drive road  truck drive  
0           1            0  
1           0            1  
2           0            0  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [308]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 7 

The words in the corpus: 
 {'drive', 'road', 'car', 'bicycle', 'truck', 'highway', 'path'}

Term Frequency (TF):
    drive    road     car  bicycle   truck  highway  path
0  0.3333  0.3333  0.3333      0.0  0.0000   0.0000  0.00
1  0.3333  0.0000  0.0000      0.0  0.3333   0.3333  0.00
2  0.2500  0.0000  0.0000      0.5  0.0000   0.0000  0.25


### Inverse Document Frequency (IDF)

In [309]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
          drive:        0.0
           road:     0.4771
            car:     0.4771
        bicycle:     0.4771
          truck:     0.4771
        highway:     0.4771
           path:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [310]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   drive   road    car  bicycle  truck  highway    path
0    0.0  0.159  0.159   0.0000  0.000    0.000  0.0000
1    0.0  0.000  0.000   0.0000  0.159    0.159  0.0000
2    0.0  0.000  0.000   0.2386  0.000    0.000  0.1193


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [311]:
text = '''European authorities fined Google a record $5.1 
          billion on Wednesday for abusing its power in the 
          mobile phone market.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('.', '.', 'O')]


## Task 1 c,d,e) 
### Redefine the text corpus (pre-processed)

In [312]:
corpus2 = ['cat knocked everything', 
          'snow tomorrow boots ready', 
          'candleit dinner due blackout']

### Document-term matrix with ngram_range=(1,1)

In [313]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus2)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   blackout  boots  candleit  cat  dinner  due  everything  knocked  ready  \
0         0      0         0    1       0    0           1        1      0   
1         0      1         0    0       0    0           0        0      1   
2         1      0         1    0       1    1           0        0      0   

   snow  tomorrow  
0     0         0  
1     1         1  
2     0         0  


### Document-term matrix with ngram_range=(2,2)

In [314]:
# Vectorizer with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus2)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   boots ready  candleit dinner  cat knocked  dinner due  due blackout  \
0            0                0            1           0             0   
1            1                0            0           0             0   
2            0                1            0           1             1   

   knocked everything  snow tomorrow  tomorrow boots  
0                   1              0               0  
1                   0              1               1  
2                   0              0               0  


### Term Frequency (TF) matrix

In [315]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus2:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus2[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 11 

The words in the corpus: 
 {'tomorrow', 'everything', 'ready', 'due', 'dinner', 'knocked', 'blackout', 'cat', 'candleit', 'boots', 'snow'}

Term Frequency (TF):
   tomorrow  everything  ready   due  dinner  knocked  blackout     cat  \
0      0.00      0.3333   0.00  0.00    0.00   0.3333      0.00  0.3333   
1      0.25      0.0000   0.25  0.00    0.00   0.0000      0.00  0.0000   
2      0.00      0.0000   0.00  0.25    0.25   0.0000      0.25  0.0000   

   candleit  boots  snow  
0      0.00   0.00  0.00  
1      0.00   0.25  0.25  
2      0.25   0.00  0.00  


### Inverse Document Frequency (IDF)

In [316]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus2[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
       tomorrow:     0.4771
     everything:     0.4771
          ready:     0.4771
            due:     0.4771
         dinner:     0.4771
        knocked:     0.4771
       blackout:     0.4771
            cat:     0.4771
       candleit:     0.4771
          boots:     0.4771
           snow:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [317]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   tomorrow  everything   ready     due  dinner  knocked  blackout    cat  \
0    0.0000       0.159  0.0000  0.0000  0.0000    0.159    0.0000  0.159   
1    0.1193       0.000  0.1193  0.0000  0.0000    0.000    0.0000  0.000   
2    0.0000       0.000  0.0000  0.1193  0.1193    0.000    0.1193  0.000   

   candleit   boots    snow  
0    0.0000  0.0000  0.0000  
1    0.0000  0.1193  0.1193  
2    0.1193  0.0000  0.0000  


## Task 1 f)

In [318]:
text = '''cat knocked everything snow tomorrow boots ready candleit dinner due blackout'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('cat', 'NN', 'B-NP'),
 ('knocked', 'VBD', 'O'),
 ('everything', 'NN', 'B-NP'),
 ('snow', 'NN', 'B-NP'),
 ('tomorrow', 'NN', 'B-NP'),
 ('boots', 'NNS', 'O'),
 ('ready', 'JJ', 'B-NP'),
 ('candleit', 'NN', 'I-NP'),
 ('dinner', 'NN', 'B-NP'),
 ('due', 'JJ', 'B-NP'),
 ('blackout', 'NN', 'I-NP')]


- cat = Noun, Singular (NN)
- knocked = Verb, Past Tense (VBD)
- boots = Noun Plural (NNS)
- ready = Adjective (JJ)
- dinner = Noun, Singular (NN)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [319]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Darwin | 24.0.0
Datetime: 2024-11-26 15:09:27
Python Version: 3.10.14
-----------------------------------
