# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [1]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...


Current working directory: /workspaces/data_analytics/Week_11


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Defining documents

In [16]:
# Defining documents (=sentenses)
d1 = 'The car is driven on the road.'
d2 = 'The truck is driven on the highway.'
d3 = 'The bicycle is driven on the bicycle path.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'The car is driven on the road. The truck is driven on the highway. The bicycle is driven on the bicycle path.'

## Aufgabe b)

In [17]:
# Defining documents (=sentenses)
d1 = 'Das schöne, junge Schneewittchen wächst als Dienstmagd am Hof ihres Vaters und ihrer neidischen Stiefmutter auf..'
d2 = 'Schneewittchen irrt voller Angst durch die Nacht und schläft schließlich ein..'
d3 = 'Sie erwacht am nächsten Morgen im Kreise der Tiere des Waldes'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'Das schöne, junge Schneewittchen wächst als Dienstmagd am Hof ihres Vaters und ihrer neidischen Stiefmutter auf.. Schneewittchen irrt voller Angst durch die Nacht und schläft schließlich ein.. Sie erwacht am nächsten Morgen im Kreise der Tiere des Waldes'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [3]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the car is driven on the road. the truck is driven on the highway. the bicycle is driven on the bicycle path.'

### Removing punctuation

In [4]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the car is driven on the road the truck is driven on the highway the bicycle is driven on the bicycle path'

### Tokenize text & removal of stopwords

In [5]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{"hasn't", 'we', "haven't", 'ain', 'both', 'it', 'be', 'that', 'being', 'for', "weren't", 'in', 'own', 'him', 'its', "wasn't", 'on', 'you', 'above', 'under', 'weren', 'by', 'when', 'very', 'shan', 've', 'only', 'a', 'any', 'my', 'itself', 'now', 'they', 're', 'once', 'or', 'why', "mustn't", 'their', 'no', 'too', 'themselves', "she's", 'off', 'same', "you'll", 't', 'if', 'd', 'needn', 'hasn', 'herself', 'then', 'have', 'how', 'what', 's', 'isn', "needn't", 'during', 'aren', "it's", 'her', 'not', 'up', 'further', 'does', "shan't", 'was', "you'd", 'theirs', "shouldn't", 'because', 'wouldn', 'o', 'over', 'i', 'yourselves', 'out', 'me', 'to', 'all', "couldn't", 'few', 'before', 'are', 'whom', 'of', 'mightn', 'do', 'as', "didn't", 'doesn', 'below', "you're", "won't", "wouldn't", 'shouldn', 'did', 'myself', 'can', "don't", "that'll", 'who', 'at', 'ma', "doesn't", 'y', 'each', 'just', 'won', 'yourself', 'from', 'than', 'your', "mightn't", 'with', 'himself', 'such', '

In [6]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['car', 'driven', 'road', 'truck', 'driven', 'highway', 'bicycle', 'driven', 'bicycle', 'path']

### Lemmatization

In [7]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['car', 'driven', 'road', 'truck', 'driven', 'highway', 'bicycle', 'driven', 'bicycle', 'path'] 

After lemmatization:
['car', 'drive', 'road', 'truck', 'drive', 'highway', 'bicycle', 'drive', 'bicycle', 'path']

## Redefine the text corpus (pre-processed)

In [26]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['das schöne junge schneewittchen wächst als dienstmagd am hof ihres vaters und ihrer neidischen stiefmutter auf',
                  'schneewittchen irrt voller angst durch die nacht und schläft schließlich ein',
                  'sie erwacht am nächsten morgen im kreise der tiere des waldes']

## Document-term matrix with ngram_range=(1,1)

In [24]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   als  am  angst  auf  das  der  des  die  dienstmagd  durch  ...  \
0    1   1      0    1    1    0    0    0           1      0  ...   
1    0   0      1    0    0    0    0    1           0      1  ...   
2    0   1      0    0    0    1    1    0           0      0  ...   

   schneewittchen  schöne  sie  stiefmutter  tiere  und  vaters  voller  \
0               1       1    0            1      0    1       1       0   
1               1       0    0            0      0    1       0       1   
2               0       0    1            0      1    0       0       0   

   waldes  wächst  
0       0       1  
1       0       0  
2       1       0  

[3 rows x 35 columns]


## Document-term matrix with ngram_range=(2,2)

In [27]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   als dienstmagd  am hof  am nächsten  angst durch  das schöne  der tiere  \
0               1       1            0            0           1          0   
1               0       0            0            1           0          0   
2               0       0            1            0           0          1   

   des waldes  die nacht  dienstmagd am  durch die  ...  \
0           0          0              1          0  ...   
1           0          1              0          1  ...   
2           1          0              0          0  ...   

   schneewittchen wächst  schöne junge  sie erwacht  stiefmutter auf  \
0                      1             1            0                1   
1                      0             0            0                0   
2                      0             0            1                0   

   tiere des  und ihrer  und schläft  vaters und  voller angst  wächst als  
0          0          1            0           1             0 

## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [28]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 35 

The words in the corpus: 
 {'morgen', 'als', 'ihrer', 'ihres', 'schließlich', 'erwacht', 'kreise', 'und', 'stiefmutter', 'schneewittchen', 'am', 'schläft', 'im', 'tiere', 'neidischen', 'die', 'angst', 'junge', 'nacht', 'dienstmagd', 'ein', 'vaters', 'wächst', 'sie', 'schöne', 'der', 'irrt', 'das', 'auf', 'waldes', 'durch', 'des', 'hof', 'voller', 'nächsten'}

Term Frequency (TF):
   morgen     als   ihrer   ihres  schließlich  erwacht  kreise     und  \
0  0.0000  0.0625  0.0625  0.0625       0.0000   0.0000  0.0000  0.0625   
1  0.0000  0.0000  0.0000  0.0000       0.0909   0.0000  0.0000  0.0909   
2  0.0909  0.0000  0.0000  0.0000       0.0000   0.0909  0.0909  0.0000   

   stiefmutter  schneewittchen  ...     der    irrt     das     auf  waldes  \
0       0.0625          0.0625  ...  0.0000  0.0000  0.0625  0.0625  0.0000   
1       0.0000          0.0909  ...  0.0000  0.0909  0.0000  0.0000  0.0000   
2       0.0000          0.0000  ...  0.0909

### Inverse Document Frequency (IDF)

In [12]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
          truck:     0.4771
           road:     0.4771
        bicycle:     0.4771
          drive:        0.0
        highway:     0.4771
            car:     0.4771
           path:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [13]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   truck   road  bicycle  drive  highway    car    path
0  0.000  0.159   0.0000    0.0    0.000  0.159  0.0000
1  0.159  0.000   0.0000    0.0    0.159  0.000  0.0000
2  0.000  0.000   0.2386    0.0    0.000  0.000  0.1193


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [31]:
text = '''Back at the castle, the Magic Mirror reveals that Snow White is still living, and with the dwarfs '''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Back', 'RB', 'O'),
 ('at', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('castle', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('the', 'DT', 'O'),
 ('Magic', 'NNP', 'O'),
 ('Mirror', 'NNP', 'O'),
 ('reveals', 'VBZ', 'O'),
 ('that', 'IN', 'O'),
 ('Snow', 'NNP', 'O'),
 ('White', 'NNP', 'O'),
 ('is', 'VBZ', 'O'),
 ('still', 'RB', 'O'),
 ('living', 'JJ', 'O'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('with', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('dwarfs', 'NN', 'I-NP')]


## Explain

> The POS-tags in the output indicate the grammatical roles of words in the sentence, such as RB (Adverb) for 'Back' and 'still', IN (Preposition or Subordinating Conjunction) for 'at', 'that', and 'with', DT (Determiner) for 'the', NN (Noun, Singular or Mass) for 'castle' and 'dwarfs', and NNP (Proper Noun, Singular) for 'Magic', 'Mirror', 'Snow', and 'White'.

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [15]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-27 11:13:26
Python Version: 3.11.10
-----------------------------------
