In [17]:
# https://www.geeksforgeeks.org/text-preprocessing-in-python-set-1/
# https://colab.research.google.com/github/gal-a/blog/blob/master/docs/notebooks/nlp/nltk_preprocess.ipynb

# Import packages

In [18]:
# Google Colab only: mount Google Drive so the project data is reachable.
# Skip the mount and the chdir below when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

# Google Colab only: make the project data folder the working directory,
# so the relative file paths used by later cells resolve against it.
import os
os.chdir('/content/drive/My Drive/Carbon_price_prediction/Workspace/Data')

import nltk
# Fetch the NLTK data packages needed below: the stopword lists, WordNet
# (for lemmatization), the Punkt tokenizer models and the perceptron POS tagger.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import pandas as pd
import matplotlib.pyplot as plt
import io
import unicodedata
import numpy as np
import re
import string
import pickle
import time

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Custom functions

In [19]:
# Remove accents function
def remove_accents(data):
    """Return `data` reduced to ASCII letters and spaces.

    The string is first NFKD-normalized, which splits accented characters
    into a base letter plus combining marks. Every character that is not an
    ASCII letter or a space is then dropped, so besides accents this also
    removes digits, punctuation and any other symbol.
    """
    kept_chars = []
    for char in unicodedata.normalize('NFKD', data):
        if char == " " or char in string.ascii_letters:
            kept_chars.append(char)
    return ''.join(kept_chars)

# Parameters / Constants

In [20]:
# Constants
# Two-letter POS tag prefix -> one-letter POS code handed to the lemmatizer
# (nouns, adjectives, verbs and adverbs)
DI_POS_TYPES = dict(NN='n', JJ='a', VB='v', RB='r')
POS_TYPES = [*DI_POS_TYPES]

# Constraints on tokens
MIN_STR_LEN = 3          # minimum token length that is kept
RE_VALID = '[a-zA-Z]'    # a token must contain at least one ASCII letter

# Data import

In [21]:
# Load the merged articles from the working directory; the first CSV column becomes the index.
raw_text = pd.read_csv( "./no_keyword_merged_articles.csv", index_col=0)
# Preview the first rows
raw_text.head()

Unnamed: 0,date,text
0,2017-01-01,conceit every generation believe experience un...
1,2017-01-01,2016 comes close world leaders appear eager st...
2,2017-01-01,process automatic browser redirect requested c...
3,2017-01-01,labour’s divisions immigration broken party’s ...
4,2017-01-01,established political order came crashing grou...


In [22]:
# (rows, columns) of the loaded frame
raw_text.shape

(18939, 2)

In [23]:
# Pair each article's date with its text: one (date, text) tuple per row
raw_text_iter = list(zip(raw_text["date"], raw_text["text"]))

In [24]:
# Number of (date, text) pairs to process
len(raw_text_iter)

18939

In [32]:
# %time
# # Check if ETS 3-gram keywords are really not present
# keywords = ['European Trading System', 'European Trading Scheme',
#             'Emissions Trading System', 'Emissions Trading Scheme']

# keyword_matches = {}
# for keyword in keywords:
#     keyword_matches[keyword] = raw_text.text.str.contains(keyword, case=False).sum()

# print(keyword_matches)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
{'European Trading System': 0, 'European Trading Scheme': 0, 'Emissions Trading System': 26, 'Emissions Trading Scheme': 13}


In [None]:
# # Test 3-gram keywords
# raw_text_iter = [(1, 'test European Trading System right'),
#                  (1, 'test European Trading Scheme right'),
#                  (1, 'test Emissions Trading System right'),
#                  (1, 'test Emissions Trading Scheme right'),]

# NLP preprocessing

In [None]:
# Get stopwords, stemmer and lemmatizer
# `stopwords` is the English stopword list from the NLTK corpus
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
# NOTE: the `%time` line magic only times the statement that follows it on the
# same line, so `%time # comment` timed nothing and reported a negligible
# duration. The cell is timed explicitly below instead (alternatively, put the
# `%%time` cell magic on the very first line of the cell).

start = time.perf_counter()

# Loop invariants, built once:
# - set membership is O(1) versus scanning the stopword list for every token
# - str.translate() needs a table built by str.maketrans(); passing
#   string.punctuation directly does not strip anything
stopword_set = set(stopwords)
punctuation_table = str.maketrans('', '', string.punctuation)
n_articles = len(raw_text_iter)

# Process all article texts; one (date, "lemma lemma ...") tuple per article
lemmatized_results = []

# Number of articles actually processed (rows without a string text are skipped)
counter = 0

for row_idx, (date, text) in enumerate(raw_text_iter):

    # Progress is reported on the row index so that skipped rows cannot stall
    # the counter or repeat the same message
    if row_idx % 1000 == 0:
        print(f"Iteration: {row_idx + 1}/{n_articles}")

    # Skip rows whose text is not a string (e.g. missing values)
    if not isinstance(text, str):
        continue

    # Tokenize by sentence, then by lowercase word
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    lemmas = []
    # Process all tokens per article text
    for token in tokens:
        # Remove accents, then punctuation
        t = remove_accents(token).translate(punctuation_table)

        # "-" represents "no lemmatization match"; it is kept unless the token
        # passes every filter below and has one of the supported POS types
        lemma = "-"

        if t not in stopword_set and re.search(RE_VALID, t) and len(t) >= MIN_STR_LEN:
            # The POS (Part Of Speech) is necessary as input to the lemmatizer
            # (otherwise it assumes the word is a noun). Only the first two
            # characters of the tag are used for the lookup.
            # NOTE: each token is tagged on its own, without sentence context.
            pos = nltk.pos_tag([t])[0][1][:2]
            if pos in DI_POS_TYPES:
                lemma = lemmatizer.lemmatize(t, pos=DI_POS_TYPES[pos])

        lemmas.append(lemma)

    # Build one space-separated string from the lemmatized tokens
    lemmatized_results.append((date, ' '.join(lemmas)))

    # Increment counter
    counter += 1


end = time.perf_counter()
print('Code execution took', round(end-start, 2), 'seconds.')

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.48 µs
Iteration: 1/4
Code execution took 0.01 seconds.


In [None]:
# Build the result frame from the (date, lemmatized_text) tuples.
# The column names are passed to the constructor instead of being assigned to
# `.columns` afterwards: an empty `lemmatized_results` then yields an empty
# two-column frame rather than a length-mismatch error.
lemmatized_text_df = pd.DataFrame(lemmatized_results, columns=['date', 'lemmatized_text'])

# Sanity check: dimensions and first rows
print(lemmatized_text_df.shape)
print(lemmatized_text_df.head())

(4, 2)
   date                     lemmatized_text
0     1  test european trading system right
1     1  test european trading scheme right
2     1  test emission trading system right
3     1  test emission trading scheme right


# Export results

In [None]:
# Write the lemmatized frame to CSV in the working directory.
# TODO: decide between CSV and pickle once the preprocessing workflow is
# complete; the choice depends on the final format.
lemmatized_text_df.to_csv('./lemmatized_merged_articles.csv')
# preprocessed_text_df.to_csv(f'./lemmatized_merged_articles.csv')

In [None]:
# Store data (serialize): pickle the list of (date, lemmatized_text) tuples.
# The minimum token length is part of the file name.
pickle_path = f'lemmatized_merged_articles_{MIN_STR_LEN}.pkl'
with open(pickle_path, 'wb') as handle:
    pickle.dump(lemmatized_results, handle)

# Support

In [None]:
lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    """Tokenize `text` and lemmatize each token as a verb.

    Every token is lemmatized with the part of speech fixed to 'v',
    regardless of its actual role in the text.

    Returns the list of lemmas in token order.
    """
    verb_lemmas = []
    for token in word_tokenize(text):
        verb_lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return verb_lemmas

In [None]:
# Hand-written sample string; note it is overwritten by the next assignment
test = """deforestation forestation emission european trading system solar power nox ccus photovoltaic aren't"""
# Use the text of the first loaded article as the input instead
test = raw_text.text.iloc[0]

# Show the first five lemmas
lemmatize_word(test)[:5]