# Preprocessing for BoW
Available preprocessing steps:
* Tokenization
* Stopword removal
* Stemming
* Lemmatization
* nGram


```.pkl``` files were used to save space



In [1]:
# Mount Google Drive into the Colab file system at /content/drive so that the
# data cells of this notebook can read files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports and method definitions
This cell needs to be executed before creating preprocessed sets. It provides imports and functions for our preprocessing methods.

In [2]:
import pandas as pd
import numpy as np
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re, string
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from collections.abc import Iterable
from nltk import ngrams
nltk.download('wordnet')
import matplotlib.pyplot as plt


###
### Tokenization
###
def tokenize(text, sentenceSeperate=False, includePunctation=False, excludeSpecPuct=[]):
    """Lower-case ``text`` and split it into tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenize. It is lower-cased before anything else happens.
    sentenceSeperate : bool, default False
        If True, the text is first split into sentences and one token list
        per sentence is returned. If False, a single flat token list is
        returned.
    includePunctation : bool, default False
        If True, tokens are produced by NLTK's ``word_tokenize`` so that
        punctuation marks are kept as tokens. If False, a regular expression
        keeps only runs of two or more word characters, which drops
        punctuation AND single-character tokens.
    excludeSpecPuct : iterable of str, default []
        Substrings that are replaced by a space before tokenizing. Only used
        when ``includePunctation`` is True, and matched against the already
        lower-cased text. The argument is never mutated.

    Returns
    -------
    list of str, or list of list of str when ``sentenceSeperate`` is True.

    Notes
    -----
    The NLTK tokenizers used for the punctuation-preserving and per-sentence
    modes rely on NLTK's Punkt tokenizer data, which the import cell of this
    notebook does not download. Download it first if NLTK raises a
    ``LookupError``.
    """
    # Keeps only tokens of at least two word characters (punctuation is dropped).
    token_pattern = re.compile(r"(?u)\b\w\w+\b")

    def withPunctation(fragment):
        # Blank out the punctuation the caller explicitly does not want.
        for unwanted in excludeSpecPuct:
            fragment = fragment.replace(unwanted, " ")
        # Surround these separators with spaces to help the tokenizer split
        # them off from the neighbouring words.
        for separator in ["-", "/", "—"]:
            fragment = fragment.replace(separator, " " + separator + " ")
        return list(word_tokenize(fragment))

    def withoutPunctation(fragment):
        return token_pattern.findall(fragment)

    text = text.lower()
    split_words = withPunctation if includePunctation else withoutPunctation

    if not sentenceSeperate:
        return split_words(text)

    # Imported locally: the notebook's import cell only brings in
    # word_tokenize, so sent_tokenize was previously an undefined name here.
    from nltk.tokenize import sent_tokenize

    return [split_words(sentence) for sentence in sent_tokenize(text)]


###
### Stopword removal
###
def _english_stopwords():
    """Return the NLTK English stop-word set, loaded once and reused afterwards."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Reading the corpus is comparatively expensive, so keep the set on
        # the function object instead of rebuilding it for every review.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def removeStopwords(wordArray):
    """Remove English stop words from a token list.

    Parameters
    ----------
    wordArray : list
        Either a flat list of word strings or a list of sentences, each of
        which is itself a list of words. Which of the two it is gets decided
        by looking at the first element only.

    Returns
    -------
    list
        A new list with the same nesting as the input in which every word
        contained in NLTK's English stop-word list has been dropped. The
        input is not modified.
    """
    stop_set = _english_stopwords()

    def strip(items):
        # A non-string iterable in first position means "list of sentences".
        if len(items) > 0 and isinstance(items[0], Iterable) and not isinstance(items[0], str):
            return [strip(sentence) for sentence in items]
        return [word for word in items if word not in stop_set]

    return strip(wordArray)


###
### Stemming
###
def applyStemming(wordArray):
    """Reduce every word to its Porter stem.

    ``wordArray`` is either a flat list of words or a list of sentences
    (lists of words); the first element decides which. The result is a new
    list with the same nesting that holds the stemmed words.
    """
    porter = PorterStemmer()

    if len(wordArray) == 0:
        return []

    first = wordArray[0]
    if isinstance(first, Iterable) and not isinstance(first, str):
        # List of sentences: stem each sentence on its own.
        return [applyStemming(sentence) for sentence in wordArray]

    return [porter.stem(word) for word in wordArray]


###
### Lemmatizing
###
def applyLemmatizing(wordArray):
    """Lemmatize every word with WordNet, using its part-of-speech tag.

    Source (heavily modified):
    https://www.guru99.com/stemming-lemmatization-python-nltk.html

    ``wordArray`` is either a flat list of words or a list of sentences
    (lists of words); the first element decides which. Each token list is
    tagged with ``pos_tag`` and every token is lemmatized with the WordNet
    category selected by the first letter of its tag. The result is a new
    list with the same nesting as the input.
    """
    # First letter of the tag -> WordNet category; anything else counts as a noun.
    wordnet_pos = defaultdict(lambda: wn.NOUN)
    wordnet_pos.update({'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

    lemmatizer = WordNetLemmatizer()

    def lemmatize_tokens(tokens):
        return [lemmatizer.lemmatize(token, wordnet_pos[tag[0]])
                for token, tag in pos_tag(tokens)]

    holds_sentences = (
        len(wordArray) > 0
        and isinstance(wordArray[0], Iterable)
        and not isinstance(wordArray[0], str)
    )
    if holds_sentences:
        return [lemmatize_tokens(sentence) for sentence in wordArray]
    return lemmatize_tokens(wordArray)


###
### nGram
###
def addNGram(wordArray, NGramLength=2):
    """Return a copy of ``wordArray`` with its word n-grams appended.

    Parameters
    ----------
    wordArray : list
        Either a flat list of words or a list of sentences (lists of words);
        the first element decides which.
    NGramLength : int, default 2
        Number of consecutive words per n-gram.

    Returns
    -------
    list
        A shallow copy of ``wordArray`` followed by one space-joined string
        per n-gram. For a list of sentences the n-grams are built over the
        words of all sentences in sequence (so they can span sentence
        boundaries) and are appended after the sentence lists. The input
        list itself is not modified.
    """
    tokens = wordArray
    if len(wordArray) > 0 and isinstance(wordArray[0], Iterable) and not isinstance(wordArray[0], str):
        # Flatten the sentences into one running word sequence.
        tokens = ' '.join(' '.join(sentence) for sentence in wordArray).split()

    extended = wordArray.copy()
    # Each n-gram is a tuple of words; store it as one space-separated token.
    extended.extend(' '.join(gram) for gram in ngrams(tokens, NGramLength))
    return extended

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Create preprocessed set


### Load unprocessed data


In [3]:
# Load the raw review frame into the notebook-wide variable `df` and report how
# long the load took.
# The path is relative, so it resolves against the current working directory
# (presumably /content, next to the Drive mount point - verify when running
# outside Colab).
# NOTE: read_pickle unpickles arbitrary Python objects; only load trusted files.
firstTime = time.time()
df = pd.read_pickle("drive/My Drive/Feature_generated_sets/raw/Hotel_reviews_features_selected.pkl")
print ("Loaded data in: %s seconds" % round(time.time()-firstTime,4))

Loaded data in: 1.8443 seconds


### Define preprocessing steps

In [0]:
###
### Preprocessing configurations consumed by preprocess(). Every key either
### switches one step on/off or supplies an argument for that step:
###   token, token_*        -> tokenize() and its keyword arguments
###   rem_stpwrds           -> removeStopwords()
###   stemm                 -> applyStemming()
###   lemmatize             -> applyLemmatizing()
###   nGram, nGram_length   -> addNGram() and its NGramLength
###
### This dictionary was used to create two BoW sets that we ran our algorithms
### on and compared the results to different feature sets.
###
preprocessing_for_all = {
    "token": True,
    "token_sentenceSeperate": False,
    "token_includePunctation": False,
    "token_excludeSpecPuct": [],
    "rem_stpwrds": True,
    "stemm": True,
    "lemmatize": False,
    "nGram": False,
    "nGram_length":2
}

###
### Same settings as preprocessing_for_all except that stemming is switched off.
### Tried out feature sets with 53 and 1550 words with this preprocessing.
###
no_stemming = {
    "token": True,
    "token_sentenceSeperate": False,
    "token_includePunctation": False,
    "token_excludeSpecPuct": [],
    "rem_stpwrds": True,
    "stemm": False,
    "lemmatize": False,
    "nGram": False,
    "nGram_length":2
}

### Apply preprocessing steps

In [5]:
def preprocess(review, dict):
    """Run the preprocessing steps enabled in ``dict`` on one review.

    Parameters
    ----------
    review : str or list
        The review to process; the raw text when tokenization is enabled.
    dict : dict
        Preprocessing configuration with the keys "token",
        "token_sentenceSeperate", "token_includePunctation",
        "token_excludeSpecPuct", "rem_stpwrds", "stemm", "lemmatize",
        "nGram" and "nGram_length". (The parameter name shadows the builtin
        ``dict`` inside this function.)

    Returns
    -------
    The review after tokenization, stop-word removal, stemming,
    lemmatizing and n-gram extension, applied in that order and each only
    if its flag is truthy.
    """
    options = dict

    if options["token"]:
        review = tokenize(
            review,
            sentenceSeperate=options["token_sentenceSeperate"],
            includePunctation=options["token_includePunctation"],
            excludeSpecPuct=options["token_excludeSpecPuct"],
        )

    # Steps that take nothing but the token list, in the order they run.
    simple_steps = (
        ("rem_stpwrds", removeStopwords),
        ("stemm", applyStemming),
        ("lemmatize", applyLemmatizing),
    )
    for flag, step in simple_steps:
        if options[flag]:
            review = step(review)

    if options["nGram"]:
        review = addNGram(review, NGramLength=options["nGram_length"])

    return review

def createPreprocessing(dict, output_path, sharedFolder = False):
    """Preprocess the "Review" column of the global ``df`` and pickle the frame.

    Parameters
    ----------
    dict : dict
        Preprocessing configuration handed to ``preprocess`` for every review.
    output_path : str
        Where the resulting DataFrame is written with ``to_pickle``.
    sharedFolder : bool, default False
        Currently not used by this function.

    Notes
    -----
    The "Review" column of the notebook-wide ``df`` is overwritten in place,
    so ``df`` no longer holds the raw reviews after this call.
    """
    processed_reviews = df["Review"].apply(preprocess, args=(dict,))
    df["Review"] = processed_reviews
    df.to_pickle(output_path)


# Build the preprocessed set for the `no_stemming` configuration and time it.
# createPreprocessing overwrites df["Review"] in place, so reload `df` before
# running this cell again or with another configuration.
firstTime = time.time()
output_path = "no_stemming.pkl" # File the preprocessed DataFrame is pickled to
createPreprocessing(no_stemming, output_path) # Applies the steps enabled in the configuration and saves the result to 'output_path'
print ("Generated preprocessed set in: %s seconds" % round(time.time()-firstTime,4))

Generated preprocessed set in: 73.2665 seconds
