# Preprocessing

## Abstract
<ul>
    <li><b>Input</b>: Raw data</li>
    <li><b>Process</b>: Applying some standard preprocessing algorithms for text classification and binning of our target variable.</li> 
    <li><b>Output</b>: Datasets with individual preprocessing steps applied</li>
</ul>

In [1]:
import pandas as pd
import time 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re, string
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from collections.abc import Iterable
from nltk import ngrams
nltk.download('wordnet')


hotelData = pd.read_csv('../data/Hotel_Reviews.csv')  


hotelData.head(2)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968


In [2]:
hotelData["Negative_Review"] = hotelData["Negative_Review"].apply(lambda row: "" if "No Negative" == row else row)
hotelData["Positive_Review"] = hotelData["Positive_Review"].apply(lambda row: "" if "No Positive" == row else row)

hotelData["Review"] = hotelData["Positive_Review"] + " " + hotelData["Negative_Review"]

df = hotelData[["Review","Reviewer_Score"]]
df.head(2)

Unnamed: 0,Review,Reviewer_Score
0,Only the park outside of the hotel was beauti...,2.9
1,No real complaints the hotel was great great ...,7.5


In [3]:
#Different Preprocessing strategies

def tokenize(text, sentenceSeperate=False, includePunctation= False, excludeSpecPuct =[]):
    data = [] 
    #intern functions
    def withPunctation(text):
        temp = [] 
        #delete unwanted punctuation
        for delPunct in excludeSpecPuct:
            text = text.replace(delPunct, " ")
        #help tokenization with replacing some untokenized punctations
        for puct in ["-","/","—"]:
            text = text.replace(puct, " "+puct+" ")
         # tokenize the sentence into words 
        for j in word_tokenize(text): 
            temp.append(j)
        return temp
    
    def withoutPunctation(text):
        token_pattern = re.compile(r"(?u)\b\w\w+\b") # split on whitespace (and remove punctation)
        return token_pattern.findall(text)
    
    text = text.lower()
    
    if sentenceSeperate:
        # iterate through each sentence in the file 
        for sentence in sent_tokenize(text): 
            if includePunctation:
                data.append(withPunctation(sentence))
            else:
                data.append(withoutPunctation(sentence))
    else:
        if includePunctation:
            data = withPunctation(text)
        else:
            data = withoutPunctation(text)
    return data
     

def removeStopwords(wordArray):
    
    my_stopwords = set(stopwords.words('english'))
    withoutStopwords = []
    
    #test if its a list of words or a list of sentences with words
    if len(wordArray)>0 and isinstance(wordArray[0], Iterable)and not isinstance(wordArray[0], str):
        for sentence in wordArray:
            withoutStopwords.append(removeStopwords(sentence))
            
    else:  
        for item in wordArray:
            if item not in my_stopwords:
                withoutStopwords.append(item)
    return withoutStopwords

def applyStemming(wordArray):
    stemmer = PorterStemmer()
    stems = []
    
    #test if its a list of words or a list of sentences with words
    if len(wordArray)>0 and isinstance(wordArray[0], Iterable)and not isinstance(wordArray[0], str):
        for sentence in wordArray:
            stems.append(applyStemming(sentence))
    else:       
        for item in wordArray:
            stems.append(stemmer.stem(item))
    return stems


def applyLemmatizing(wordArray): # Quelle (stark verändert): https://www.guru99.com/stemming-lemmatization-python-nltk.html
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV   
    
    #intern function
    def lemmazizeText(text):
        temp = []
        for token, tag in pos_tag(text):
            temp.append(lemma_function.lemmatize(token, tag_map[tag[0]]))
        return temp

            
    lemma_function = WordNetLemmatizer()
    baseWords = []
    if len(wordArray)>0 and isinstance(wordArray[0], Iterable)and not isinstance(wordArray[0], str):
        for sentence in wordArray:
            baseWords.append(lemmazizeText(sentence))
    else:
        baseWords =lemmazizeText(wordArray)
     
    return baseWords

def addNGram(wordArray, NGramLength=2):
    holetext= wordArray
    temp = []
    if len(wordArray)>0 and isinstance(wordArray[0], Iterable)and not isinstance(wordArray[0], str):
        print("drin")
        for sentence in wordArray:
            temp.append(' '.join(sentence))
        holetext = (' '.join(temp)).split()
    nGrams = list(ngrams(holetext, NGramLength))
    
    # make nGram from two words to one 
    nGramsFull = pd.Series(nGrams).apply(lambda row: ' '.join(row))
    wordArrayCopy = wordArray.copy()
    wordArrayCopy.extend(nGramsFull)
    return (wordArrayCopy)

In [4]:
#Binning

def binning(score):
    result = "SOMETHING VERY WEIRD HAPPENED HERE"
    
    if (score <= 10) & (score >=8.5):
        return 0
    if (score < 8.5) & (score >=7.0):
        return 1
    if (score < 7.0) & (score >=0):
        return 2
    
    return result

df["Reviewer_Score"] = df["Reviewer_Score"].apply(lambda score: binning(score))
df.head(2)

df.to_csv("../data/Hotel_reviews_features_selected.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [8]:
std_dict = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "rem_stpwrds": True,
    "stemm": True,
    "lemmatize": True,
    "nGram": True,
    "nGram_length":2
}

def preprocess(review,dict):
    if dict["token"]:
        review = tokenize(review,sentenceSeperate=dict["token_sentenceSeperate"],includePunctation=dict["token_includePunctation"],excludeSpecPuct=dict["token_excludeSpecPuct"] )
    if dict["remStpwrds"]:
        review = removeStopwords(review)
    if dict["stemm"]:
        review = applyStemming(review)
    if dict["lemmatize"]:
        review = applyLemmatizing(review)
    if dict["nGram"]:
        review = addNGram(review,NGramLength=dict["nGram_length"])
    
    return review
        
#abstractsPro.append(addNGram(applyLemmatizing(removeStopwords(tokenize(abstract)))))

In [10]:
dict = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": True,
    "lemmatize": True,
    "nGram": True,
    "nGram_length":2
}

# Making new Preprocessed sets
Do you want to make a new preprocessed set. Just start here.

In [47]:
import pandas as pd
import time 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re, string
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from collections.abc import Iterable
from nltk import ngrams
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/benedikt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/benedikt/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/benedikt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<b>Just copy and paste this and edit to your needs</b>

In [12]:
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv")

dict = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": True,
    "lemmatize": False,
    "nGram": True,
    "nGram_length":2
}

df["Review"] = df["Review"].apply(lambda review: preprocess(review,dict))
df.to_csv("../data/preprocessed/"+"_".join(str(key) + str(value) for key, value in dict.items())+".csv")

KeyboardInterrupt: 

In [11]:
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv")

dict = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": False,
    "lemmatize": False,
    "nGram": True,
    "nGram_length":2
}

df["Review"] = df["Review"].apply(lambda review: preprocess(review,dict))
df.to_csv("../data/preprocessed/"+"_".join(str(key) + str(value) for key, value in dict.items())+".csv")

SyntaxError: invalid syntax (<ipython-input-11-e49f262543db>, line 6)

In [61]:
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv") 

dict = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": True,
    "lemmatize": False,
    "nGram": True,
    "nGram_length":3
}

df["Review"] = df["Review"].apply(lambda review: preprocess(review,dict))
df.to_csv("../data/preprocessed/"+"_".join(str(key) + str(value) for key, value in dict.items())+".csv")



In [62]:
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv") #

dict = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": True,
    "lemmatize": False,
    "nGram": False,
    "nGram_length":2
}

df["Review"] = df["Review"].apply(lambda review: preprocess(review,dict))
df.to_csv("../data/preprocessed/"+"_".join(str(key) + str(value) for key, value in dict.items())+".csv")