In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob_de import TextBlobDE as TextBlob
from nltk.stem.snowball import SnowballStemmer
import re
import nltk
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
from itertools import chain
import numpy as np
import pickle

In [10]:
# funktion um Auswirkungen der Preprocessing Schritte zu untersuchen
def count_words(data, message=""):
    _ = [word_tokenize(x) for x in data]
    unique_words = len(set(list(chain.from_iterable(_))))
    total_words = len(list(chain.from_iterable(_)))
    print("Step: {}".format(message))
    print("Count unique Words: {}".format(unique_words))
    print("Count Words total: {}".format(total_words))
    print()
    
# regex welche für die Negationswörter abdeckt
negation_regex = "nicht[s]?|kein|keine[rsmn]|nie|niemals|ohne|kaum"

In [11]:
# einlesen des Datensatzes
thomann_data = pd.read_csv("thomann_reviews_all.csv")
thomann_data["review_old"] = thomann_data["review"]

In [12]:
# Wörter werden gleich zu Beginn gezählt, da späteres Negationshandling die originale Satzstruktur benötigt 
# aber die Auswirkungen auf den Korpus dennoch ausgegeben werden sollen

# keine Sonderzeichen zählen
baseline = [re.sub("[^\w]"," ", rev) for rev in thomann_data["review"]]

In [13]:
# POS Tagging um das Negationshandling zu ermöglichen
thomann_data["taggedWords"] = [TextBlob(review).tags for review in thomann_data["review"]]

def identify_negations_bigram(tagged_review_words):
    word_tag_pairs = nltk.bigrams(tagged_review_words)
    # findet Adjektive oder Nomen innerhalb der Reviewtags, welche Negation vorangestellt haben und gibt diese Wort-Tupel zurück
    matched_pairs = [(a[0], b[0]) for (a, b) in word_tag_pairs if re.match(negation_regex, a[0],flags=re.IGNORECASE) 
                                                                 and (re.match("JJ*",b[1]) or re.match("NN*",b[1])) 
                                                                 and re.match("[\w]+",b[0])]
    # instanziiere Hilfslisten
    search_list = list()
    replacement_list = list()
    # für alle gefunden Tupel 
    for pair in matched_pairs:
        # konvertiere Tupel zu String 
        search_string = ' '.join(pair)
        # hänge an das zweite Wort NOT_
        replacement = search_string.split()[0] +" NOT_"+ search_string.split()[1]
        # hänge Ergebnisse an Liste
        search_list.append(search_string)
        replacement_list.append(replacement)
    # übergebe gezippte Liste
    return list(zip(search_list,replacement_list))
# speichere identifizierte negations Bigrams in neuer Spalte
thomann_data["replace_negations_bigram"] = thomann_data["taggedWords"].apply(identify_negations_bigram)

def identify_negations_trigram(tagged_review_words):
    word_tag_tris = nltk.trigrams(tagged_review_words)
    matched_tris = [(a[0],b[0],c[0]) for (a,b,c) in word_tag_tris if re.match(negation_regex, a[0],flags=re.IGNORECASE) 
                                                                    and re.match("RB*", b[1]) 
                                                                    and (re.match("JJ*",c[1]) or re.match("NN*",c[1])) 
                                                                    and re.match("[\w]+",c[0])]
    # instanziiere Hilfslisten
    search_list = list()
    replacement_list = list()
    for pair in matched_tris:
        # konvertiere Tupel zu String 
        search_string = ' '.join(pair)
        # hänge an das zweite Wort NOT_
        replacement = search_string.split()[0] + " " + search_string.split()[1] +" NOT_"+ search_string.split()[2]
        # hänge Ergebnisse an Liste
        search_list.append(search_string)
        replacement_list.append(replacement)
    # übergebe gezippte Liste
    return list(zip(search_list,replacement_list))
# speichere identifizierte negations Trigrams in neuer Spalte
thomann_data["replace_negations_trigram"] = thomann_data["taggedWords"].apply(identify_negations_trigram)

# Merge der beiden Ersetzungen
thomann_data["replace_negations"] = thomann_data["replace_negations_trigram"] + thomann_data["replace_negations_bigram"]
#thomann_data.drop(["replace_negations_trigram","replace_negations_bigram"], inplace = True, axis = 1)

# ersetzte die neuen NOT Wörter im Review Text
def replacement(x):
    review = x["review"]
    for t in x["replace_negations"]:
        review = re.sub(t[0], t[1], review)
    return review
thomann_data["review"] = thomann_data.apply(replacement, axis = 1)
thomann_data.drop(["replace_negations"], inplace = True, axis = 1)

# nun sind die Tags zwar bei der Spalte richtig aber die getaggten Tupel sind noch falsch
# es erfolgt ein Abgleich zwischen der alten spalte "taggedWords" (hier sind die POS-Tags noch richtig) und der neuen Spalte
# "taggedWords_neg" (hier sind die Wörter richtig z.B. not_gut)
thomann_data["taggedWords_neg"] = [TextBlob(review).tags for review in thomann_data["review"]]

def choose_right_POS(x):
    # words = z[0][0]
    # tags = z[1][1]
    list_of_tupels = list()
    for z in zip(x["taggedWords_neg"],x["taggedWords"]):
        list_of_tupels.append((z[0][0],z[1][1]))
    return list_of_tupels
thomann_data["taggedWords_neg"] = thomann_data.apply(choose_right_POS, axis = 1)

In [14]:
# PREPROCESSING mit Beachtung der Satzstruktur, ist nur dazu da, dass tokenSent_pp_neg erstellt werden kann (wird für W2V training benötigt)
for mode in ["review","review_old"]:

    thomann_data["review_temp"] = [re.sub("[^\w\.\!\?]"," ", rev) for rev in thomann_data[mode]]
    thomann_data["review_temp"] = [rev.lower() for rev in thomann_data["review_temp"]]
    thomann_data["review_temp"] = [re.sub("[0-9]+"," ", rev) for rev in thomann_data["review_temp"]]
    thomann_data["review_temp"] = [re.sub("[ \t]+ "," ", rev) for rev in thomann_data["review_temp"]]

    # stopwords
    stop = stopwords.words('german')
    # extend 
    stop.append("dass")
    thomann_data["review_temp"] = thomann_data["review_temp"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

    # stemming 
    stemmer = SnowballStemmer("german")
    def stem(review):
        all_words = review.split()
        all_words_stemmed = [stemmer.stem(x) for x in all_words]
        return " ".join(all_words_stemmed)
    thomann_data["review_temp"] = thomann_data["review_temp"].apply(stem)
    
    # wenn es sich um rohe reviews handelt erstelle die preprocess only Reviewtext
    if mode == "review_old":
        thomann_data["review_pp_only"] = thomann_data["review_temp"]
    
    # wenn es sich um not_ reviews handelt erstelle tokenSent
    if mode == "review":
        thomann_data["tokenSent_pp_neg"] = [sent_tokenize(review) for review in thomann_data["review_temp"]]

    thomann_data.drop(["review_temp"], inplace = True, axis = 1)

In [15]:
# PREPROCESSING NORMAL
count_words(baseline, "Ausgangssituation")
# entferne alles was kein buchstabe oder zahl ist
thomann_data["review"] = [re.sub("[^\w]"," ", rev) for rev in thomann_data["review"]]

test = [re.sub("[^\w]"," ", rev) for rev in thomann_data["review"]]

count_words(thomann_data["review"], "nach NOT")
thomann_data["review"] = [rev.lower() for rev in thomann_data["review"]]
count_words(thomann_data["review"], "nach Case Insensitive")
thomann_data["review"] = [re.sub("[0-9]+"," ", rev) for rev in thomann_data["review"]]
count_words(thomann_data["review"], "ohne Zahlen")
thomann_data["review"] = [re.sub("[ \t]+ "," ", rev) for rev in thomann_data["review"]]

# stopwords
stop = stopwords.words('german')
# extend 
stop.append("dass")
thomann_data["review"] = thomann_data["review"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
count_words(thomann_data["review"], "ohne Stoppwörter")
# stemming 
stemmer = SnowballStemmer("german")
def stem(review):
    all_words = review.split()
    all_words_stemmed = [stemmer.stem(x) for x in all_words]
    return " ".join(all_words_stemmed)
thomann_data["review"] = thomann_data["review"].apply(stem)
thomann_data["tokenWord_pp_neg"] = [word_tokenize(review) for review in thomann_data["review"]]

count_words(thomann_data["review"], "nach Stemmer")


Step: Ausgangssituation
Count unique Words: 69305
Count Words total: 2183298

Step: nach NOT
Count unique Words: 72869
Count Words total: 2183298

Step: nach Case Insensitive
Count unique Words: 63310
Count Words total: 2183298

Step: ohne Zahlen
Count unique Words: 60777
Count Words total: 2163233

Step: ohne Stoppwörter
Count unique Words: 60550
Count Words total: 1116581

Step: nach Stemmer
Count unique Words: 43308
Count Words total: 1116581



In [16]:
with open('enriched_thomann_data.pickle', 'wb') as f:
        pickle.dump(thomann_data, f, protocol=2)