### Natural Language Processing
We will use the NLTK (Natural Language Toolkit) Python library.  
See: https://www.nltk.org/  


#### Tokens
A tokenizer splits text into words and sentences. This is a non-trivial task.

#### Stemming
Stemming removes the ending of a word to reduce it to its stem.

#### Lemmatizer
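Similar to stemming, but maps a word to its dictionary form (lemma).
```python
from nltk.stem.wordnet import WordNetLemmatizer
```
The lemmatizer is based on WordNet, a lexical database from Princeton University.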

#### POS-Tagging
Tag each word with its part of speech (e.g. noun, verb, etc.).

#### Dataset: Amazon Fine Food Reviews

https://www.kaggle.com/snap/amazon-fine-food-reviews

In [None]:
# import
import pandas as pd

import nltk
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# NLTK data packages are downloaded separately from the library itself; see
# https://www.nltk.org/data.html
# Route the NLTK downloader through an HTTP proxy at 127.0.0.1:3128.
# NOTE(review): this assumes a proxy is listening on the local machine --
# confirm, or drop this line when running without one.
nltk.set_proxy('http://127.0.0.1:3128')
nltk.download("popular") # Download popular trained models

In [None]:
# Add tags to words
text = "He went into a supermarket in St. Petersburg. There, he bought a Knusperbroetchen for 9.99 $. He knew it better."

# Split the text into sentences first; the sample contains an abbreviation
# ("St.") and a decimal number ("9.99") to show that this is more than
# splitting on every period.
sentences = nltk.sent_tokenize(text)

for sentence in sentences:
    # Tokenize once and reuse the tokens for both printing and tagging
    tokens = nltk.word_tokenize(sentence)

    # Print words of a sentence
    print(tokens)

    # Tag each word (e.g. adjective, past, etc.); yields (word, tag) pairs
    tagged_words = nltk.pos_tag(tokens)

    # Build the "word/tag" strings once, then print them one per line and
    # finally as the complete tagged sentence
    final_sentence = [word + "/" + tag for word, tag in tagged_words]
    for entry in final_sentence:
        print(entry)
    print("Final sentence: ", final_sentence)

In [None]:
# Stemming - Remove word endings 
from nltk.stem import SnowballStemmer

# Create a Snowball stemmer configured for German.
# `s` is reused by the stemming examples in the following cell.
s = SnowballStemmer("german")

In [None]:
# Stem a few German example words and print each next to its stem
for german_word in ("Autohäuser", "gegangen"):
    print(german_word + ": ", s.stem(german_word))

In [None]:
# Lemmatizer: print two inflected verb forms next to their lemma
from nltk.stem.wordnet import WordNetLemmatizer

# `l` is reused by the sentence lemmatization cell further down
l = WordNetLemmatizer()
for verb_form in ("going", "went"):
    # "v" tells the lemmatizer to treat the word as a verb
    print(verb_form + ": ", l.lemmatize(verb_form, "v"))

In [None]:
from nltk.corpus import wordnet
# Function which converts nltk tags to wordnet
# Source: https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag produced by nltk into a WordNet POS constant.

    The decision is made on the first letter of the tag:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Every other tag is mapped to noun.

    Args:
        treebank_tag: Tag string as found in the (word, tag) pairs returned
            by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # No known prefix matched: fall back to noun
    return wordnet.NOUN

In [None]:
# Lemmatize sentence: tokenize, tag, then lemmatize each word using the
# WordNet POS derived from its tag
tokens = nltk.word_tokenize("He went to his friends.")
words_tagged = nltk.pos_tag(tokens)

for token, treebank_tag in words_tagged:
    wordnet_pos = get_wordnet_pos(treebank_tag)
    print(l.lemmatize(token, wordnet_pos))

In [None]:
# Import pandas to load the review dataset
import pandas as pd

In [None]:
# Load reviews
# Read the bz2-compressed CSV into a DataFrame; the path is relative to the
# current working directory.
# NOTE(review): the file name suggests a 10,000-row sample of the Amazon
# Fine Food Reviews dataset -- confirm.
df = pd.read_csv("../res/Reviews_10000.csv.bz2")
# Display the first rows as the cell output
df.head()

In [None]:
# Extract adjectives from reviews
# Append e.g. [:1000] to the column selection to experiment on a subset.
texts = df["Text"]
texts_transformed = []

for review_text in texts:
    # Split the review into sentences, then tokenize and POS-tag each
    # sentence lazily; every item is a list of (word, tag) pairs
    tagged_sentences = (
        nltk.pos_tag(nltk.word_tokenize(sentence_text))
        for sentence_text in nltk.sent_tokenize(review_text)
    )

    # Keep only words whose tag is exactly "JJ"; all other tags are skipped
    review_adjectives = [
        token
        for tagged_sentence in tagged_sentences
        for token, tag in tagged_sentence
        if tag == "JJ"
    ]

    # One space-separated string of adjectives per review
    texts_transformed.append(" ".join(review_adjectives))

In [None]:
# import for machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Prepare data
# Features: the per-review strings built in the extraction step
x = texts_transformed
# Target: boolean Series, True where the review's "Score" is at least 4
y = df["Score"] >=4

In [None]:
# Split data into train and test parts (fixed seed for reproducibility)
docs_train, docs_test, y_train, y_test = train_test_split(x, y, random_state = 0)

# Count features and vectorize.
# The vocabulary is limited to the 50 most frequent terms and is learned
# from the training documents only; the test documents are transformed
# with that same vocabulary.
cv = CountVectorizer(max_features = 50)
x_train = cv.fit_transform(docs_train)
x_test = cv.transform(docs_test)

In [None]:
# Train model: fit a multinomial Naive Bayes classifier on the training
# counts (fit returns the estimator itself)
model = MultinomialNB().fit(x_train, y_train)

# Evaluate on the held-out test split and print the score
test_score = model.score(x_test, y_test)
print(test_score)

In [None]:
# Print trained adjectives
# Per-feature log probabilities for the last class in model.classes_ (the
# positive class for a binary target). These are the values formerly exposed
# as `model.coef_[0]`; `coef_` has been deprecated and removed from
# MultinomialNB in newer scikit-learn releases, whereas `feature_log_prob_`
# is available in old and new versions alike.
log_probs = model.feature_log_prob_[-1]

# `get_feature_names` was superseded by `get_feature_names_out` and later
# removed; prefer the new method and fall back for old scikit-learn versions.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Pair each log probability with its adjective and sort ascending, so the
# adjectives most typical for the positive class come last
adj = sorted(zip(log_probs, feature_names))

# Print adjectives sorted
for entry in adj:
    print(entry)