In [55]:
import nltk
from nltk import ngrams
# Pakckages importeres
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd

# Pathlib bruges til at opsætte sti til fil
from pathlib import Path

In [54]:
# Path til fil opsættes
cwd = Path.cwd()
csv_path = Path(r'C:\Users\mpede\source\repos\mpeder75\The Ai engineer course\utils\tripadvisor_hotel_reviews.csv')

In [16]:
# Data sæt indlæses i variabel
data = pd.read_csv(csv_path)

In [17]:
# Data info viser info om filen
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  109 non-null    object
 1   Rating  109 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


In [18]:
# Data head viser de første 5 rækker
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [19]:
# Her kigger vi på 1 review med flere detaljer
data['Review'][0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

In [20]:
# Konverter tekst til lowercase i ny kolonne 'review_lowercase'
data['review_lowercase'] = data['Review'].str.lower()

In [21]:
# Nu tjekkes om 'review_lowervase' er oprettet
data.head()

Unnamed: 0,Review,Rating,review_lowercase
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso..."


In [22]:
# Nu fjernes stopwords
en_stopwords = stopwords.words('english')

In [23]:
# Vi ønsker ligledes at ordet 'not' ikke skal være et stopword
en_stopwords.remove('not')

In [26]:
# Tekst uden stopwords placveres i ny kolonne 'review_nostopwords'
# .apply bruges til at anvende en funktion på hver række i kolonnen
data['review_no_stopwords'] = (
    data['review_lowercase'].apply(
        lambda x: ' '.join([
            word for word in x.split()
            if word not in (en_stopwords)]))
)

In [28]:
# Tjekker om kolonnen er oprettet
data['review_no_stopwords'][0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe noisy neighbors, aveda bath products nice, not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience pay 40 parking night,'

In [29]:
# Fjerne punktum men bibeholde tegnet *
data['review_no_stopwords_no_punct'] = (
    data.apply(lambda x: re.sub(r"[*]", "star", x['review_no_stopwords']), axis=1))

In [30]:
# Tjekker om kolonnen er oprettet
data.head()

Unnamed: 0,Review,Rating,review_lowercase,review_no_stopwords,review_no_stopwords_no_punct
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4star experience hotel monaco s...
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso..."


In [32]:
# Stemming og lemmatization
data['review_no_stopwords_no_punct'] = data.apply(
    lambda x: re.sub(r"([^\w\s])","", x['review_no_stopwords_no_punct']), axis=1)

In [36]:
# Tokenize tekst
data['tokenized'] = data.apply(lambda x: word_tokenize(x['review_no_stopwords_no_punct']), axis=1)

In [37]:
# Tjekker at tokenized
data['tokenized'][0]

['nice',
 'hotel',
 'expensive',
 'parking',
 'got',
 'good',
 'deal',
 'stay',
 'hotel',
 'anniversary',
 'arrived',
 'late',
 'evening',
 'took',
 'advice',
 'previous',
 'reviews',
 'valet',
 'parking',
 'check',
 'quick',
 'easy',
 'little',
 'disappointed',
 'nonexistent',
 'view',
 'room',
 'room',
 'clean',
 'nice',
 'size',
 'bed',
 'comfortable',
 'woke',
 'stiff',
 'neck',
 'high',
 'pillows',
 'not',
 'soundproof',
 'like',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bangs',
 'doors',
 'opening',
 'closing',
 'hear',
 'people',
 'talking',
 'hallway',
 'maybe',
 'noisy',
 'neighbors',
 'aveda',
 'bath',
 'products',
 'nice',
 'not',
 'goldfish',
 'stay',
 'nice',
 'touch',
 'taken',
 'advantage',
 'staying',
 'longer',
 'location',
 'great',
 'walking',
 'distance',
 'shopping',
 'overall',
 'nice',
 'experience',
 'pay',
 '40',
 'parking',
 'night']

In [39]:
# Stemmer instans opsættes
ps = PorterStemmer()


In [40]:
# Stemming anvendes på tokenized kolonne
data['stemmed'] = data['tokenized'].apply(lambda tokens: [ps.stem(token) for token in tokens])

In [41]:
data.head()

Unnamed: 0,Review,Rating,review_lowercase,review_no_stopwords,review_no_stopwords_no_punct,tokenized,stemmed
0,nice hotel expensive parking got good deal sta...,4,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,nice hotel expensive parking got good deal sta...,"[nice, hotel, expensive, parking, got, good, d...","[nice, hotel, expens, park, got, good, deal, s..."
1,ok nothing special charge diamond member hilto...,2,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,ok nothing special charge diamond member hilto...,"[ok, nothing, special, charge, diamond, member...","[ok, noth, special, charg, diamond, member, hi..."
2,nice rooms not 4* experience hotel monaco seat...,3,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4* experience hotel monaco seat...,nice rooms not 4star experience hotel monaco s...,"[nice, rooms, not, 4star, experience, hotel, m...","[nice, room, not, 4star, experi, hotel, monaco..."
3,"unique, great stay, wonderful time hotel monac...",5,"unique, great stay, wonderful time hotel monac...","unique, great stay, wonderful time hotel monac...",unique great stay wonderful time hotel monaco ...,"[unique, great, stay, wonderful, time, hotel, ...","[uniqu, great, stay, wonder, time, hotel, mona..."
4,"great stay great stay, went seahawk game aweso...",5,"great stay great stay, went seahawk game aweso...","great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...,"[great, stay, great, stay, went, seahawk, game...","[great, stay, great, stay, went, seahawk, game..."


In [42]:
# Lemmatizer instans opsættes
lemmatizer = WordNetLemmatizer()

In [43]:
# Lemmatize ny kolonne 'tokenized'
data["lemmatized"] = data["tokenized"].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [44]:
# Combine alle lemmatized tokens i en liste
tokens_clean = sum(data['lemmatized'], [])

In [48]:
# unigrams: n=1
unigrams = pd.Series(tokens_clean).value_counts()
print(unigrams)

hotel           292
room            275
great           126
not             122
stay             95
               ... 
175               1
smackagainst      1
2x                1
80                1
connected         1
Name: count, Length: 2589, dtype: int64


In [56]:
# Bigrams: n=2
bigrams = pd.Series(list(ngrams(tokens_clean, 2))).value_counts()
print(bigrams)

(great, location)     24
(space, needle)       21
(hotel, monaco)       16
(great, hotel)        12
(staff, friendly)     12
                      ..
(didnt, make)          1
(personnel, didnt)     1
(minute, stay)         1
(starting, minute)     1
(food, raffle)         1
Name: count, Length: 8263, dtype: int64
