# Collocations

In [12]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string

Data: https://www.kaggle.com/datafiniti/hotel-reviews/data

In [2]:
#load reviews data
reviews = pd.read_csv('/Users/Nicha/Downloads/hotel_reviews.csv')

In [360]:
reviews.head(2)

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,


Extract only the reviews...

In [None]:
comments = reviews['reviews.text']

## Preprocessing

In [None]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [None]:
comments = comments.astype('str')

In [226]:
#remove non-ascii characters
comments = comments.map(lambda x: _removeNonAscii(x))

In [231]:
#get stop words of all languages
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

In [232]:
#function to detect language based on # of stop words for particular language
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    lang = max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0]
    if lang == 'english':
        return True
    else:
        return False

In [233]:
#filter for only english comments
eng_comments=comments[comments.apply(get_language)]

In [234]:
eng_comments.head()

0    Pleasant 10 min walk along the sea front to th...
1    Really lovely hotel. Stayed on the very top fl...
3    We stayed here for four nights in October. The...
4    We stayed here for four nights in October. The...
5    We loved staying on the island of Lido! You ne...
Name: reviews.text, dtype: object

In [235]:
#drop duplicates
eng_comments.drop_duplicates(inplace=True)

In [237]:
#load spacy
nlp = spacy.load('en')

In [238]:
#function to clean and lemmatize comments
def clean_comments(text):
    #remove punctuations
    regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
    nopunct = regex.sub(" ", str(text))
    #use spacy to lemmatize comments
    doc = nlp(nopunct, disable=['parser','ner'])
    lemma = [token.lemma_ for token in doc]
    return lemma

In [239]:
#apply function to clean and lemmatize comments
lemmatized = eng_comments.map(clean_comments)

In [242]:
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])

In [243]:
lemmatized.head()

0    [pleasant, 10, min, walk, along, the, sea, fro...
1    [really, lovely, hotel,  , stay, on, the, very...
3    [-pron-, stay, here, for, four, night, in, oct...
5    [-pron-, love, stay, on, the, island, of, lido...
6    [lovely, view, out, onto, the, lagoon,  , exce...
Name: reviews.text, dtype: object

In [244]:
#turn all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]

## Initialize NLTK's Bigrams/Trigrams Finder

In [None]:
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [None]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

## 1. Counting Frequencies of Adjacent Words
- Main idea: simply order by frequency
- Issues: too sensitive to very frequent pairs and pronouns/articles/prepositions come up often
- Solution: filter for only adjectives and nouns

In [247]:
bigram_freq = bigramFinder.ngram_fd.items()

In [248]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [249]:
bigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"( , -pron-)",29078
1,"( , the)",21918
2,"(-pron-, be)",18353
3,"(the, room)",8898
4,"(-pron-, have)",8377


In [250]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
152,"( , -pron-)",29078
93,"( , the)",21918
58,"(-pron-, be)",18353
108,"(the, room)",8898
238,"(-pron-, have)",8377
109,"(room, be)",8300
323,"(in, the)",8150
190,"(be, very)",7708
248,"(be, a)",7263
237,"(and, -pron-)",7012


In [252]:
#get english stopwords
en_stopwords = set(stopwords.words('english'))

In [253]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [254]:
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [255]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
1087,"(front, desk)",2674
73,"(great, location)",797
270,"(friendly, staff)",775
5159,"(hot, tub)",635
4731,"(clean, room)",626
95,"(hotel, staff)",539
3098,"(continental, breakfast)",531
266,"(nice, hotel)",530
4077,"(free, breakfast)",522
1905,"(great, place)",514


In [258]:
trigram_freq = trigramFinder.ngram_fd.items()

In [259]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [260]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"( , -pron-, be)",6267
1,"(the, room, be)",4411
2,"( , the, room)",3349
3,"( , -pron-, have)",2681
4,"(the, staff, be)",2641


In [261]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
2575,"( , -pron-, be)",6267
114,"(the, room, be)",4411
113,"( , the, room)",3349
1005,"( , -pron-, have)",2681
1455,"(the, staff, be)",2641
682,"(the, hotel, be)",2323
2598,"( , there, be)",2181
666,"( , the, staff)",1928
266,"(-pron-, have, a)",1835
1320,"(the, front, desk)",1826


In [333]:
def rightTypesTri(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or '  ' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords:
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False

In [263]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [264]:
filtered_tri[:10]

Unnamed: 0,trigram,freq
3374,"(front, desk, staff)",384
24548,"(non, smoking, room)",213
34521,"(holiday, inn, express)",136
10218,"(front, desk, clerk)",122
12301,"(flat, screen, tv)",79
37533,"(smell, like, smoke)",72
141458,"(old, town, alexandria)",69
18378,"(front, desk, person)",65
6088,"(free, wi, fi)",62
56372,"(great, customer, service)",54


In [339]:
freq_bi = filtered_bi[:20].bigram.values

In [341]:
freq_tri = filtered_tri[:20].trigram.values

## 2. PMI

In [278]:
bigramFinder.apply_freq_filter(20)

In [279]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [281]:
bigramPMITable[:10]

Unnamed: 0,bigram,PMI
0,"(universal, studios)",15.201284
1,"(howard, johnson)",14.95478
2,"(cracker, barrel)",14.81126
3,"(santa, barbara)",14.522026
4,"(sub, par)",14.08839
5,"(santana, row)",14.001559
6,"(e, g)",13.687743
7,"(elk, springs)",13.333635
8,"(times, square)",13.161556
9,"(ear, plug)",13.094932


In [282]:
trigramFinder.apply_freq_filter(20)

In [283]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [285]:
trigramPMITable[:10]

Unnamed: 0,trigram,PMI
0,"(elk, springs, resort)",23.859277
1,"(zion, national, park)",23.223602
2,"(flat, screen, tv)",22.598334
3,"(hard, boil, egg)",22.117153
4,"(holiday, inn, express)",21.635639
5,"(within, walking, distance)",21.585821
6,"(red, roof, inn)",21.397206
7,"(simpson, house, inn)",20.803959
8,"(free, wi, fi)",20.634339
9,"(slide, glass, door)",20.261822


In [343]:
pmi_bi = bigramPMITable[:20].bigram.values

In [344]:
pmi_tri = trigramPMITable[:20].trigram.values

## 3. t-test

In [297]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [298]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"( , -pron-)",112.604884
1,"( , the)",92.1636
2,"(-pron-, be)",89.806584
3,"(the, room)",79.4366
4,"(be, very)",78.009909


In [299]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [303]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
21,"(front, desk)",51.576355
109,"(great, location)",27.42921
120,"(friendly, staff)",26.735061
136,"(hot, tub)",25.1521
161,"(continental, breakfast)",22.920096
172,"(free, breakfast)",22.405215
193,"(great, place)",21.472346
208,"(parking, lot)",20.779445
218,"(customer, service)",20.48332
226,"(desk, staff)",20.214107


In [304]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [305]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"( , -pron-, be)",72.501091
1,"(the, room, be)",65.285635
2,"( , the, room)",55.800759
3,"(the, staff, be)",50.822814
4,"( , -pron-, have)",49.513033


In [324]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [325]:
filteredT_tri.head(10)

Unnamed: 0,trigram,t
143,"(front, desk, staff)",19.593921
341,"(non, smoking, room)",14.594362
622,"(holiday, inn, express)",11.6619
730,"(front, desk, clerk)",11.045156
1252,"(flat, screen, tv)",8.888193
1416,"(smell, like, smoke)",8.485101
1488,"(old, town, alexandria)",8.306598
1627,"(front, desk, person)",8.061943
1739,"(free, wi, fi)",7.874003
2093,"(great, customer, service)",7.347582


In [345]:
t_bi = filteredT_bi[:20].bigram.values

In [346]:
t_tri = filteredT_tri[:20].trigram.values

## 4. Chi-Square

In [309]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [312]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(wi, fi)",1651813.0
1,"(cracker, barrel)",1322475.0
2,"(howard, johnson)",1206747.0
3,"(la, quinta)",1070882.0
4,"(front, desk)",1027741.0
5,"(universal, studios)",904164.8
6,"(santa, barbara)",870475.6
7,"(santana, row)",836460.5
8,"( , more)",749751.1
9,"(flat, screen)",711552.7


In [313]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [315]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(within, walking, distance)",610781900.0
1,"(elk, springs, resort)",578402100.0
2,"(flat, screen, tv)",501737000.0
3,"(holiday, inn, express)",443167900.0
4,"(zion, national, park)",303670900.0
5,"(red, roof, inn)",124439200.0
6,"(hard, boil, egg)",104655800.0
7,"(free, wi, fi)",101786700.0
8,"(simpson, house, inn)",45780810.0
9,"(within, walk, distance)",35559330.0


In [347]:
chi_bi = bigramChiTable[:20].bigram.values

In [348]:
chi_tri = trigramChiTable[:20].trigram.values

## 5. Likelihood

In [316]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [318]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"( , more)",48448.438687
1,"(front, desk)",32462.675748
2,"( , -pron-)",31120.23525
3,"(didn, t)",26672.134364
4,"(be, very)",24360.396967


In [319]:
filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [321]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
1,"(front, desk)",32462.675748
51,"(hot, tub)",7400.417612
75,"(continental, breakfast)",5283.734146
93,"(customer, service)",4513.277281
99,"(wi, fi)",4395.380815
100,"(great, location)",4345.972738
105,"(walk, distance)",4207.577371
113,"(parking, lot)",3992.332408
124,"(friendly, staff)",3679.524918
125,"(air, conditioner)",3656.082047


In [322]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [323]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(the, room, be)",95028.980291
1,"( , -pron-, be)",85768.942108
2,"(the, staff, be)",77588.060878
3,"( , more, -pron-)",77213.83221
4,"( , -pron-, have)",76439.150648


In [334]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [335]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
81,"(front, desk, clerk)",53404.085856
93,"(front, desk, staff)",51768.199453
116,"(front, desk, person)",49662.849781
126,"(front, desk, attendant)",49209.297668
131,"(front, desk, personnel)",49133.163919
134,"(call, front, desk)",49020.607199
137,"(front, desk, people)",49003.888883
139,"(front, desk, guy)",48988.631032
141,"(front, desk, lady)",48976.873913
165,"(hotel, front, desk)",48703.645937


In [349]:
lik_bi = filteredLik_bi[:20].bigram.values

In [350]:
lik_tri = filteredLik_tri[:20].trigram.values

## Bigram Comparison

In [353]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [355]:
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [356]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(front, desk)","(universal, studios)","(front, desk)","(wi, fi)","(front, desk)"
1,"(great, location)","(howard, johnson)","(great, location)","(cracker, barrel)","(hot, tub)"
2,"(friendly, staff)","(cracker, barrel)","(friendly, staff)","(howard, johnson)","(continental, breakfast)"
3,"(hot, tub)","(santa, barbara)","(hot, tub)","(la, quinta)","(customer, service)"
4,"(clean, room)","(sub, par)","(continental, breakfast)","(front, desk)","(wi, fi)"
5,"(hotel, staff)","(santana, row)","(free, breakfast)","(universal, studios)","(great, location)"
6,"(continental, breakfast)","(e, g)","(great, place)","(santa, barbara)","(walk, distance)"
7,"(nice, hotel)","(elk, springs)","(parking, lot)","(santana, row)","(parking, lot)"
8,"(free, breakfast)","(times, square)","(customer, service)","( , more)","(friendly, staff)"
9,"(great, place)","(ear, plug)","(desk, staff)","(flat, screen)","(air, conditioner)"


## Trigram Comparison

In [357]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [358]:
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likeihood Ratio Test With Filter']

In [359]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(front, desk, staff)","(elk, springs, resort)","(front, desk, staff)","(within, walking, distance)","(front, desk, clerk)"
1,"(non, smoking, room)","(zion, national, park)","(non, smoking, room)","(elk, springs, resort)","(front, desk, staff)"
2,"(holiday, inn, express)","(flat, screen, tv)","(holiday, inn, express)","(flat, screen, tv)","(front, desk, person)"
3,"(front, desk, clerk)","(hard, boil, egg)","(front, desk, clerk)","(holiday, inn, express)","(front, desk, attendant)"
4,"(flat, screen, tv)","(holiday, inn, express)","(flat, screen, tv)","(zion, national, park)","(front, desk, personnel)"
5,"(smell, like, smoke)","(within, walking, distance)","(smell, like, smoke)","(red, roof, inn)","(call, front, desk)"
6,"(old, town, alexandria)","(red, roof, inn)","(old, town, alexandria)","(hard, boil, egg)","(front, desk, people)"
7,"(front, desk, person)","(simpson, house, inn)","(front, desk, person)","(free, wi, fi)","(front, desk, guy)"
8,"(free, wi, fi)","(free, wi, fi)","(free, wi, fi)","(simpson, house, inn)","(front, desk, lady)"
9,"(great, customer, service)","(slide, glass, door)","(great, customer, service)","(within, walk, distance)","(hotel, front, desk)"
