# Metadata

```
Course:   DS 5001
Module:   08 Lab
Topic:    Prepare AirBnB data with SpaCy
Author:   R.C. Alvarado
```

# Set Up

In [1]:
data_home = "../data"
local_lib = "../lib"

In [2]:
!ls {data_home}/airbnb

airbnb-BOW.csv airbnb-LIB.csv airbnb.csv


In [39]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import sys; sys.path.append(local_lib)

In [4]:
from topicmodel import TopicModel

# Get Data

In [5]:
ABB = pd.read_csv(f"{data_home}/airbnb/airbnb.csv", sep="|")

In [6]:
ABB.index.name = 'doc_id'

In [37]:
ABB.head()

Unnamed: 0_level_0,doc_key,doc_title,doc_date,doc_year,doc_month,doc_country,doc_label,doc_price,doc_rating,doc_content,doc_original
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,375799,"United Kingdom, New Malden, 2015-09-02",2015-09-02,2015,9,United Kingdom,New Malden,80,98.0,"Comfortable, 1 Bedroom Apartment On quiet stre...","Comfortable, 1 Bedroom Apartment On quiet stre..."
1,2284438,"United Kingdom, Kingston upon Thames, 2015-09-02",2015-09-02,2015,9,United Kingdom,Kingston upon Thames,43,97.0,The room is in a modern 1 bedroom flat with of...,The room is in a modern 1 bedroom flat with of...
2,4356007,"United Kingdom, Kingston upon Thames, 2015-09-02",2015-09-02,2015,9,United Kingdom,Kingston upon Thames,325,100.0,Our family home offers 2 double bedrooms with ...,Our family home offers 2 double bedrooms with ...
3,7031432,"United Kingdom, Kingston upon Thames, 2015-09-02",2015-09-02,2015,9,United Kingdom,Kingston upon Thames,47,,The is a basic ground floor bedroom in a recen...,The is a basic ground floor bedroom in a recen...
4,7208109,"United Kingdom, New Malden, 2015-09-02",2015-09-02,2015,9,United Kingdom,New Malden,25,,"This room has a ensuite shower room, wardrobe,...","This room has a ensuite shower room, wardrobe,..."


In [7]:
ABB.value_counts(['doc_country','doc_year'])

doc_country     doc_year
France          2015        35427
United States   2015        28509
United Kingdom  2015        25361
Mexico          2015            1
Vanuatu         2015            1
dtype: int64

In [8]:
UK = ABB[ABB.doc_country == 'United Kingdom'].copy().dropna()

In [9]:
UK.doc_content

doc_id
0        Comfortable, 1 Bedroom Apartment On quiet stre...
1        The room is in a modern 1 bedroom flat with of...
2        Our family home offers 2 double bedrooms with ...
7        Stay in my lovely, light and airy home, close ...
8        This room has a door opening out onto the back...
                               ...                        
25353    Welcome to your lovely cosy double room locate...
25354    This beautiful location is on London's Southba...
25356    A wonderful barge suitable for either a single...
25357    Our newly re-furnished flat in central London ...
25360    This is an amazing new flat in the heart of le...
Name: doc_content, Length: 17369, dtype: object

In [38]:
len(UK)

17369

In [10]:
# UK.to_csv(f"{data_home}/output/airbnb-uk-LIB.csv")

# Annotate with SpaCy

In [43]:
# docs = UK.sample(1000).doc_original.astype('str').values
docs = UK.doc_original.astype('str').values
stats_package = 'en_core_web_sm'
# pipeline = ["tok2vec", "tagger", "parser", "ner"]
nlp = spacy.load(stats_package)
DOCS = [doc.to_json() for doc in tqdm(nlp.pipe(docs))]

17369it [05:03, 57.26it/s]


In [41]:
features = list(DOCS[0].keys()); features

['text', 'ents', 'sents', 'tokens']

In [None]:
l

In [42]:
DOCS[0]['text']

"Wonderfully tranquil, bright and centrally located garden flat set in a quiet road within an 8 minute walk of Fulham Broadway underground. Pamper yourself in this luxurious and newly renovated flat with 42' TV, king size bed and south facing garden. A quiet, comfortable, luxurious and convenient, centrally located 2 double bedroom apartment offers splendid accommodation for up to 6 guests. The flat comprises a lounge, open plan kitchen with dining area, a master bedroom with king size bed, 2nd double bedroom with French doors leading out, a newly tiled bathroom and a south facing garden. The lounge has a very comfortable and large corner sofa which converts into a king size bed (photographed) .There is also a new Samsung 42' Flat Screen TV for relaxing. Off site luggage storage available at extra cost for early arrivals or late departures. Ask for details. The apartment benefits from being supplied with fresh, starched white linen and towels for each stay by the same company that supp

# Convert Output to Data Frames

In [14]:
class Corpus(): pass

In [15]:
corpus = Corpus()        
corpus.TEXT = pd.DataFrame([(i, x['text']) for i, x in enumerate(DOCS)], columns=['doc_id','text_str']).set_index('doc_id')

In [24]:
corpus.TEXT.head()

Unnamed: 0_level_0,text_str
doc_id,Unnamed: 1_level_1
0,"Single Room, with own bathroom adjacent not sh..."
1,A sweet cosy and well proportioned one bedroom...
2,Little quiet studio in Zone 1 in London Centra...
3,Located within walking distance to Beckton sta...
4,This is your chance to live in the coolest nei...


In [17]:
corpus.TOKEN = pd.concat([pd.DataFrame(x['tokens']) for x in DOCS])

In [18]:
corpus.TOKEN.loc[corpus.TOKEN.id == 0, 'doc_id'] = [i for i in range(1000)]

In [19]:
corpus.TOKEN.doc_id = corpus.TOKEN.doc_id.ffill()

In [20]:
corpus.TOKEN

Unnamed: 0,id,start,end,tag,pos,morph,lemma,dep,head,doc_id
0,0,0,6,NNP,PROPN,Number=Sing,Single,compound,1,0.0
1,1,7,11,NNP,PROPN,Number=Sing,Room,nsubj,8,0.0
2,2,11,12,",",PUNCT,PunctType=Comm,",",punct,1,0.0
3,3,13,17,IN,ADP,,with,prep,1,0.0
4,4,18,21,JJ,ADJ,Degree=Pos,own,amod,5,0.0
...,...,...,...,...,...,...,...,...,...,...
203,203,966,972,NNP,PROPN,Number=Sing,Tumble,compound,204,999.0
204,204,973,978,NNP,PROPN,Number=Sing,Dryer,conj,201,999.0
205,205,978,979,",",PUNCT,PunctType=Comm,",",punct,204,999.0
206,206,980,987,NNP,PROPN,Number=Sing,Washing,compound,207,999.0


In [25]:
corpus.TOKEN['token_str'] = corpus.TOKEN.apply(lambda x: corpus.TEXT.text_str.values[0][x.start:x.end], 1)
corpus.TOKEN = corpus.TOKEN.reset_index().rename(columns=dict(index='token_num')).set_index(['doc_id','token_num'])

In [26]:
corpus.TOKEN

Unnamed: 0_level_0,Unnamed: 1_level_0,id,start,end,tag,pos,morph,lemma,dep,head,token_str
doc_id,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0,0,0,6,NNP,PROPN,Number=Sing,Single,compound,1,Single
0.0,1,1,7,11,NNP,PROPN,Number=Sing,Room,nsubj,8,Room
0.0,2,2,11,12,",",PUNCT,PunctType=Comm,",",punct,1,","
0.0,3,3,13,17,IN,ADP,,with,prep,1,with
0.0,4,4,18,21,JJ,ADJ,Degree=Pos,own,amod,5,own
...,...,...,...,...,...,...,...,...,...,...,...
999.0,203,203,966,972,NNP,PROPN,Number=Sing,Tumble,compound,204,rth Co
999.0,204,204,973,978,NNP,PROPN,Number=Sing,Dryer,conj,201,mon t
999.0,205,205,978,979,",",PUNCT,PunctType=Comm,",",punct,204,r
999.0,206,206,980,987,NNP,PROPN,Number=Sing,Washing,compound,207,in stat


In [31]:
# DOCS[0]

In [None]:
# annotate_corpus(UK.doc_original.astype('str').values)

In [33]:
BOW = corpus.TOKEN[corpus.TOKEN.pos == 'NOUN'].value_counts(['doc_id', 'lemma']).to_frame('n').sort_index()
BOW.index.names = ['doc_id', 'term_str']

In [34]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,n
doc_id,term_str,Unnamed: 2_level_1
0.0,arm,1
0.0,basis,1
0.0,bath,1
0.0,bathroom,4
0.0,bedroom,2
...,...,...
999.0,towel,1
999.0,tv,1
999.0,view,1
999.0,washing,1


In [None]:
# BOW.to_csv(f"{data_home}/output/airbnb-uk-nouns.csv")

In [None]:
# BOW = pd.read_csv(f"{data_home}/output/airbnb-uk-nouns.csv").set_index(['doc_id','term_str'])

In [36]:
tm_book = TopicModel(BOW)
tm_book.n_topics = 40
tm_book.create_X()
tm_book.get_model()
tm_book.describe_topics()

In [None]:
tm_book.plot_topics()

## Appendix: Method 2

In [None]:
def annotate_corpus(docs):
    
    stats_package = 'en_core_web_sm'
    nlp = spacy.load(stats_package)
    DOCS = []
    for doc in tqdm.tqdm(docs):
        DOCS.append(nlp(doc).to_json())
    features = list(DOCS[0].keys())

    feature_data = {f:[] for f in features}
    for i in range(len(DOCS)):    
        text = DOCS[i]['text']
        # for feature in features[1:]:
        for feature in ['tokens']:
            df = pd.DataFrame(DOCS[i][feature])
            df[f'{feature[:-1]}_str'] = df.apply(lambda x: text[x.start:x.end], 1)
            df['doc_id'] = i
            feature_data[feature].append(df)
            
    TOKEN = pd.concat(feature_data['tokens'])