In [1]:
import numpy as np
import pandas as pd

dataset1 = pd.read_csv('../../data/raw/yp_competitors_rws_0001_0050.csv')
dataset2 = pd.read_csv('../../data/raw/yp_competitors_rws_0051_2506.csv')
dataset = pd.concat([dataset1, dataset2])

In [2]:
dataset.head()

Unnamed: 0,alias,ratingValue,dataPublished,description,author
0,kimos-maui-lahaina,5,2019-01-06,I stumbled across this great restaurant overlo...,Bella L.
1,kimos-maui-lahaina,5,2019-01-04,Excellent view on the ocean at sunset.\nExcell...,Rachou A.
2,kimos-maui-lahaina,3,2018-12-25,This place was not what the reviews portrayed ...,Ozzetta B.
3,kimos-maui-lahaina,2,2018-12-08,We were excited to repeat our Keoki's (in Kaua...,Arleen C.
4,kimos-maui-lahaina,3,2018-11-29,"If you're looking for a tourist spot, this is ...",Carol B.


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454035 entries, 0 to 435016
Data columns (total 5 columns):
alias            454035 non-null object
ratingValue      454035 non-null int64
dataPublished    454035 non-null object
description      454035 non-null object
author           454035 non-null object
dtypes: int64(1), object(4)
memory usage: 20.8+ MB


In [4]:
dataset.alias.unique()

array(['kimos-maui-lahaina', 'lahaina-fish-lahaina', 'blu-maui-lahaina',
       ..., 'boston-market-huntington-beach',
       'garibaldi-de-noche-costa-mesa', 'vegan-nirvana-huntington-beach'],
      dtype=object)

In [11]:
# with nltk
from nltk import word_tokenize, sent_tokenize
from nltk import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from string import punctuation

lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
stops = stopwords.words('english')

df = dataset.copy()

df['sent_tokens'] = df.description.apply(lambda x: sent_tokenize(x))
df['word_tokens_doc'] = df.sent_tokens.apply(lambda x: [tuple(word_tokenize(i)) for i in x])
df['norm_tokens_doc'] = df.word_tokens_doc.apply(
    lambda d: [tuple(map(lambda w: porter.stem(lemmatizer.lemmatize(w.lower())), [a for a in s if a.lower() not in stops and a.lower() not in punctuation])) for s in d])
df['word_tokens'] = df.description.apply(lambda x: word_tokenize(x))
df['norm_tokens'] = df.word_tokens.apply(
    lambda x: [porter.stem(lemmatizer.lemmatize(w.lower())) for w in x if w.lower() not in stops and w.lower() not in punctuation])

In [12]:
print(df.sent_tokens[0], '\n\n', df.norm_tokens_doc[0])

0    [I stumbled across this great restaurant overl...
0    [Sitting on the beach is one way to experience...
Name: sent_tokens, dtype: object 

 0    [(stumbl, across, great, restaur, overlook, oc...
0    [(sit, beach, one, way, experi, maui), (far, r...
Name: norm_tokens_doc, dtype: object


In [13]:
df.loc[:,['word_tokens', 'norm_tokens']].head()

Unnamed: 0,word_tokens,norm_tokens
0,"[I, stumbled, across, this, great, restaurant,...","[stumbl, across, great, restaur, overlook, oce..."
1,"[Excellent, view, on, the, ocean, at, sunset, ...","[excel, view, ocean, sunset, excel, food, fres..."
2,"[This, place, was, not, what, the, reviews, po...","[place, review, portray, starter, walk, stair,..."
3,"[We, were, excited, to, repeat, our, Keoki, 's...","[excit, repeat, keoki, 's, kauai, lovefest, si..."
4,"[If, you, 're, looking, for, a, tourist, spot,...","['re, look, tourist, spot, unfortun, could, n'..."


In [14]:
df.to_csv('../../data/processed/yp_competitors_rws_0001_0256_textfeatures.csv', index=False)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 454035 entries, 0 to 435016
Data columns (total 10 columns):
alias              454035 non-null object
ratingValue        454035 non-null int64
dataPublished      454035 non-null object
description        454035 non-null object
author             454035 non-null object
sent_tokens        454035 non-null object
word_tokens_doc    454035 non-null object
norm_tokens_doc    454035 non-null object
word_tokens        454035 non-null object
norm_tokens        454035 non-null object
dtypes: int64(1), object(9)
memory usage: 58.1+ MB


In [None]:
# with spacy
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

dataset['nlp'] = dataset.description.apply(lambda x: nlp(x))
firstrow = dataset.nlp[0]
dir(firstrow)

In [34]:
a = [None, 2, 3, None]
a

[None, 2, 3, None]