In [1]:
import numpy as np
import pandas as pd

# Load the scraped competitor reviews (rows 0001-0050 batch) from the raw data folder.
dataset = pd.read_csv(
    '../../data/raw/yp_competitors_rws_0001_0050.csv',
)

In [2]:
# Preview the first rows of the loaded frame to sanity-check the columns.
dataset.head()

Unnamed: 0,alias,ratingValue,dataPublished,description,author
0,kimos-maui-lahaina,5,2019-01-06,I stumbled across this great restaurant overlo...,Bella L.
1,kimos-maui-lahaina,5,2019-01-04,Excellent view on the ocean at sunset.\nExcell...,Rachou A.
2,kimos-maui-lahaina,3,2018-12-25,This place was not what the reviews portrayed ...,Ozzetta B.
3,kimos-maui-lahaina,2,2018-12-08,We were excited to repeat our Keoki's (in Kaua...,Arleen C.
4,kimos-maui-lahaina,3,2018-11-29,"If you're looking for a tourist spot, this is ...",Carol B.


In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(19018, 5)

In [4]:
from itertools import chain


In [None]:
# with spacy
import spacy
# Load the English pipeline with the parser and NER components disabled,
# since they are not needed for the token-level features built here.
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases require the
# full package name (e.g. 'en_core_web_sm') -- confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Process all reviews through nlp.pipe, which batches the texts internally,
# instead of calling nlp() once per row. The resulting Doc objects come back
# in input order, so the list lines up positionally with the frame's rows.
dataset['nlp'] = list(nlp.pipe(dataset.description))

In [None]:
# Grab the parsed document stored for the row labelled 0 ...
firstrow = dataset['nlp'][0]
# ... and list the attributes/methods it exposes.
dir(firstrow)

In [7]:
# with nltk
from nltk import word_tokenize, sent_tokenize
from nltk import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# per-token filter rebuilds the word list for every token and then scans it
# linearly; a set built up front gives the same membership answers in O(1).
english_stopwords = set(stopwords.words('english'))


def normalize_tokens(tokens):
    """Return the normalised form of a list of word tokens.

    Each token is lower-cased, dropped if it is an English stop word, then
    lemmatised with WordNet and stemmed with the Porter stemmer. Punctuation
    tokens are not filtered, so they pass through to the result.

    Parameters
    ----------
    tokens : iterable of str
        Word tokens for a single review.

    Returns
    -------
    list of str
        The normalised tokens, in their original order.
    """
    lowered = (token.lower() for token in tokens)
    return [
        porter.stem(lemmatizer.lemmatize(token))
        for token in lowered
        if token not in english_stopwords
    ]


# Work on a copy so the raw `dataset` frame stays untouched.
df = dataset.copy()

# Sentence-level split of each review.
df['sent_tokens'] = df.description.apply(sent_tokenize)
# Word tokens grouped per sentence (list of token lists).
df['word_tokens_doc'] = df.sent_tokens.apply(
    lambda sentences: [word_tokenize(sentence) for sentence in sentences]
)
# Flat word tokens for the whole review.
df['word_tokens'] = df.description.apply(word_tokenize)
# Lower-cased, stop-word-filtered, lemmatised and stemmed tokens.
df['norm_tokens'] = df.word_tokens.apply(normalize_tokens)
df.head()

Unnamed: 0,alias,ratingValue,dataPublished,description,author,sent_tokens,word_tokens_doc,word_tokens,norm_tokens
0,kimos-maui-lahaina,5,2019-01-06,I stumbled across this great restaurant overlo...,Bella L.,[I stumbled across this great restaurant overl...,"[[I, stumbled, across, this, great, restaurant...","[I, stumbled, across, this, great, restaurant,...","[stumbl, across, great, restaur, overlook, oce..."
1,kimos-maui-lahaina,5,2019-01-04,Excellent view on the ocean at sunset.\nExcell...,Rachou A.,"[Excellent view on the ocean at sunset., Excel...","[[Excellent, view, on, the, ocean, at, sunset,...","[Excellent, view, on, the, ocean, at, sunset, ...","[excel, view, ocean, sunset, ., excel, food, ...."
2,kimos-maui-lahaina,3,2018-12-25,This place was not what the reviews portrayed ...,Ozzetta B.,[This place was not what the reviews portrayed...,"[[This, place, was, not, what, the, reviews, p...","[This, place, was, not, what, the, reviews, po...","[place, review, portray, ., starter, ,, walk, ..."
3,kimos-maui-lahaina,2,2018-12-08,We were excited to repeat our Keoki's (in Kaua...,Arleen C.,[We were excited to repeat our Keoki's (in Kau...,"[[We, were, excited, to, repeat, our, Keoki, '...","[We, were, excited, to, repeat, our, Keoki, 's...","[excit, repeat, keoki, 's, (, kauai, ), lovefe..."
4,kimos-maui-lahaina,3,2018-11-29,"If you're looking for a tourist spot, this is ...",Carol B.,"[If you're looking for a tourist spot, this is...","[[If, you, 're, looking, for, a, tourist, spot...","[If, you, 're, looking, for, a, tourist, spot,...","['re, look, tourist, spot, ,, ., unfortun, cou..."


In [8]:
# Inspect the per-sentence word tokens of the row labelled 0 (a list of token lists).
df.word_tokens_doc[0]

[['I',
  'stumbled',
  'across',
  'this',
  'great',
  'restaurant',
  'overlooking',
  'the',
  'ocean',
  'for',
  'lunch',
  'during',
  'my',
  'vacation',
  'to',
  'Maui',
  '.'],
 ['I',
  'did',
  'not',
  'have',
  'high',
  'expectations',
  'for',
  'this',
  'place',
  ',',
  'but',
  'boy',
  'did',
  'it',
  'blow',
  'me',
  'out',
  'of',
  'the',
  'water',
  '.'],
 ['The',
  'fish',
  'and',
  'chips',
  'is',
  'some',
  'of',
  'the',
  'best',
  'I',
  "'ve",
  'ever',
  'had',
  '(',
  'and',
  'I',
  "'ve",
  'had',
  'lots',
  ',',
  'including',
  'from',
  'London',
  ')',
  '.'],
 ['I', 'highly', 'recommend', 'it', '.'],
 ['Also', ',', 'the', 'turkey', 'bacon', 'sandwich', 'was', 'SO', 'good', '.'],
 ['In',
  'terms',
  'of',
  'drinks',
  ',',
  'I',
  'highly',
  'recommend',
  'the',
  'Pacific',
  'Paradise',
  'drink',
  '!'],
 ['So', 'delicious', 'and', 'tropical', '!'],
 ['I', 'also', 'really', 'enjoyed', 'the', 'Lahaina', 'Lemonade', '.'],
 ['Service'

In [9]:
# Persist the frame with its derived text-feature columns, without the index.
# NOTE(review): the token columns hold Python lists, which CSV stores as plain
# text -- confirm that downstream readers parse them back into lists.
df.to_csv(
    '../../data/processed/yp_competitors_rws_0001_0050_textfeatures.csv',
    index=False,
)

In [22]:
# Summarise column dtypes and non-null counts of the feature frame.
# NOTE(review): the saved output below has a non-sequential execution count and
# lists 8 columns without `word_tokens_doc`, so it appears to come from a
# different kernel state -- re-run after Restart & Run All to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19018 entries, 0 to 19017
Data columns (total 8 columns):
alias            19018 non-null object
ratingValue      19018 non-null int64
dataPublished    19018 non-null object
description      19018 non-null object
author           19018 non-null object
sent_tokens      19018 non-null object
word_tokens      19018 non-null object
norm_tokens      19018 non-null object
dtypes: int64(1), object(7)
memory usage: 1.2+ MB
