In [None]:
# Text preprocessing

In [28]:
import pandas as pd
import numpy as np

import spacy
import spacy_lookups_data
import re

## Speeding up with Dask

import dask
import dask.dataframe as ddf
from dask.diagnostics import ProgressBar
import multiprocessing


In [2]:
# Load the dataframe

reviews = pd.read_csv('../data/yelp_reviews_restaurant.csv')
reviews.head()

Unnamed: 0,user_id,business_id,review_stars,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories
0,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
1,zFCuveEe6M-ijY1iy23IJg,HQl28KMwrEKHqhFrrDqVNQ,5,6,2,5,"We walked into Melt. ""Did you want to put your...",2011-08-25 04:24:23,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
2,4V985R3RG-rv0B7WCPQzeQ,HQl28KMwrEKHqhFrrDqVNQ,1,1,0,0,I commented on how slow the service was last A...,2015-03-04 20:37:43,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
3,nFGcoL6wuPQzxsNJVSfGrA,HQl28KMwrEKHqhFrrDqVNQ,4,2,0,0,We walked in off the streets on a September ni...,2014-09-10 01:38:55,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American
4,CJqgUQeWhdgbDyLAFy7xvQ,HQl28KMwrEKHqhFrrDqVNQ,4,0,0,0,Brunch on Saturday was excellent. The Bloody M...,2018-01-21 18:50:29,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American


In [12]:
reviews['text'].dropna(inplace=True)

In [None]:
# Sample text to check the preprocessing functions

text = reviews.loc[578, 'text']
text

In [4]:
# Functions to preprocess text

# convert to lower cases
def to_lower(text):
    return text.lower()

# remove some special characters
def remove_special_chars(text):
    return ''.join(re.sub(r'[-/@"():;^_%&,.!?]', ' ', text))

# remove white spaces
def remove_spaces(text):
    return re.sub(r'\s+',' ', text)

In [None]:
text

In [5]:
%%time
def clean_text(df):
    df['cleaned_text'] = df['text'].map(to_lower).map(remove_special_chars).map(remove_spaces)
    return df

df = clean_text(reviews)
df.head()

Wall time: 5min 35s


Unnamed: 0,user_id,business_id,review_stars,useful,funny,cool,text,date,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,cleaned_text
0,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:11,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American,i love deagan's i do i really do the atmospher...
1,zFCuveEe6M-ijY1iy23IJg,HQl28KMwrEKHqhFrrDqVNQ,5,6,2,5,"We walked into Melt. ""Did you want to put your...",2011-08-25 04:24:23,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American,we walked into melt did you want to put your n...
2,4V985R3RG-rv0B7WCPQzeQ,HQl28KMwrEKHqhFrrDqVNQ,1,1,0,0,I commented on how slow the service was last A...,2015-03-04 20:37:43,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American,i commented on how slow the service was last a...
3,nFGcoL6wuPQzxsNJVSfGrA,HQl28KMwrEKHqhFrrDqVNQ,4,2,0,0,We walked in off the streets on a September ni...,2014-09-10 01:38:55,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American,we walked in off the streets on a september ni...
4,CJqgUQeWhdgbDyLAFy7xvQ,HQl28KMwrEKHqhFrrDqVNQ,4,0,0,0,Brunch on Saturday was excellent. The Bloody M...,2018-01-21 18:50:29,Liberty Village Market & Cafe,65 Jefferson Avenue,Toronto,ON,M6K 1Y3,43.637885,-79.421223,3.5,20,"{'RestaurantsAttire': ""u'casual'"", 'GoodForKid...",American,brunch on saturday was excellent the bloody ma...


In [26]:
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser']) 
nlp.max_length = 33000000

In [23]:
# The number of partitions recomended are the same as CPU available in your PC/VM
#print(f'The number of CPU available are: {multiprocessing.cpu_count()}')
df = reviews[['cleaned_text', 'stars']]
dask_dataframe = ddf.from_pandas(df, npartitions=2)

In [24]:
dask_dataframe.head()

Unnamed: 0,cleaned_text,stars
0,i love deagan's i do i really do the atmospher...,3.5
1,we walked into melt did you want to put your n...,3.5
2,i commented on how slow the service was last a...,3.5
3,we walked in off the streets on a september ni...,3.5
4,brunch on saturday was excellent the bloody ma...,3.5


In [27]:
nlps = dask_dataframe['cleaned_text'].map(lambda x: nlp(x))

In [34]:
docs = nlp(reviews['cleaned_text'])

TypeError: Argument 'string' has incorrect type (expected str, got Series)

In [39]:
def apply_lemma(string):
    '''
    This function takes a sentence and returns a clean text
    '''
    doc = nlp(string)
    l_token = [token.lemma_ for token in doc if not token.is_punct | token.is_space | token.is_digit | 
               token.like_url | token.like_num | token.like_email & token.is_oov]
    return ' '.join(l_token)

In [None]:
%%time

reviews['lemmas'] = reviews['cleaned_text'].apply(lambda x: apply_lemma(x))

In [32]:
%%time
def token_filter(token):
    return not (token.is_punct | token.is_space | token.is_stop)

filtered_tokens = []
for doc in nlp.pipe(docs):
    tokens = [token.lemma_ for token in doc if token_filter(token)]
    filtered_tokens.append(tokens)

NameError: name 'docs' is not defined

In [31]:
%%time
def token_filter(token):
    return not (token.is_punct | token.is_space | token.is_stop)

filtered_tokens = []
for doc in nlps:
    tokens = [token.lemma_ for token in doc if token_filter(token)]
    filtered_tokens.append(tokens)

MemoryError: Unable to allocate 461. KiB for an array with shape (410, 288) and data type float32