In [75]:
import nltk
import pandas as pd
import numpy as np
import spacy
import re
from string import punctuation

In [2]:
reviews = pd.read_excel('../data/airbnb_text_reviews.xlsx')

In [3]:
reviews.sample(1)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
112515,1370405,21147209,2014-10-12,21057777,Andre,Excellent value for money! Christa and Zach we...


In [4]:
reviews.sample(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
359415,11775483,505044601,2019-08-08,2743522,Alexandra,Anthony was very responsive and helpful to me ...
341417,10482409,183225312,2017-08-16,138653197,Elijah,Perfect place to stay on a budget in the middl...
1027629,54221901,582576044744759040,2022-03-14,167799886,James,"Spacious, clean, great location."


In [5]:
reviews['comments'] = reviews['comments'].astype('string')

In [6]:
reviews['comments'].describe()

count         1048358
unique        1000410
top       Great place
freq             1366
Name: comments, dtype: object

## Cleaning text with NLTK

### Note some of the reviews are empty except contain only punctuation

In [49]:
reviews.loc[(reviews['comments'] == '.') | (reviews['comments'] == '!') | (reviews['comments'] == '?') ][:5]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
609,5178,458595941119654016,2021-09-24,409836786,Elya,.
8075,27644,503596934129974976,2021-11-25,38781491,Jay,.
9103,31994,180524495,2017-08-10,98704132,Martin (Aka Max),.
9106,31994,194910000,2017-09-17,99756545,Vardhan,.
9159,31994,366953207,2019-01-03,213464080,Glenda,.


### Dealing with NANs in our dataframe

In [84]:
for index, string in enumerate(reviews['comments']):
  if type(string) != type('example'):
      reviews['comments'][index] = ""
reviews['comments'].shape

(1048575,)

In [82]:
# def remove_punctuation_and_digits(text):
#     """Remove repeated punctuation in strings"""
#     pattern = re.compile("[\d{}]+$".format(re.escape(punctuation)))
#     text =  [item for item in text if not pattern.match(item)]
#     return text
# reviews['comments2'] = reviews['comments'].apply(lambda x: remove_punctuation_and_digits(x))

## Defining a custom class for cleaning text data

In [85]:
# importing text preprocessing libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# integrating our preprocessing into a pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [133]:
class TextPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, lemmatize):
        self.lemmatize = lemmatize
        #define attributes to store if text preprocessing requires fitting from data
        pass

    def fit(self, data, y = 0):
        # this is where you would fit things like corpus specific stopwords
        # fit probable bigrams with bigram model in here

        # save as parameters of Text preprocessor

        return self

    def transform(self, data, y = 0):
        fully_normalized_corpus = data.apply(self.process_doc)

        return fully_normalized_corpus


    def process_doc(self, doc, lemmatize = 'nltk'):

        if lemmatize == 'nltk':
            #initialize lemmatizer
            wnl = WordNetLemmatizer()
            stop_words = stopwords.words('english')

            # helper function to change nltk's part of speech tagging to a wordnet format.
            def pos_tagger(nltk_tag):
                if nltk_tag.startswith('J'):
                    return wordnet.ADJ
                elif nltk_tag.startswith('V'):
                    return wordnet.VERB
                elif nltk_tag.startswith('N'):
                    return wordnet.NOUN
                elif nltk_tag.startswith('R'):
                    return wordnet.ADV
                else:
                    return None
            # remove stop words and punctuations, then lower case
            doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words))]

            #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize

            # creates list of tuples with tokens and POS tags in wordnet format
            wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm)))
            doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]

        else:
            # remove stop words and punctuations, then lower case
            doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words))]

        return " ".join(doc_norm)

In [134]:
proc = TextPreprocessor(lemmatize = 'ntlk')

reviews['comments_clean'] =  proc.fit_transform(reviews['comments'])

In [135]:
reviews['comments_clean']

0                                                 great host
1          nice room price great neighborhood john accomm...
2                                  very nice apt new remodel
3          great place stay john great host great man hou...
4                                                           
                                 ...                        
1048570    great stay clean wish front door keep open als...
1048571                                          place great
1048572                   have great stay clean amenity need
1048573    nancy great host communicative responsive apar...
1048574                                 amaze host love stay
Name: comments_clean, Length: 1048575, dtype: object

In [137]:
#reviews.to_excel('text_reviews_processed.xlsx')