# Data Preprocessing

In [1]:
import re
import pandas as pd
import gensim
import spacy
import swifter
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS

from tqdm import tqdm

# Load the small English spaCy pipeline with the parser and NER components
# disabled; only per-token lemmas are read from it in this notebook.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'ner'],
)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

In [2]:
# Read the raw tweets stored under the 'starbucks' key and keep only the
# first 4000 rows for this run.
df = pd.read_hdf('./../../code/data/starbucks/data.h5', key='starbucks').iloc[:4000]

In [3]:
# (rows, columns) of the 4000-row sample before any cleaning
df.shape

(4000, 28)

In [4]:
# Keep only the first row for each distinct tweet text. The result is
# reassigned rather than using inplace=True: `df` is a slice of the loaded
# frame, and mutating a slice in place can trigger SettingWithCopyWarning.
df = df.drop_duplicates(subset='tweet')

In [5]:
# Discard rows whose tweet text is missing. `subset` is passed as a list
# because older pandas releases only accept a list-like here, and the result
# is reassigned instead of mutating `df` in place.
df = df.dropna(subset=['tweet'])

In [6]:
# (rows, columns) after dropping duplicate and missing tweets
df.shape

(3977, 28)

In [7]:
# Restrict the data to tweets whose language field is 'en'
is_english = df['language'].eq('en')
df = df[is_english]

In [8]:
# Usernames whose tweets are left out of the analysis.
# NOTE(review): 'starbuckspoho' appeared twice in the original list; the
# redundant entry is removed here. Confirm a different handle was not intended.
excluded_usernames = [
    'starbucks_es', 'starbucks_cstm', 'starbucks_j_cpg', 'starbuckspoho',
    'starbuckshomeme', 'starbucksperu',
]
df = df[~df['username'].isin(excluded_usernames)]

In [9]:
# (rows, columns) after the language and username filters
df.shape

(3313, 28)

In [10]:
def preprocess_tweet(text):
    """Normalise a raw tweet into lower-case, space-separated alphanumeric words.

    Steps, in order: lower-case the text; drop @-mentions, hashtags and links;
    turn newlines, non-breaking spaces and the HTML entities ``&amp``,
    ``&lt;`` and ``&gt;`` into spaces; delete every remaining character that
    is not a letter, digit or space; finally collapse runs of spaces and trim
    both ends.

    Args:
        text: The tweet text. Any value is accepted and passed through ``str()``.

    Returns:
        The cleaned string. It is empty (``""``) when nothing but removed
        content was present, so a plain truthiness check detects blank tweets.
    """
    text = str(text).lower()  # Convert the tweet to lower case
    text = re.sub(r'@ *\w*', '', text)  # Remove tagged usernames
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub('\n', ' ', text)  # Replace newline characters with a space
    text = re.sub('\xa0', ' ', text)  # Replace non-breaking spaces ("\xa0")
    # Replace HTML entities with a space. Without the "&lt;"/"&gt;" cases the
    # alphanumeric filter below would leave stray "lt"/"gt" words behind.
    text = re.sub(r'&amp|&lt;|&gt;', ' ', text)
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # Keep only alphanumerics and spaces
    # Collapse repeated spaces and trim the ends; otherwise a tweet made only
    # of mentions/links would come back as " ", which is truthy.
    return ' '.join(text.split())

# Store the cleaned text in a new column next to the raw tweet
df['preprocessed_tweet'] = df['tweet'].map(preprocess_tweet)

In [11]:
# Remove tweets that are blank after preprocessing. Whitespace is stripped
# before the check because a string containing only spaces is truthy and
# would survive a plain bool() test.
has_text = df['preprocessed_tweet'].str.strip().ne('')
df = df[has_text]

In [12]:
def lemmatization(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    A token whose lemma is the placeholder '-PRON-' is emitted as its
    lower-cased surface form instead. The pieces are joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Lemmatize every cleaned tweet and write the result back to the same column
lemmatized_tweets = df['preprocessed_tweet'].swifter.apply(lemmatization)
df['preprocessed_tweet'] = lemmatized_tweets

Pandas Apply:   0%|          | 0/3313 [00:00<?, ?it/s]

In [13]:
# Gensim's stop-word list extended with 'starbucks' and 'starbuck'.
# Built once here instead of on every call to remove_stopwords().
_TWEET_STOPWORDS = STOPWORDS.union({'starbucks', 'starbuck'})


def remove_stopwords(text):
    """Remove stop words from tweet text.

    The text is split with NLTK's ``word_tokenize``; tokens found in
    ``_TWEET_STOPWORDS`` are dropped and the rest are joined with single
    spaces.

    Args:
        text: The tweet text to filter.

    Returns:
        The remaining tokens as one space-separated string (empty when every
        token was a stop word).
    """
    kept_tokens = [
        word for word in word_tokenize(text) if word not in _TWEET_STOPWORDS
    ]
    return " ".join(kept_tokens)

# Strip stop words from every lemmatized tweet and write the result back
tweets_without_stopwords = df['preprocessed_tweet'].swifter.apply(remove_stopwords)
df['preprocessed_tweet'] = tweets_without_stopwords

Pandas Apply:   0%|          | 0/3313 [00:00<?, ?it/s]

In [14]:
# Show the raw and the cleaned text side by side for a visual check
df.loc[:, ['tweet', 'preprocessed_tweet']]

Unnamed: 0,tweet,preprocessed_tweet
0,WHY Y’all lie &amp; say Starbucks took EBT now...,y lie ebt pour shii
1,Thanks A Latta Giveaway\n#WIN a $10 Starbucks ...,thank latta giveaway 10 amazon gc
2,I used to hate Starbucks but now I love it so ...,use hate love want invent new frappuccino flavor
3,philz needs to replace the starbucks on story ...,philz need replace story white fr
4,@staceyabrams @BeeForGeorgia There were more p...,people grand opening
...,...,...
3994,Hey @Starbucks I think I maybe the only one in...,hey think maybe cinnamon dolce syrup
3995,1h30 walk done !! now im resting in a mall wit...,1h30 walk rest mall ice americano fast 1h30 wa...
3996,@VP you know I am working this corner to get ...,know work corner money coffee tuesday election...
3997,Starting soon! Register here to hear from youn...,start soon register hear young striker unionis...


In [15]:
# Persist the processed frame into the same HDF5 file it was read from,
# under a separate key.
df.to_hdf(
    './../../code/data/starbucks/data.h5',
    key='preprocessed_starbucks',
)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['Unnamed: 0', 'tweet', 'conversation_id', 'date', 'hashtags',
       'inReplyToTweetId', 'reply_to', 'language', 'likes_count', 'media',
       'mentions', 'quoted_tweet', 'retweets_count', 'link',
       'user_status_count', 'location', 'name', 'description', 'verified',
       'url', 'user_id', 'username', 'preprocessed_tweet'],
      dtype='object')]

  df.to_hdf('./../../code/data/starbucks/data.h5', key='preprocessed_starbucks')
