# Sentiment Analysis: Preprocessing

### Import libraries

In [3]:
import pandas as pd
import re
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nechamaborisute/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nechamaborisute/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load in data

In [5]:
# Load in data 
data = pd.read_csv('../data/data.csv.zip', encoding='latin-1')

# rename columns
data.columns = ['sentiment', 'tweet_id', 'date', 'query', 'user', 'text']

# Look at data information, dtypes, nulls
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048572 entries, 0 to 1048571
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1048572 non-null  int64 
 1   tweet_id   1048572 non-null  int64 
 2   date       1048572 non-null  object
 3   query      1048572 non-null  object
 4   user       1048572 non-null  object
 5   text       1048572 non-null  object
dtypes: int64(2), object(4)
memory usage: 48.0+ MB


In [6]:
data.head()

Unnamed: 0,sentiment,tweet_id,date,query,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Data file format has 6 fields:
* 0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
* 1 - the id of the tweet (2087)
* 2 - the date of the tweet (Sat May 16 23:58:44 UTC 2009)
* 3 - the query (lyx). If there is no query, then this value is NO_QUERY.
* 4 - the user that tweeted (robotickilldozr)
* 5 - the text of the tweet (Lyx is cool)

### Preprocess text data

In [7]:
data['text'].head(10)

0    is upset that he can't update his Facebook by ...
1    @Kenichan I dived many times for the ball. Man...
2      my whole body feels itchy and like its on fire 
3    @nationwideclass no, it's not behaving at all....
4                        @Kwesidei not the whole crew 
5                                          Need a hug 
6    @LOLTrish hey  long time no see! Yes.. Rains a...
7                           I just re-pierced my ears 
8    @caregiving I couldn't bear to watch it.  And ...
9    @octolinz16 It it counts, idk why I did either...
Name: text, dtype: object

The text data is messy and has a lot of characters I'd like to remove before analysis, so I'm going to write a function to go over and clean all the tweets.

* stop words
* punctuation
* @, url, user
* lowercase


In [10]:
def process_tweet(tweet_text, min_length):
    
    # get common stop words
    stop_words = stopwords.words('english')
    
    # instantiate lemmatizer
    wnl = WordNetLemmatizer()
    
    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # lower case everything
    tweet_lower = tweet_text.lower()
    
     #remove mentions, hashtags, and urls, strip whitspace and breaks
    tweet_lower = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+|http\S+", "", tweet_lower).strip().replace("\r", "").replace("\n", "").replace("\t", "")
    
    # remove stop words and punctuations 
    tweet_norm = [x for x in word_tokenize(tweet_lower) if ((x.isalpha()) & (x not in stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(tweet_norm))) 

    # now we are going to have a cutoff here. any tokenized cocument with length < min length will be removed from corpus
    if len(wordnet_tagged) <= min_length:
        return ''
    else:
         # rejoins lemmatized sentence 
        tweet_norm = " ".join([wnl.lemmatize(x[0], x[1]) for x in wordnet_tagged if x[1] is not None])
        return tweet_norm

In [11]:
# apply function to tweets
data['text'] = data['text'].apply(process_tweet, args = [10])

In [13]:
# get rid of empty tweets
data_cleaned = data[data['text'] != '']
data_cleaned['text']

0          upset update facebook texting cry result schoo...
6          hey long time see yes rain bite bit lol fine t...
27         want go promote gear groove unfornately ride b...
31         ok sick spend hour sit shower cause sick stand...
32                 ill tell ya story later good day ill hour
                                 ...                        
1048556    watchin espn first take favorite mornin show l...
1048558    muhaha thankgoodness miss last date rmcaat jun...
1048559    good morning people twitter tgifriday thats wa...
1048565    today message church service deliver skype fre...
1048566    back home thought do week call alter something...
Name: text, Length: 207344, dtype: object

In [15]:
# data_cleaned.to_csv('../data/data_cleaned.csv')