# Filtering Words by Ratio of Positive to Negative Counts

This notebook measures how positive or negative a word is without calculating the log likelihood. It compares how often the word appears in positive versus negative tweets, then keeps only the words whose positive-to-negative ratio is at or above a minimum (positive words) or at or below a maximum (negative words).

## 1. Initializing

In [22]:
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import TweetTokenizer
import re
import string
import numpy as np

In [23]:
# Fetch the NLTK resources used below: the labelled tweet corpus
# (twitter_samples) and the stop-word lists (stopwords).
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Mahmoud\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mahmoud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Load the positive and negative tweet texts from the NLTK sample corpus.
postw = twitter_samples.strings('positive_tweets.json')
negtw = twitter_samples.strings('negative_tweets.json')

# Inputs: every positive tweet followed by every negative tweet.
x = postw + negtw

# Labels: a column of ones (one per positive tweet) stacked on top of a
# column of zeros (one per negative tweet), in the same order as x.
y = np.concatenate(
    (np.ones((len(postw), 1)), np.zeros((len(negtw), 1))),
    axis=0,
)

## 2. Tweet processing function

**Input**: a string containing a tweet  
**Output**: a list of words containing the processed tweet

In [25]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    The text is stripped of ``$``-prefixed words, a leading ``RT`` marker,
    hyperlinks and ``#`` signs, then tokenized with ``TweetTokenizer``
    (``preserve_case=False``, ``strip_handles=True``, ``reduce_len=True``).
    English stop words and punctuation tokens are dropped and every
    remaining token is stemmed with ``PorterStemmer``.

    Args:
        tweet: a string containing a tweet.

    Returns:
        A list of words containing the processed tweet.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the raw stop-word list
    # would be scanned linearly once per token.
    stopwords_english = set(stopwords.words('english'))

    # removing hyperlinks, Twitter marks and styles
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # NOTE(review): the trailing `.*` also removes any text that follows the
    # link on the same line. Kept as-is so the frequencies stay unchanged.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)

    # tokenizing tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # removing stop words and punctuations, stemming
    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so it rejects single punctuation characters and
    # any run of characters that appears contiguously in that string.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

## 3. Building word frequencies

Define the function that builds the word-frequency dictionary.

**Input**: a list of tweets, an m x 1 array with the sentiment label of each tweet (either 0 or 1)  
**Output**: a dictionary mapping each (word, sentiment) pair to its frequency

In [26]:
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Args:
        tweets: a list of tweets.
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1), in the same order as ``tweets``.

    Returns:
        A dictionary mapping each (word, sentiment) pair to its frequency.
    """
    # np.ravel always yields a 1-D array, so a single label (e.g. a 1 x 1
    # array) still becomes a one-element list. np.squeeze would collapse it
    # to a scalar, which zip() cannot iterate.
    labels = np.ravel(ys).tolist()

    freqs = {}
    for label, tweet in zip(labels, tweets):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

Creating frequency dictionary:

In [27]:
# Count every (word, label) pair across the whole corpus, then report the
# type and size of the resulting dictionary.
freqs = build_freqs(x, y)
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 13067


Define the lookup function:

**Input**: a dictionary with the frequency of each pair (or tuple), the word to look up, the label corresponding to the word  
**Output**: the number of times the word with its corresponding label appears

In [28]:
def lookup(freqs, word, label):
    """Return how many times `word` was counted under `label`.

    Args:
        freqs: a dictionary with the frequency of each (word, label) pair.
        word: the word to look up.
        label: the label corresponding to the word.

    Returns:
        The stored count for (word, label), or 0 when the pair is absent.
    """
    return freqs.get((word, label), 0)

## 4. Get ratio function

**Input**: a dictionary mapping each (word, sentiment) pair to its frequency, and the word to look up  
**Output**: a dictionary with keys 'positive', 'negative', and 'ratio', where 'ratio' is (positive + 1) / (negative + 1).

In [29]:
def get_ratio(freqs, word):
    """Summarise the positive and negative counts of `word` and their ratio.

    Args:
        freqs: a dictionary with the frequency of each (word, label) pair.
        word: the word to look up.

    Returns:
        A dictionary with keys 'positive', 'negative' and 'ratio', where
        'ratio' is (positive + 1) / (negative + 1).
    """
    positive = lookup(freqs, word, 1)
    negative = lookup(freqs, word, 0)

    # Adding 1 to both counts keeps the denominator non-zero for words that
    # have no negative occurrences.
    return {
        'positive': positive,
        'negative': negative,
        'ratio': (positive + 1) / (negative + 1),
    }

## 5. Get words by threshold

**Input**: a dictionary mapping each (word, sentiment) pair to its frequency, a label (1 for positive, 0 for negative), and the ratio used as the cutoff for including a word in the returned dictionary  
**Output**: a dictionary mapping each selected word to its positive count, negative count, and ratio of positive to negative counts

In [30]:
def get_words_by_threshold(freqs, label, threshold):
    """Select the words whose positive/negative ratio passes a cutoff.

    Args:
        freqs: a dictionary with the frequency of each (word, label) pair.
        label: 1 to keep words with ratio >= threshold (positive words),
            0 to keep words with ratio <= threshold (negative words).
        threshold: the ratio used as the cutoff.

    Returns:
        A dictionary mapping each selected word to the dictionary returned
        by ``get_ratio`` (its positive count, negative count and ratio).
        Any label other than 0 or 1 selects nothing and yields an empty
        dictionary.
    """
    # The comparison depends only on `label`, so pick it once up front
    # instead of re-testing the label for every key.
    if label == 1:
        def passes(ratio):
            return ratio >= threshold
    elif label == 0:
        def passes(ratio):
            return ratio <= threshold
    else:
        return {}

    word_list = {}
    seen = set()

    for word, _ in freqs:
        # A word counted under both labels appears in two keys; its ratio is
        # the same either way, so compute it only once.
        if word in seen:
            continue
        seen.add(word)

        pos_neg_ratio = get_ratio(freqs, word)
        if passes(pos_neg_ratio['ratio']):
            word_list[word] = pos_neg_ratio

    return word_list

## 6. Testing the function

Finding positive words at or above a threshold

In [31]:
# Words whose (positive + 1) / (negative + 1) ratio is at least 15.
get_words_by_threshold(freqs, label=1, threshold=15)

{'followfriday': {'positive': 25, 'negative': 0, 'ratio': 26.0},
 ':)': {'positive': 3568, 'negative': 2, 'ratio': 1189.6666666666667},
 'flipkartfashionfriday': {'positive': 17, 'negative': 0, 'ratio': 18.0},
 ':D': {'positive': 629, 'negative': 0, 'ratio': 630.0},
 ':p': {'positive': 137, 'negative': 0, 'ratio': 138.0},
 ':-)': {'positive': 692, 'negative': 0, 'ratio': 693.0},
 "here'": {'positive': 25, 'negative': 0, 'ratio': 26.0},
 'youth': {'positive': 19, 'negative': 0, 'ratio': 20.0},
 'bam': {'positive': 44, 'negative': 1, 'ratio': 22.5},
 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 ';)': {'positive': 27, 'negative': 0, 'ratio': 28.0},
 'stat': {'positive': 60, 'negative': 0, 'ratio': 61.0},
 'via': {'positive': 69, 'negative': 1, 'ratio': 35.0},
 'fback': {'positive': 27, 'negative': 0, 'ratio': 28.0}}

Finding negative words at or below a threshold

In [32]:
# Words whose (positive + 1) / (negative + 1) ratio is at most 0.02.
get_words_by_threshold(freqs, label=0, threshold=0.02)

{':(': {'positive': 1, 'negative': 4571, 'ratio': 0.0004374453193350831},
 ':-(': {'positive': 0, 'negative': 493, 'ratio': 0.0020242914979757085},
 '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996}}