In [1]:
import nltk
nltk.download('stopwords')
nltk.download('twitter_samples')
from nltk.corpus import stopwords, twitter_samples

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [2]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [3]:
import numpy as np

train_pos = pos_tweets[:4000]
test_pos = pos_tweets[4000:]
train_neg = neg_tweets[:4000]
test_neg = neg_tweets[4000:]
train_x = train_pos + train_neg
test_x = test_pos + test_neg
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [4]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [5]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [6]:
train_x[:10]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',
 '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',
 "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI",
 '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.',
 'Jgh , but we have to go to Bayan :D bye',
 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing 

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=process_tweet)
train_x_vecs = vectorizer.fit_transform(train_x).toarray()

In [36]:
train_x_vecs.shape

(8000, 9084)

In [16]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(train_x_vecs, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
test_x_vecs = vectorizer.transform(test_x)
preds = model.predict(test_x_vecs)

In [23]:
preds

array([0., 0., 1., ..., 0., 0., 0.])

In [24]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print(f'accuracy={accuracy_score(test_y, preds)}')
print(f'f1={f1_score(test_y, preds)}')
print(f'confusion={confusion_matrix(test_y, preds)}')

accuracy=0.5905
f1=0.4095169430425379
confusion=[[897 103]
 [716 284]]


In [27]:
train_x_vecs.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [44]:
import numpy as np

class NaiveBayes:
    def __init__(self):
        self.num_classes = 0
        self.num_features = 0
        self.log_prior = None
        self.log_likelihood = None
    
    def fit(self, X, y):
        self.num_classes = len(np.unique(y))
        self.num_features = X.shape[1]
        self.log_prior = np.zeros(self.num_classes)
        self.log_likelihood = np.zeros((self.num_features, self.num_classes))

        for c in range(self.num_classes):
            self.log_prior[c] = np.log(len(y[y == c]) / len(y))
        
        for f in range(self.num_features):
            for c in range(self.num_classes):
                sub_x = X[y == c]
                self.log_likelihood[f, c] = np.log((np.sum(sub_x[:, f]) + 1) / (np.sum(sub_x)) + self.num_features)

        return self

    def predict(self, X):
        probs = np.dot(X, self.log_likelihood) + self.log_prior.reshape((1, -1))
        return np.argmax(probs, axis=1)


In [45]:
model = NaiveBayes()
model.fit(train_x_vecs, train_y)

<__main__.NaiveBayes at 0x7f5d8f796fd0>

In [47]:
test_x_vecs = vectorizer.transform(test_x)
predictions = model.predict(test_x_vecs.toarray())

In [49]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print(f'accuracy={accuracy_score(test_y, predictions)}')
print(f'f1={f1_score(test_y, predictions)}')
print(f'confusion={confusion_matrix(test_y, predictions)}')

accuracy=0.9935
f1=0.993483709273183
confusion=[[996   4]
 [  9 991]]
