In [1]:
import re
import nltk
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
print(len(all_positive_tweets))
print(len(all_negative_tweets))

5000
5000


In [5]:
train_positive = all_positive_tweets[:4000]
test_positive = all_positive_tweets[4000:]

train_negative = all_negative_tweets[:4000]
test_negative = all_negative_tweets[4000:]

train_x = train_positive + train_negative
test_x = test_positive + test_negative

print(len(train_x))
print(len(test_x))

8000
2000


In [6]:
print(len(train_positive))
print(len(test_positive))
print(len(train_negative))
print(len(test_negative))

4000
1000
4000
1000


In [7]:
train_y = np.append(np.ones((len(train_positive), 1)), np.zeros((len(train_negative), 1)), axis=0)
test_y = np.append(np.ones((len(test_positive), 1)), np.zeros((len(test_negative), 1)), axis=0)

In [8]:
print(train_y.shape)
print(test_y.shape)

(8000, 1)
(2000, 1)


In [9]:
def process_tweets(tweet):

  tweet_0 = re.sub(r'^RT[\s]+', '', tweet)
  tweet_0 = re.sub(r'https?://[^\s\n\r]+', '', tweet_0)
  tweet_0 = re.sub(r'#', '', tweet_0)

  tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles = True)
  tokenize_tweet = tokenizer.tokenize(tweet_0)

  stopwards_english = stopwords.words('english')
  panctuations = string.punctuation

  clean_tweet = []

  for word in tokenize_tweet:
    if (word not in stopwards_english and
        word not in panctuations):

      clean_tweet.append(word)

  stem_tweets = []
  streammer = PorterStemmer()

  for word in clean_tweet:
    stem_word = streammer.stem(word)
    stem_tweets.append(stem_word)

  return stem_tweets

In [10]:
def build_freqs(tweet, ys):
  yslist = np.squeeze(ys).tolist()

  freqs = {}

  for y, tweet in zip(yslist, tweet):
    for word in process_tweets(tweet):
      pair = (word, y)

      if pair in freqs:
        freqs[pair]+= 1
      else:
        freqs[pair] = 1

  return freqs

In [11]:
freqs_0 = build_freqs(train_x, train_y)

print(type(freqs_0))
print(len(freqs_0))

<class 'dict'>
11426


In [12]:
def extract_features(tweet, freqs):
  word_0 = process_tweets(tweet)

  x = np.zeros((1, 3))
  x[0, 0] = 1

  positive_sum = 0
  negative_sum = 0

  for word in word_0:
    if (word, 1) in freqs_0:
      positive_sum = freqs_0[(word, 1)]
    if (word, 0) in freqs_0:
      negative_sum = freqs_0[(word, 0)]

  x[0, 1] = positive_sum
  x[0, 2] = negative_sum

  assert(x.shape == (1, 3))
  return x

In [13]:
X = np.zeros((len(train_x), 3))

for i in range(len(train_x)):
  X[i, :] = extract_features(train_x[i], freqs_0)

Y = train_y

In [14]:
Y = np.squeeze(Y)

print(Y.shape)
print(X.shape)

(8000,)
(8000, 3)


In [15]:
model = LogisticRegression()
model.fit(X, Y)

In [16]:
X_X = np.zeros((len(test_x), 3))

for i in range(len(test_x)):
  X_X[i, :] = extract_features(test_x[i], freqs_0)

Y_Y = test_y

In [17]:
Y_pred = model.predict(X_X)

In [18]:
# Y_pred

In [19]:
# Y_Y

In [20]:
accuracy = accuracy_score(Y_Y, Y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(Y_Y, Y_pred)
print("Confusion Matrix:\n", conf_matrix)

class_report = classification_report(Y_Y, Y_pred)
print("Classification Report:\n", class_report)

Accuracy: 0.8855
Confusion Matrix:
 [[813 187]
 [ 42 958]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.81      0.88      1000
         1.0       0.84      0.96      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.88      2000
weighted avg       0.89      0.89      0.88      2000



In [33]:
sentence = "I hate this movie"
sentence_0 = extract_features(sentence, freqs_0)
# print(sentence_0)
sentence_0 = np.squeeze(sentence_0)
# print(sentence_0)
predictions = model.predict_proba([sentence_0])



[1.]


In [38]:
tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
tweet_features = extract_features(tweet, freqs_0)
predictions = model.predict_proba(tweet_features)

print("Probability For Postive Sentiment:", predictions[0][1])
print("Probability For Negative Sentiment:", predictions[0][0])

print("\n")

if predictions[0][1] > predictions[0][0]:
  print("Positive Sentiment")
else:
  print("Negative Sentiment")

Probability For Postive Sentiment: 0.47315744207940225
Probability For Negative Sentiment: 0.5268425579205978


Negative Sentiment
