## Importing libraries

In [1]:
import nltk
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
import random
import numpy as np

import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

In [2]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

## Visualizing tweets

In [3]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
print(f'Number of positive tweets: {len(all_positive_tweets)}')
print(f'Number of negative tweets: {len(all_negative_tweets)}')

print(f'Type of all positive tweets: {type(all_positive_tweets)}')
print(f'Type of a tweet entry: {type(all_positive_tweets[0])}')

Number of positive tweets: 5000
Number of negative tweets: 5000
Type of all positive tweets: <class 'list'>
Type of a tweet entry: <class 'str'>


In [5]:
# print positive tweet in green color
print(f'\033[92m {all_positive_tweets[np.random.randint(0,len(all_positive_tweets))]}')

# print negative tweet in red color
print(f'\033[91m {all_negative_tweets[np.random.randint(0,len(all_negative_tweets))]}')

[92m @arjaycj please follow me i love you so much and it would mean the world!! :)
[91m @myteksi hi, may i request a promo code pls? Tried using iwantin but got rejected just now :(


## Pre-processing raw text

For NLP, the preprocessing steps are comprised of the following tasks:
- Tokenizing the string
- Lowercasing
- Removing stopwords and punctuation
- Stemming (tuning, tune, tuned => tun) Stem the word

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
tweet = all_positive_tweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

### Remove hyperlinks, twitter marks, and styles


In [8]:
# remove old style retweet text "RT"
tweet2 = re.sub(r'^RT[\s]+', '', tweet)

# remove hyperlinks
tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)

# remove hash sign
tweet2 = re.sub(r'#', '', tweet2)

print(tweet2)

My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… 


### Tokenize the string

In [9]:
# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# tokenize tweets
tweet_tokens = tokenizer.tokenize(tweet2)

print('Tokenized string:')
print(tweet_tokens)

Tokenized string:
['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']


### Remove stopwords and punctuation

In [10]:
# import english stopwords list from nltk
stopwords_english = stopwords.words('english')

print('Stopwords:')
print(stopwords_english,'\n')

print('Punctuations:')
print(string.punctuation)

Stopwords:
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

In [11]:
print('\033[91m', tweet_tokens)

tweets_clean = []

for word in tweet_tokens:
  if (word not in stopwords_english and
      word not in string.punctuation):
    tweets_clean.append(word)

print('After removing stopwords:')
print('\033[94m', tweets_clean)

[91m ['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']
After removing stopwords:
[94m ['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']


### Stemming

Process of converting a word to its most general form, or stem.

Helps in reducing the size of our vocabulary.

- learn
- learning
- learned
- learnt 

All these words are stemmed from its commong root learn. 

Stemming process can produce words that are not correct spellings of the root word. 

- happy
- happiness
- happier

The prefix happi is more commonly used. So we choose happi, and not happ because happ relates to happen. 


In [12]:
# Instantiate the stemming class
stemmer = PorterStemmer()

tweets_stem = []

for word in tweets_clean:
  stem_word = stemmer.stem(word)
  tweets_stem.append(stem_word)

print('Stemmed words:')
print(tweets_stem)

Stemmed words:
['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


In [13]:
# Let's make a function to process a give tweet 
def process_tweet(tweet):
  # Remove retweets, hyperlinks, hashtag sign
  tweet = re.sub(r'^RT[\s]+', '', tweet)
  tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
  tweet = re.sub(r'#', '', tweet)

  # tokenize
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # remove stopwords and punctuations
  stopwords_english = stopwords.words('english')
  tweets_clean = []

  for word in tweet_tokens:
    if (word not in stopwords_english and
        word not in string.punctuation):
      tweets_clean.append(word)

  # stemming
  stemmer = PorterStemmer()
  tweets_stem = []

  for word in tweets_clean:
    stem_word = stemmer.stem(word)
    tweets_stem.append(stem_word)

  return tweets_stem

In [14]:
tweets_stem = process_tweet(tweet)
print(tweets_stem)

['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


## Building word frequencies

In [15]:
# concatenate the tweets
tweets = all_positive_tweets + all_negative_tweets
print(f'Number of tweets: {len(tweets)}')

Number of tweets: 10000


In [16]:
# make labels
labels = np.append(np.ones(len(all_positive_tweets)), np.zeros(len(all_negative_tweets))).reshape(-1,1)
labels

array([[1.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [17]:
labels.shape

(10000, 1)

In [18]:
# build freq dictionary
def build_freqs(tweets, labels):
  """
  Input:
    tweets: a list of tweets
    labels: an mx1 array with the sentiment label of each tweet (1 or 0)
  Output:
    freqs: a dictionary mapping each (word, sentiment) pair to its frequency
  """

  labels_list = np.squeeze(labels).tolist()

  freqs = {}
  for tweet, label in zip(tweets, labels_list):
    for word in process_tweet(tweet):
      pair = (word, label)
      freqs[pair] = freqs.get(pair, 0) + 1
  
  return freqs

In [19]:
# create frequency dictionary
freqs = build_freqs(tweets, labels)
print(f'Length of freqs = {len(freqs)}')

Length of freqs = 13075


In [20]:
print(freqs)

{('followfriday', 1.0): 25, ('top', 1.0): 32, ('engag', 1.0): 7, ('member', 1.0): 16, ('commun', 1.0): 33, ('week', 1.0): 83, (':)', 1.0): 3568, ('hey', 1.0): 76, ('jame', 1.0): 7, ('odd', 1.0): 2, (':/', 1.0): 5, ('pleas', 1.0): 97, ('call', 1.0): 37, ('contact', 1.0): 7, ('centr', 1.0): 2, ('02392441234', 1.0): 1, ('abl', 1.0): 8, ('assist', 1.0): 1, ('mani', 1.0): 33, ('thank', 1.0): 620, ('listen', 1.0): 16, ('last', 1.0): 47, ('night', 1.0): 68, ('bleed', 1.0): 2, ('amaz', 1.0): 51, ('track', 1.0): 5, ('scotland', 1.0): 2, ('congrat', 1.0): 21, ('yeaaah', 1.0): 1, ('yipppi', 1.0): 1, ('accnt', 1.0): 2, ('verifi', 1.0): 2, ('rqst', 1.0): 1, ('succeed', 1.0): 1, ('got', 1.0): 69, ('blue', 1.0): 9, ('tick', 1.0): 1, ('mark', 1.0): 1, ('fb', 1.0): 6, ('profil', 1.0): 2, ('15', 1.0): 5, ('day', 1.0): 246, ('one', 1.0): 129, ('irresist', 1.0): 2, ('flipkartfashionfriday', 1.0): 17, ('like', 1.0): 233, ('keep', 1.0): 68, ('love', 1.0): 400, ('custom', 1.0): 4, ('wait', 1.0): 70, ('long',

## Logistic Regression

### Preparing the data


In [21]:
# split the data into two pieces, one for training and one for testing
# 80% training set, 20% test set

split = 0.8
pos_size = (int)(len(all_positive_tweets) * split)
train_pos = all_positive_tweets[:pos_size]
test_pos = all_positive_tweets[pos_size:]

train_neg = all_negative_tweets[:pos_size]
test_neg = all_negative_tweets[pos_size:]

X_train = train_pos + train_neg
X_test = test_pos + test_neg

In [22]:
# combine positve and negative labels
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [23]:
print(f'Train y shape: {y_train.shape}')
print(f'Test y shape: {y_test.shape}')

Train y shape: (8000, 1)
Test y shape: (2000, 1)


In [24]:
# create frequency dictionary
freqs = build_freqs(X_train, y_train)
print(f'len(freqs): {str(len(freqs.keys()))}')

len(freqs): 11345


### Sigmoid function

In [25]:
def sigmoid(z):
  """
  Input: 
    z: is the input (can be scalar or array)
  Output:
    h: sigmoid of z
  """

  h = np.divide(1, 1 + np.exp(-z))
  return h

In [26]:
## Test sigmoid function
if (sigmoid(0) == 0.5):
  print('SUCCESS!')
else:
  print('Somethings wrong')

SUCCESS!


### Logistic regression: regression + sigmoid

Regression:
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + ... \theta_N x_N$$

Logistic regression
$$ h(z) = \frac{1}{1+\exp^{-z}}$$
$$z = \theta_0 x_0 + \theta_1 x_1 + \theta_2 x_2 + ... \theta_N x_N$$


### Cost function and Gradient

The cost function used for logistic regression is the average of the log loss across all training examples:

$$J(\theta) = -\frac{1}{m} \sum_{i=1}^m y^{(i)}\log (h(z(\theta)^{(i)})) + (1-y^{(i)})\log (1-h(z(\theta)^{(i)})) $$
* $m$ is the number of training examples
* $y^{(i)}$ is the actual label of the i-th training example.
* $h(z(\theta)^{(i)})$ is the model's prediction for the i-th training example.

The loss function for a single training example is
$$ Loss = -1 \times \left( y^{(i)}\log (h(z(\theta)^{(i)})) + (1-y^{(i)})\log (1-h(z(\theta)^{(i)})) \right)$$


In [27]:
y = 0 # true label
h = 0.999999999  # model predicts close to 1

# so loss should be high
loss = - (y * np.log(h) + (1 - y) * np.log(1 - h))
print(loss)

y = 0 # true label
h = 0.001  # model predicts close to 0

# so loss should be low
loss = - (y * np.log(h) + (1 - y) * np.log(1 - h))
print(loss)

20.723265865228342
0.0010005003335835344


#### Update the weights

To update your weight vector $\theta$, you will apply gradient descent to iteratively improve your model's predictions.  
The gradient of the cost function $J$ with respect to one of the weights $\theta_j$ is:

$$\nabla_{\theta_j}J(\theta) = \frac{1}{m} \sum_{i=1}^m(h^{(i)}-y^{(i)})x^{(i)}_j$$
* 'i' is the index across all 'm' training examples.
* 'j' is the index of the weight $\theta_j$, so $x^{(i)}_j$ is the feature associated with weight $\theta_j$

* To update the weight $\theta_j$, we adjust it by subtracting a fraction of the gradient determined by $\alpha$:
$$\theta_j = \theta_j - \alpha \times \nabla_{\theta_j}J(\theta) $$
* The learning rate $\alpha$ is a value that we choose to control how big a single update will be.



### Instructions: Implement gradient descent function
* The number of iterations `num_iters` is the number of times that you'll use the entire training set.
* For each iteration, you'll calculate the cost function using all training examples (there are `m` training examples), and for all features.
* Instead of updating a single weight $\theta_i$ at a time, we can update all the weights in the column vector:  
$$\mathbf{\theta} = \begin{pmatrix}
\theta_0
\\
\theta_1
\\ 
\theta_2 
\\ 
\vdots
\\ 
\theta_n
\end{pmatrix}$$
* $\mathbf{\theta}$ has dimensions (n+1, 1), where 'n' is the number of features, and there is one more element for the bias term $\theta_0$ (note that the corresponding feature value $\mathbf{x_0}$ is 1).
* The 'logits', 'z', are calculated by multiplying the feature matrix 'x' with the weight vector 'theta'.  $z = \mathbf{x}\mathbf{\theta}$
    * $\mathbf{x}$ has dimensions (m, n+1) 
    * $\mathbf{\theta}$: has dimensions (n+1, 1)
    * $\mathbf{z}$: has dimensions (m, 1)
* The prediction 'h', is calculated by applying the sigmoid to each element in 'z': $h(z) = sigmoid(z)$, and has dimensions (m,1).
* The cost function $J$ is calculated by taking the dot product of the vectors 'y' and 'log(h)'.  Since both 'y' and 'h' are column vectors (m,1), transpose the vector to the left, so that matrix multiplication of a row vector with column vector performs the dot product.
$$J = \frac{-1}{m} \times \left(\mathbf{y}^T \cdot log(\mathbf{h}) + \mathbf{(1-y)}^T \cdot log(\mathbf{1-h}) \right)$$
* The update of theta is also vectorized.  Because the dimensions of $\mathbf{x}$ are (m, n+1), and both $\mathbf{h}$ and $\mathbf{y}$ are (m, 1), we need to transpose the $\mathbf{x}$ and place it on the left in order to perform matrix multiplication, which then yields the (n+1, 1) answer we need:
$$\mathbf{\theta} = \mathbf{\theta} - \frac{\alpha}{m} \times \left( \mathbf{x}^T \cdot \left( \mathbf{h-y} \right) \right)$$

In [65]:
def gradient_descent(x, y, theta, alpha, epochs):
  """
  Input:
    x: matrix of features of shape (m, n+1)
    y: corresponding labels of the input matrix x, dimensions (m,1)
    theta: weight vector of dimnesion (n+1, 1)
    alpha: learning rate
    epochs: number of iterations you want to train your model for
  
  Output: 
    J: final cost
    theta: final weight vector
  """

  m = x.shape[0]
  
  for i in range(1, epochs+1):

    # get z, the dot product of x and theta 
    z = x @ theta

    assert(z.shape == (m,1))

    # get the sigmoid of z
    h = sigmoid(z)

    # calculate the cost function
    J = (-1/m) * ((y.T @ np.log(h)) + ((1-y).T @ np.log(1-h)))

    # calculate gradient
    grad_J_theta = np.divide(np.matmul(x.T, (h - y)), m)

    theta = theta - alpha * grad_J_theta
    print(f'Epoch: {i}/{epochs}\tLoss: {J}')
  return J, theta

In [66]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradient_descent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f'The cost after training is {tmp_J}')
print(f'The resulting vector of weights is {tmp_theta}')

Epoch: 1/700	Loss: [[0.69314718]]
Epoch: 2/700	Loss: [[0.69305407]]
Epoch: 3/700	Loss: [[0.69296151]]
Epoch: 4/700	Loss: [[0.6928695]]
Epoch: 5/700	Loss: [[0.69277802]]
Epoch: 6/700	Loss: [[0.69268708]]
Epoch: 7/700	Loss: [[0.69259667]]
Epoch: 8/700	Loss: [[0.69250679]]
Epoch: 9/700	Loss: [[0.69241743]]
Epoch: 10/700	Loss: [[0.69232859]]
Epoch: 11/700	Loss: [[0.69224027]]
Epoch: 12/700	Loss: [[0.69215246]]
Epoch: 13/700	Loss: [[0.69206515]]
Epoch: 14/700	Loss: [[0.69197836]]
Epoch: 15/700	Loss: [[0.69189206]]
Epoch: 16/700	Loss: [[0.69180626]]
Epoch: 17/700	Loss: [[0.69172096]]
Epoch: 18/700	Loss: [[0.69163614]]
Epoch: 19/700	Loss: [[0.69155182]]
Epoch: 20/700	Loss: [[0.69146797]]
Epoch: 21/700	Loss: [[0.69138461]]
Epoch: 22/700	Loss: [[0.69130172]]
Epoch: 23/700	Loss: [[0.6912193]]
Epoch: 24/700	Loss: [[0.69113736]]
Epoch: 25/700	Loss: [[0.69105588]]
Epoch: 26/700	Loss: [[0.69097486]]
Epoch: 27/700	Loss: [[0.6908943]]
Epoch: 28/700	Loss: [[0.6908142]]
Epoch: 29/700	Loss: [[0.69073455]

## Part 2: Extracting the features

* Given a list of tweets, extract the features and store them in a matrix. You will extract two features.
    * The first feature is the number of positive words in a tweet.
    * The second feature is the number of negative words in a tweet. 
* Then train your logistic regression classifier on these features.
* Test the classifier on a validation set. 

### Instructions: Implement the extract_features function. 
* This function takes in a single tweet.
* Process the tweet using the imported `process_tweet()` function and save the list of tweet words.
* Loop through each word in the list of processed words
    * For each word, check the `freqs` dictionary for the count when that word has a positive '1' label. (Check for the key (word, 1.0)
    * Do the same for the count for when the word is associated with the negative label '0'. (Check for the key (word, 0.0).)


In [67]:
def extract_features(tweet, freqs):
  """
  Input: 
    tweet: a list of words for one tweet
    freqs: a dictionary corresponding to the frequencies of each tuple (word,label)
  
  Output:
    x: a feature vector of dimension (1,3)
  """

  # process the tweet
  words = process_tweet(tweet)

  x = np.zeros((1,3))

  # bias term is set to 1
  x[0,0] = 1

  for word in words:
    x[0,1] += freqs.get((word,1), 0)
    x[0,2] += freqs.get((word,0), 0)
  
  return x

In [68]:
# test 1
# test on on sample of training data
x = extract_features(X_train[0], freqs)
print(x)

[[1.00e+00 3.02e+03 6.10e+01]]


In [69]:
# test 2:
# check for when the words are not in the freqs dictionary
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


## Part 3: Training Your Model

To train the model:
* Stack the features for all training examples into a matrix `X`. 
* Call `gradientDescent`, which you've implemented above.

In [70]:
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
  X[i, :] = extract_features(X_train[i], freqs)

# training labels corresponding to X
y = y_train

In [79]:
# Apply gradient descent
J, theta = gradient_descent(X, y, np.zeros((3,1)), alpha=1e-9, epochs=1500)
print(f'The cost after training is {J}')
print(f'The resulting vector of weights is {theta}')

Epoch: 1/1500	Loss: [[0.69314718]]
Epoch: 2/1500	Loss: [[0.69207586]]
Epoch: 3/1500	Loss: [[0.69100782]]
Epoch: 4/1500	Loss: [[0.68994307]]
Epoch: 5/1500	Loss: [[0.6888816]]
Epoch: 6/1500	Loss: [[0.68782338]]
Epoch: 7/1500	Loss: [[0.6867684]]
Epoch: 8/1500	Loss: [[0.68571667]]
Epoch: 9/1500	Loss: [[0.68466816]]
Epoch: 10/1500	Loss: [[0.68362287]]
Epoch: 11/1500	Loss: [[0.68258078]]
Epoch: 12/1500	Loss: [[0.68154188]]
Epoch: 13/1500	Loss: [[0.68050616]]
Epoch: 14/1500	Loss: [[0.67947362]]
Epoch: 15/1500	Loss: [[0.67844423]]
Epoch: 16/1500	Loss: [[0.67741798]]
Epoch: 17/1500	Loss: [[0.67639488]]
Epoch: 18/1500	Loss: [[0.6753749]]
Epoch: 19/1500	Loss: [[0.67435803]]
Epoch: 20/1500	Loss: [[0.67334427]]
Epoch: 21/1500	Loss: [[0.6723336]]
Epoch: 22/1500	Loss: [[0.67132601]]
Epoch: 23/1500	Loss: [[0.67032148]]
Epoch: 24/1500	Loss: [[0.66932002]]
Epoch: 25/1500	Loss: [[0.6683216]]
Epoch: 26/1500	Loss: [[0.66732622]]
Epoch: 27/1500	Loss: [[0.66633387]]
Epoch: 28/1500	Loss: [[0.66534453]]
Epoch:

## Part 4: Test your logistic regression

It is time for you to test your logistic regression function on some new input that your model has not seen before. 

#### Instructions: Write `predict_tweet`
Predict whether a tweet is positive or negative.

* Given a tweet, process it, then extract the features.
* Apply the model's learned weights on the features to get the logits.
* Apply the sigmoid to the logits to get the prediction (a value between 0 and 1).

$$y_{pred} = sigmoid(\mathbf{x} \cdot \theta)$$

In [80]:
def predict_tweet(tweet, freqs, theta):
  """
  Input: 
    tweet: a string
    freqs: a dictionary corresponding to frequencies of each tuple (word, label)
    theta: vector of trained weights
  Output: 
    y_pred: probability of tweet being positive or negative
  """
  x = extract_features(tweet, freqs)

  y_pred = sigmoid(x @ theta)

  return y_pred

In [81]:
# Run this cell to test your function
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.518580
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530898
great great great -> 0.546273
great great great great -> 0.561561


In [82]:
my_tweet = 'I am learning :)'
predict_tweet(my_tweet, freqs, theta)

array([[0.8163643]])

## Check performance using the test set
After training your model using the training set above, check how your model might perform on real, unseen data, by testing it against the test set.

#### Instructions: Implement `test_logistic_regression` 
* Given the test data and the weights of your trained model, calculate the accuracy of your logistic regression model. 
* Use your `predict_tweet()` function to make predictions on each tweet in the test set.
* If the prediction is > 0.5, set the model's classification `y_hat` to 1, otherwise set the model's classification `y_hat` to 0.
* A prediction is accurate when `y_hat` equals `test_y`.  Sum up all the instances when they are equal and divide by `m`.


In [83]:
def test_logistic_regression(X_test, y_test, freqs, theta):
  y_hat = []

  for tweet in X_test:
    y_pred = predict_tweet(tweet, freqs, theta)

    if y_pred > 0.5:
      y_hat.append(1.0)
    else:
      y_hat.append(0)
    
  y_hat = np.array(y_hat)

  num_correct = (y_hat.squeeze() == y_test.squeeze()).sum()
  accuracy = num_correct / len(y_test)

  return accuracy

In [85]:
tmp_accuracy = test_logistic_regression(X_test, y_test, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [86]:
# Feel free to change the tweet below
my_tweet = 'It was good to see that the actors pulled it off.'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['good', 'see', 'actor', 'pull']
[[0.51960712]]
Positive sentiment
