<a href="https://www.kaggle.com/code/mahmoudsaeed99/sentiment-by-lr-implementation-code?scriptVersionId=106567325" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Brief Describtion

> **In this code i will implement `Logistic Regression algorithm` to predict Sentiment of the tweets**

> **First i will make the `sigmoid` function and `gradient discent` function to get the lowest cost**

# Import Libraries

In [1]:
import nltk 
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import numpy as np
import pandas as pd
import re
import string

from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

In [3]:
all_positive = twitter_samples.strings("positive_tweets.json")
all_negative = twitter_samples.strings("negative_tweets.json")

# Explore and preprocess

In [4]:
len(all_positive) , len(all_negative)

(5000, 5000)

In [5]:
#split data two pieces train and test
train_pos = all_positive[:4000]
test_pos = all_positive[4000:]
train_neg = all_negative[:4000]
test_neg = all_negative[4000:]

all_train = train_pos + train_neg
all_test = test_pos + test_neg

In [6]:
train_y = np.append(np.ones((len(train_pos),1)) , np.zeros((len(train_neg) , 1)) , axis = 0)
test_y = np.append(np.ones((len(test_pos) , 1)) , np.zeros((len(test_neg) , 1)) , axis = 0)

In [7]:
len(train_y) , len(test_y)

(8000, 2000)

In [8]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    lemma = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    # remove links
    tweet = re.sub(r'https?://[^\s\n\r]+' , '' , tweet)
    #remove hash
    tweet = re.sub(r'#' , '' , tweet)
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # tokenize tweets
    token = TweetTokenizer()
    tweet_token = token.tokenize(tweet)
    clean_tweets = []
    pos = nltk.pos_tag(tweet_token)
    for word , pos in zip(tweet_token,pos):
        if (word not in stop_words and word not in string.punctuation):
#             stem_word = stemmer.stem(word)
            lemma_word = lemma.lemmatize(word)
            if pos[1] == 'VBN':
                clean_tweets.append(stemmer.stem(lemma_word))
                continue
            clean_tweets.append(lemma_word)
    return clean_tweets        
            

In [9]:
process_tweet("i am mahmoud saeed graduated from faculty of computers and Artificial Intelligence")

['mahmoud',
 'saeed',
 'graduat',
 'faculty',
 'computer',
 'Artificial',
 'Intelligence']

In [10]:
def build_freq(tweets , ys):
    yslist = np.squeeze(ys).tolist()
    
    freqs = {}
    for y , tweet in zip(yslist , tweets):
        for word in process_tweet(tweet):
            pair = (word , y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs           
     
freqs =  build_freq(all_train ,train_y)   

### Sigmoid

In [11]:
def sigmoid(z):
    h = 1 / (1 + np.exp(-1 * z))
    return h

In [12]:
#test sigmoid function 
if sigmoid(0) == 0.5:
    print("success")
else:
    print("faild!")
    
if sigmoid(4.92) == 0.9927537604041685:
    print("success")
else:
    print("faild!")    

success
success


In [13]:
def gradientDescent(x , y , theta , alpha , num_iter):
    '''
        Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    '''
    for _ in range(num_iter):    
        z = np.dot(x,theta)
        h = sigmoid(z)
        J = -1/x.shape[0] * (np.dot(np.transpose(y) , np.log(h)) + np.dot(np.transpose(1 - y) , np.log(1 -h)))
        theta = theta - (alpha / x.shape[0]) * np.dot(np.transpose(x) , (h - y))
        
    J = float(J)
    return J , theta

In [14]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


### Extracting Feature

In [15]:
def extract_feature(tweet , freq , process_tweet = process_tweet):
    words = process_tweet(tweet)
    x = np.zeros((1,3))
    x[0,0] = 1
    for word in words:
        x[0,1] += freq[(word,1)] if (word,1) in freq else 0
        x[0,2] += freq[(word,0)] if (word,0) in freq else 0
        
    assert(x.shape == (1,3))
    return x

In [16]:

s = "mahmoud is a good person"
test = extract_feature(s,freqs)
print(test)

[[  1. 145.  77.]]


# Building model

### Trainig model

In [17]:
X = np.zeros((len(all_train),3))
for i in range(len(all_train)):
    X[i ,:] = extract_feature(all_train[i] , freqs)
    
Y = train_y
J , theta = gradientDescent(X , Y , np.zeros((3,1)) , 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")

The cost after training is 0.23108701.


### Test Model

$$y_{pred} = sigmoid(\mathbf{x} \cdot \theta)$$

In [18]:
def predict_tweet(tweet , freqs , theta):
    x = extract_feature(tweet , freqs)
    y_pred = sigmoid(np.dot(x , theta))
    return y_pred

### check accuracy

In [19]:
def get_accuracy(test_x , test_y , freqs , theta , predict_tweet = predict_tweet):
    y_hat = []
    
    for tweet in test_x:
        y_predict = predict_tweet(tweet , freqs , theta)
        if y_predict > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)
            
    accuracy = np.sum(test_y.reshape(1, test_y.shape[0]) == y_hat) / len(y_hat)
    
    return accuracy

In [20]:
get_accuracy(all_test ,test_y , freqs , theta)

0.9815

In [21]:
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['This', 'ridiculously', 'bright', 'movie', 'The', 'plot', 'terrible', 'I', 'sad', 'ending']
[[0.42112456]]
Negative sentiment
