In [1]:
import numpy as np
import pandas as pd
import timeit
import time
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

# Tweet Sentiment Prediction
Air Paradis has commissioned an AI product that anticipates bad buzz on social networks. The AI product can predict the sentiment associated with a tweet.
* Data description: information about tweets (user who posted, content, time of posting) and a binary label (tweet expressing a negative sentiment or not). 
* A functional prototype of the model. That sends a tweet and retrieves the sentiment prediction. 

In [None]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("kazanova/sentiment140")

# print("Path to dataset files:", path)

In [2]:
# 1. Get the file path and read the file
file_path = 'C:/Users/spectre/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2/training.1600000.processed.noemoticon.csv'
# read the datafile
df = pd.read_csv(file_path, 
                sep= ',',
                on_bad_lines='warn',
                parse_dates=True,
                encoding='latin-1',
                header=None,
                engine='python')
# add column titles
df.columns = ['polarity', 'id', 'date', 'query', 'user', 'text']
# select onlt the needed columns
df = df[['polarity', 'text']]

# The Dataset 
The sentiment140 dataset contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment.

In [3]:
# 2. Covert polarity from 0,4 to 0,1 binary values
df['target'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)
# count the number of occurance for each sentiment class
print(f'polarity counts: {df.polarity.value_counts()}\nTarget counts: {df.target.value_counts()}')

polarity counts: polarity
0    800000
4    800000
Name: count, dtype: int64
Target counts: target
0    800000
1    800000
Name: count, dtype: int64


# Preprocess the text
Convert text to lower case, remove contractions, urls, usernames, digits, punctuations stopwords, tokenize corpus and lemmatize.

In [4]:
# 3. Preprocess the text 
df = df[['target','text']]

# transform corpus to lowercase
df['text'] = df['text'].str.lower()

# Change contractions so that tokenization can work right
import contractions
df['text'] = df['text'].apply(lambda x: ' '.join([contractions.fix(word) for word in x.split()]))

# Remove URLs
df['text'] = df['text'].str.replace(r'http[s]?://\S+|www.\S+', '', regex=True)

# Remove all @s (usernames)
df['text'] = df['text'].str.replace(r'@\S+', '', regex=True)

# Remove hashtags
df['text'] = df['text'].str.replace(r'#\S+','', regex=True)

# Remove digits
df['text'] = df['text'].replace(r'\d+', '', regex=True)
    
# Remove Punctuation
df['text'] = df['text'].str.replace(r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]','', regex=True)

In [5]:
# 4. Lemmatize the text
# Lemmatize the corpus
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

# function to get a language detector
def get_lang_detector(nlp, name):
    return LanguageDetector()

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

Language.factory("language_detector", func=get_lang_detector) # load the language detector
nlp.add_pipe('language_detector', last=True)

def lemmatize(text):

   doc = nlp(text)

   tokens = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]

   return ' '.join(tokens)

In [7]:
# Apply the  lemmatize()  function to the whole cleaned corpus with:
df['clean_text'] = df.text.apply(lambda txt : lemmatize(txt))

In [25]:
# 4. Tokenize the tweets
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, split=' ')
tokenizer.fit_on_texts(df['clean_text'].values)

In [26]:
# 5. Split data into label, features, traing and testing sets
from sklearn.model_selection import train_test_split
X = df.clean_text
y = df.target
# Split X and y into training and testing sets
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)
# validation set
Xvalid, yvalid = Xtrain[:64], ytrain[:64]

In [28]:
# 6. Vectorize the tweets and pad to same length
Xtrain_ = tokenizer.texts_to_sequences(Xtrain.values)
Xtrain_ = pad_sequences(Xtrain_)
X_val = tokenizer.texts_to_sequences(Xvalid.values)
X_val = pad_sequences(X_val)
Xtest_ = tokenizer.texts_to_sequences(Xtest.values)
Xtest_ = pad_sequences(Xtest_)

In [29]:
[Xtrain_.shape,X_val.shape, Xtest_.shape]

[(1200000, 85), (64, 12), (400000, 38)]

In [30]:
# 7. Xtrain_ has shape (1200000, 38), X_val has shape (64, 25), and Xtest_ has (400000, 40) need to bring to same number of tweets by filling missing with 0s
import numpy as np
Xtest_ = np.lib.pad(Xtest_, ((0,0),(Xtrain_.shape[1] - Xtest_.shape[1],0)), 'constant', constant_values=(0))
X_val = np.lib.pad(X_val, ((0,0),(Xtrain_.shape[1] - X_val.shape[1],0)), 'constant', constant_values=(0))
[Xtrain_.shape,X_val.shape,Xtest_.shape]

[(1200000, 85), (64, 85), (400000, 85)]

In [31]:
print(f'Training text counts: {len(Xtrain_)}\n\
    Training Target counts: {ytrain.value_counts()}\n\
    --------------------------------------\n\
        Validating text counts: {len(X_val)}\n\
            Validating Target counts: {yvalid.value_counts()}\n\
                 --------------------------------------\n\
        Testing text counts: {len(Xtest_)}\n\
            Testing Target counts: {ytest.value_counts()}')

Training text counts: 1200000
    Training Target counts: target
0    600419
1    599581
Name: count, dtype: int64
    --------------------------------------
        Validating text counts: 64
            Validating Target counts: target
0    34
1    30
Name: count, dtype: int64
                 --------------------------------------
        Testing text counts: 400000
            Testing Target counts: target
1    200419
0    199581
Name: count, dtype: int64


In [35]:
# 7. Train a logistic regression classifier
start_time = time.time()
from sklearn.linear_model import LogisticRegression
mod_logit = LogisticRegression()
mod_logit.fit(Xtrain_, ytrain)

print("Training the model took : %s seconds " % (time.time() - start_time))

Training the model took : 35.975924253463745 seconds 


In [None]:
# 8. Save the model
import pickle
import joblib 
# filename = 'logistic_model2.pkl'
# vecname = 'vectorizer.pkl'

# Save the trained model using joblib.
# joblib.dump(mod_logit, open(filename, 'wb'))

# Load the model 
# model = pickle.load(open(filename, 'rb')) 

In [37]:
# Function to predict and evaluate model
def val_test(model, x_train, y_train, x_test, y_test):
    from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
    from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, classification_report, confusion_matrix
    # generate predictions
    y_pred = model.predict(x_test)
    # predicted probabilities
    y_pred_prob = model.predict_proba(x_test)[::, 1]

    # Generate a confusion matrix for the model
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
        )

    # cross validation
    c_v = RepeatedStratifiedKFold(n_splits=10, n_repeats= 3, random_state=13)
    #crossvalidation score
    crosval = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=c_v, n_jobs=-1)

    # Print the classification report for the model
    print("Confusion Matrix")
    display(cm_df)

    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy Score : {accuracy_score(y_test, y_pred): .3f}, \
        Recall score: {recall_score(y_test, y_pred): .3f}")
    print(f'Area Under Curve: {roc_auc_score(y_test, model.predict_proba(x_test)[::,1]): .4f}, \
        Cross Validation Score: {crosval.mean():.3f}')

In [38]:
# 9. Make predictions and evaluate model
start_time = time.time()
val_test(mod_logit, Xtrain_, ytrain, Xtest_, ytest)
print("Validating the model took : %s seconds " % (time.time() - start_time))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,68798,130783
Actual 1,64108,136311


Classification Report
              precision    recall  f1-score   support

           0       0.52      0.34      0.41    199581
           1       0.51      0.68      0.58    200419

    accuracy                           0.51    400000
   macro avg       0.51      0.51      0.50    400000
weighted avg       0.51      0.51      0.50    400000

Accuracy Score :  0.513,         Recall score:  0.680
Area Under Curve:  0.5153,         Cross Validation Score: 0.513
Validating the model took : 653.8046953678131 seconds 


### Logistic_model (vectorized text)
Confusion Matrix

               Predicted 0	Predicted 1
Actual   0	         155679	      43902
Actual   1	         37085          163334

Classification Report

              precision    recall  f1-score   support

           0       0.81      0.78      0.79    199581
           1       0.79      0.81      0.80    200419

    accuracy                           0.80    400000

   macro avg       0.80      0.80      0.80    400000
   
weighted avg       0.80      0.80      0.80    400000

Accuracy Score :  0.798,         

Recall score:  0.815

Area Under Curve:  0.8698,        

Cross Validation Score: 0.798

### Logistic_model1 (with lemmatized text)
Confusion Matrix

               Predicted 0	Predicted 1
Actual 0	      145984	      53597
Actual 1	      42385	         158034

Classification Report

              precision    recall  f1-score   support

           0       0.77      0.73      0.75    199581
           1       0.75      0.79      0.77    200419

    accuracy                           0.76    400000

   macro avg       0.76      0.76      0.76    400000

weighted avg       0.76      0.76      0.76    400000

Accuracy Score :  0.760,        

Recall score:  0.789

Area Under Curve:  0.8360,         

Cross Validation Score: 0.760

Validating the model took : 346.57127499580383 seconds 

### Logistic_model3 (with lemmatized and padded text)
Confusion Matrix

            Predicted 0	Predicted 1

Actual 0	   68798	   130783

Actual 1	   64108	   136311

Classification Report

              precision    recall  f1-score   support

           0       0.52      0.34      0.41    199581
           1       0.51      0.68      0.58    200419

    accuracy                           0.51    400000

   macro avg       0.51      0.51      0.50    400000

weighted avg       0.51      0.51      0.50    400000

Accuracy Score :  0.513,         

Recall score:  0.680

Area Under Curve:  0.5153

Cross Validation Score: 0.513

Validating the model took : 653.8046953678131 seconds

In [91]:
# 10. Supply Tweet and apply model to classify

def classify_tweet(model, vectorizer, tweet):
    tweet = str(tweet).lower().replace('#', '').replace('@', '').replace(r'http[s]?://\S+|www.\S+', '')
    message_vect = vectorizer.transform([tweet])
    prediction = model.predict(message_vect)[0]
    return 'Positive' if prediction == 1 else 'Negative'

# Example of using the function
message = 'The food and service was this data science article is the human subject service airplane africa'
classifier = pickle.load(open('logistic_model.pkl', 'rb')) 
vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))
print(f'Predicted Tweet Sentiment: {classify_tweet(classifier, vectorizer, message)}')

Predicted Tweet Sentiment: Negative


# Customized Advanced Model
## Recurrent Neural Networks
Unlike other Neural Networks that work without memory (ie.  every single input is processed independently with no relation with the other ones.), RNN's one advantage is its ability to remember. RNN process sequences by iterating along the sequence elements and keeping information relative to what it has processed so far.

There are three RNN types: a single LSTM (long short-term memory) model, a Bidirectional LSTM and the Conv1D model.

Working with Neural Networks, it is important to make all the inputs in a fixed size. To achieve this objective we will pad the tweet sentences.

# Gated Recurrent Units (GRU)
Gated Recurrent Unit (GRU) is a type of recurrent neural network (RNN) that was introduced by Cho et al. in 2014 as a simpler alternative to <i><u>Long Short-Term Memory (LSTM)</u></i> networks. Like LSTM, GRU can process sequential data such as text, speech, and time-series data. GRUs are lesser known but equally robust algorithms to solve the limitations of simple RNNs.

The basic idea behind GRU is to use *gating mechanisms* to selectively update the hidden state of the network at each time step. The gating mechanisms are used to control the flow of information in and out of the network. The GRU has two gating mechanisms, called the **reset gate** and the **update gate**.

The reset gate determines how much of the previous hidden state should be forgotten, while the update gate determines how much of the new input should be used to update the hidden state. The output of the GRU is calculated based on the updated hidden state.

To solve the *Vanishing-Exploding* gradients problem often encountered during the operation of a basic Recurrent Neural Network, many variations were developed. One of the most famous variations is the *Long Short Term Memory Network(LSTM)*. One of the lesser-known but equally effective variations is the *Gated Recurrent Unit Network(GRU)*. 

Unlike LSTM, it consists of only three gates and does not maintain an Internal Cell State. The information which is stored in the Internal Cell State in an LSTM

from (https://www.geeksforgeeks.org/gated-recurrent-unit-networks/#)

In [41]:
# 9. Setup the RNN model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GRU, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import ModelCheckpoint

start_time = time.time()

# Keeping a fixed length of all tweets to max 160 words
max_words = Xtrain_.shape[1]

# fixing every word's embedding size to be 32
embd_len = 128

# loss function
loss = 'binary_crossentropy'

# optimizer, using Adam due to robustness
adam = Adam(learning_rate=0.001)


gru_model = Sequential(name='CustomizedAdvancedModel')

# Add embedding layer
gru_model.add(Embedding(vocab_size,
                    embd_len,
                    input_length=max_words))

# add dropout layer
gru_model.add(SpatialDropout1D(0.3))

# add the GRU layer
gru_model.add(GRU(128,
              activation='tanh',
              return_sequences=False))

# add the output layer
gru_model.add(Dense(1, activation='sigmoid'))

# print model summary
print(gru_model.summary())

# Model Hyperparameters
gru_model.compile(
    loss=loss,
    optimizer=adam,
    metrics=['accuracy', AUC(name='auc')]
)

# Get weights
frequencies = pd.value_counts(ytrain)
weights = {0: frequencies.sum() / frequencies[0], 1: frequencies.sum() / frequencies[1]}

# Save the best model
cp = ModelCheckpoint("best_model1.h5",
                     monitor='val_accuracy',
                     verbose=1,
                     save_best_only=True,
                     mode='auto',
                     period=1,
                     save_weights_only=False)

# Train the GRU model
gru_model.fit(Xtrain_, ytrain,
          batch_size=64,
          epochs=5,
          verbose=1,
          validation_data=(X_val, yvalid),
          callbacks = [cp],
          class_weight = weights)

# Print model score on test data
print()
print(f'GRU model Score : {gru_model.evaluate(Xtest_, ytest, verbose =0)}')
print("Training the model took : %s seconds " % (time.time() - start_time))

Model: "CustomizedAdvancedModel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 85, 128)           640000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 85, 128)          0         
 ropout1D)                                                       
                                                                 
 gru (GRU)                   (None, 128)               99072     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 739,201
Trainable params: 739,201
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.68750, saving model to best_model1.h5
Epoc

In [13]:
print(f'GRU model Score : {gru_model.evaluate(Xtest_, ytest, verbose =0)}')

GRU model Score : [0.39110904932022095, 0.8224725127220154, 0.9047358632087708]


In [14]:
print("Training the model took : %s seconds " % (time.time() - start_time))

Training the model took : 9273.00593161583 seconds 


In [43]:
# 10a. Function to predict and evaluate model
def pred_test(model,x_test, y_test):
    from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, classification_report, confusion_matrix
    # generate predictions (probabilities)
    y_pred_ = model.predict(x_test)

    # predictions
    y_pred = (model.predict(x_test) > 0.5).astype(int)

    # Generate a confusion matrix for the model
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(
        cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
        )

    # Print the classification report for the model
    print("Confusion Matrix")
    display(cm_df)

    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy Score : {accuracy_score(y_test, y_pred): .3f}, \
        Recall score: {recall_score(y_test, y_pred): .3f}")
    print(f'Area Under Curve: {roc_auc_score(y_test, y_pred_): .4f}')

In [32]:
# 10b. Make predictions and evaluate model
# load the best model
from tensorflow.keras.models import load_model

best_model = load_model('best_model.h5')
pred_test(best_model, Xtest_, ytest)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,164858,34723
Actual 1,36484,163935


Classification Report
              precision    recall  f1-score   support

           0       0.82      0.83      0.82    199581
           1       0.83      0.82      0.82    200419

    accuracy                           0.82    400000
   macro avg       0.82      0.82      0.82    400000
weighted avg       0.82      0.82      0.82    400000

Accuracy Score :  0.822,         Recall score:  0.818
Area Under Curve:  0.9047


In [28]:
print(f' Positive predicted positive: {166119/200419}\nNegative predicted negative: {162870/199581}')

 Positive predicted positive: 0.8288585413558595
Negative predicted negative: 0.8160596449561832


In [44]:
# 10b. Make predictions and evaluate model
# load the best model
from tensorflow.keras.models import load_model

best_model = load_model('best_model1.h5')
pred_test(best_model, Xtest_, ytest)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,147686,51895
Actual 1,41482,158937


Classification Report
              precision    recall  f1-score   support

           0       0.78      0.74      0.76    199581
           1       0.75      0.79      0.77    200419

    accuracy                           0.77    400000
   macro avg       0.77      0.77      0.77    400000
weighted avg       0.77      0.77      0.77    400000

Accuracy Score :  0.767,         Recall score:  0.793
Area Under Curve:  0.8503


In [108]:
# 11a. Function to make predictions

def predict_sentiment(model, tweet):
    # sentiment = ['Negative','Positive']
    max_len = 85 #40
    sequence = tokenizer.texts_to_sequences([tweet])
    text = pad_sequences(sequence, maxlen=max_len)
    # sent = sentiment[np.around(model.predict(text), decimals=0).argmax(axis=1)[0]]
    prediction = (model.predict(text) > 0.5).astype(int)
    return 'Positive' if prediction == 1 else 'Negative'

In [114]:
# 11b. Supply Tweet and apply model to classify
model = load_model('best_model1.h5')
message = 'The food and service was a   and this data science article is the good human subject service superb airplane africa'
max_words = 85 
print(f'{message}. The predicted sentiment is [{predict_sentiment(model, message)}]')

The food and service was a   and this data science article is the good human subject service superb airplane africa. The predicted sentiment is [Positive]


# Sentiment Analysis APP
Sentiment analysis, also known as opinion mining involves the use of computational methods to determine the sentiment or emotional tone expressed in a piece of text, such as positive, negative, or neutral. 
Sentiment Analysis is a natural language processing (often referred to as NLP) and machine learning to interpret and classify emotions in text information.
Sentiment analysis plays a crucial role in various industries as companies can use this analysis to improve user experience, product quality, brand reputation and generally the going concern of a business.

For this project, the core functionality is to predect the sentiment of a tet, review or document.

Sentiment prediction can be outlined as a process that follows the below steps/methods:
* Data/Input (Text): User can enter texts/sentences in the regular language format in the provided text area of the application.
* Language Detection: Before performing any analysis, we need to determine the language of the input text. (Process can be automated through use the Langdetect library). It detects the language of each text input and enables language-specific sentiment analysis.
* Sentiment Analysis: Once the language of the text is determined, we can proceed with sentiment analysis using the SentimentIntensityAnalyzer. The analyzer calculates sentiment scores for each text, including a compound score that represents the overall sentiment. Positive scores indicate positive sentiment, negative scores indicate negative sentiment, and scores around zero indicate neutrality.
* Thresholds and Sentiment Categorization: To categorize the sentiment, we define threshold values for positive and negative sentiments. The user can adjust these thresholds using the number inputs in the application. Sentiment scores above the positive threshold are considered positive, scores below the negative threshold are considered negative, and scores in between are classified as neutral.
* Sentiment Labels and Confidence Scores: Based on the sentiment scores and the defined thresholds, the sentiment of each text is determined as positive, negative, or neutral. Additionally, we calculate a confidence score, which represents the intensity of the sentiment. The confidence score is derived from the compound score and is expressed as a percentage.
* Tabular Representation: The sentiment predictions, along with the original texts and confidence scores, are organized into a DataFrame using the Pandas library. This tabular representation provides a clear overview of the sentiment analysis results.
* Visualization: To enhance the understanding of sentiment distribution, we create a pie chart using the Matplotlib library. The pie chart illustrates the proportion of positive, negative, and neutral sentiments in the analyzed texts.