In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# !pip install contractions
# !pip install emoji

In [3]:
# Load the training tweets, keep only the text and sentiment-label columns and
# drop incomplete rows. The non-inplace chain builds a fresh frame instead of
# mutating a column subset of the CSV frame in place.
df = (
    pd.read_csv("train.csv")[['full_text', 'Sentiment Analysis (Label)']]
    .rename(columns={'Sentiment Analysis (Label)': 'label'})
    .dropna()
)
# Number of tweets per sentiment label
df.groupby('label').count()

Unnamed: 0_level_0,full_text
label,Unnamed: 1_level_1
Negative,1376
Neutral,3853
Positive,1769


In [4]:
# Subjectivity target: opinionated tweets (Negative / Positive) -> 1, Neutral -> 0
label_mapping = {'Negative': 1, 'Positive': 1, 'Neutral': 0}
df['subjectivity'] = df['label'].map(label_mapping)
# .copy() makes this an independent frame, so columns assigned to it later are
# not written onto a slice of df (which raises SettingWithCopyWarning).
df_subjectivity = df[['full_text','subjectivity']].copy()
# Number of tweets per subjectivity class
df_subjectivity.groupby('subjectivity').count()

Unnamed: 0_level_0,full_text
subjectivity,Unnamed: 1_level_1
0,3853
1,3145


In [5]:
# Preview the first rows of the training frame (text + subjectivity target)
df_subjectivity.head()

Unnamed: 0,full_text,subjectivity
0,Thinking about a conspiraboomer paying some CO...,1
1,@aja9696 @civillibertari2 @POTUS That's litera...,1
2,@POTUS I have a question Mr Presidente If I g...,1
3,"@ziggystar18 @corybernardi Ironically, yes, be...",1
4,@dysclinic And many become worse after covid v...,1


In [6]:
# Load the evaluation tweets, keep only the text and sentiment-label columns and
# drop incomplete rows. The non-inplace chain builds a fresh frame instead of
# mutating a column subset of the CSV frame in place.
test_df = (
    pd.read_csv("eval.csv")[['full_text', 'Sentiment Analysis (Label)']]
    .rename(columns={'Sentiment Analysis (Label)': 'label'})
    .dropna()
)

In [7]:
# Subjectivity target: opinionated tweets (Negative / Positive) -> 1, Neutral -> 0
label_mapping = {'Negative': 1, 'Positive': 1, 'Neutral': 0}
test_df['subjectivity'] = test_df['label'].map(label_mapping)
# .copy() makes this an independent frame, so columns assigned to it later are
# not written onto a slice of test_df (which raises SettingWithCopyWarning).
subjectivity_test_df = test_df[['full_text','subjectivity']].copy()
# Number of tweets per subjectivity class
subjectivity_test_df.groupby('subjectivity').count()

Unnamed: 0_level_0,full_text
subjectivity,Unnamed: 1_level_1
0,486
1,514


### Preprocessing Tweets

We apply the following common tweet pre-processing steps:

1. Expand contractions
2. Map emojis to their word meanings
3. Remove mentions (@), hashtags (#), links, punctuation and numbers, as we prioritise the sentiment carried by the text, then lowercase the text
4. Lemmatization

In [8]:
import re
import contractions
import emoji
import nltk
# Download if u havent
#nltk.download('wordnet')
#nltk.download('punkt')
from nltk.stem import WordNetLemmatizer


In [9]:
def clean_text(text):
    """Normalise a raw tweet into a lowercase, lemmatized string.

    The tweet goes through, in order: contraction expansion, emoji-to-name
    conversion, removal of mentions / hashtags / digits / links / punctuation,
    lowercasing, and verb lemmatization of each whitespace-separated word.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    # Steps 1 + 2: expand contractions, then turn emojis into their text names
    expanded = emoji.demojize(contractions.fix(text))

    # Step 3: strip mentions, hashtags, digits, links and any remaining
    # punctuation, then lowercase what is left
    noise = r'@[A-Za-z0-9_]+|#[A-Za-z0-9]+|\d+|https?://\S+|[^\w\s]'
    stripped = re.sub(noise, '', expanded).lower()

    # Step 4: lemmatize every word, treating it as a verb
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token, pos="v") for token in stripped.split())

# Example usage: the mention, hashtag, numbers, link and punctuation are removed,
# contractions are expanded and the emoji is replaced by its name.
tweet = "@username #happybirthday! I can't believe it's 5 o'clock. 123 I'll be home soon https://facebook.com 🎉"
cleaned_tweet = clean_text(tweet)
print(cleaned_tweet)

i cannot believe it be of the clock i will be home soon party_popper


In [10]:
# Add the cleaned tweet text as a 'clean_text' column on both the training and
# the test frame
for tweets_frame in (df_subjectivity, subjectivity_test_df):
    tweets_frame['clean_text'] = tweets_frame['full_text'].apply(clean_text)

In [11]:
# Compare raw and cleaned text on the first training rows
df_subjectivity.head()

Unnamed: 0,full_text,subjectivity,clean_text
0,Thinking about a conspiraboomer paying some CO...,1,think about a conspiraboomer pay some covidpos...
1,@aja9696 @civillibertari2 @POTUS That's litera...,1,that be literally what the vaers be a cdc repo...
2,@POTUS I have a question Mr Presidente If I g...,1,i have a question mr presidente if i get the c...
3,"@ziggystar18 @corybernardi Ironically, yes, be...",1,ironically yes because australia have a good p...
4,@dysclinic And many become worse after covid v...,1,and many become worse after covid vaccines muc...


In [12]:
# Compare raw and cleaned text on the first test rows
subjectivity_test_df.head()

Unnamed: 0,full_text,subjectivity,clean_text
0,Here is today’s Idaho COVID-19 vaccine data at...,0,here be todays idaho covid vaccine data at a g...
1,Health care worker dies after second dose of C...,1,health care worker die after second dose of co...
2,Boris Johnson: Emergency Services to receive C...,0,boris johnson emergency service to receive cov...
3,Think about it.... The same Liberal government...,1,think about it the same liberal government tha...
4,South Africa receives its first anti-Covid vac...,0,south africa receive its first anticovid vaccines


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize

### Subjectivity Classification

In [14]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np


def validation(pipeline):
    """Cross-validate ``pipeline`` on the training tweets and score it on the test set.

    Reads the notebook-level frames ``df_subjectivity`` (training) and
    ``subjectivity_test_df`` (test), using their ``clean_text`` column as input
    and ``subjectivity`` as target.

    The pipeline is first evaluated with a shuffled 5-fold split (seed 42);
    accuracy and the support-weighted precision, recall and F1 of every fold
    are averaged. It is then refit on the whole training frame and scored on
    the test frame. All eight numbers are printed; nothing is returned.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        and left fitted on the full training frame.
    """
    print(pipeline)

    texts = df_subjectivity['clean_text']
    labels = df_subjectivity['subjectivity']

    def weighted_scores(y_true, y_pred):
        # Accuracy plus the 'weighted avg' row of the classification report
        weighted = classification_report(y_true, y_pred, output_dict=True)['weighted avg']
        return {
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': weighted['precision'],
            'Recall': weighted['recall'],
            'F1-Score': weighted['f1-score'],
        }

    # K-Fold cross validation: collect the per-fold value of every metric
    fold_scores = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': []}
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, val_index in splitter.split(df_subjectivity):
        pipeline.fit(texts.iloc[train_index], labels.iloc[train_index])
        fold_predictions = pipeline.predict(texts.iloc[val_index])
        for metric, value in weighted_scores(labels.iloc[val_index], fold_predictions).items():
            fold_scores[metric].append(value)

    # Refit on the full training frame before scoring the held-out test set
    pipeline.fit(texts, labels)
    test_predictions = pipeline.predict(subjectivity_test_df['clean_text'])
    test_scores = weighted_scores(subjectivity_test_df['subjectivity'], test_predictions)

    for metric, values in fold_scores.items():
        print(f"Mean K-Fold {metric}:", np.mean(values))

    for metric, value in test_scores.items():
        print(f"Test {metric}:", value)

### Pipelines using unigrams

In [15]:
# Unigram pipelines: every combination of {CountVectorizer, TfidfVectorizer}
# with {Logistic Regression, SVM, Naive Bayes, Random Forest}. All vectorizers
# tokenize with NLTK's word_tokenize. The Random Forests are seeded (same seed
# as the K-Fold split in validation) so their metrics are reproducible.

# --- CountVectorizer ---
lr_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000))
])

svm_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),
    ('classifier', SVC())
])

nb_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),
    ('classifier', MultinomialNB())
])

rf_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# --- TfidfVectorizer ---
lr_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000))
])

svm_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', SVC())
])

nb_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', MultinomialNB())
])

rf_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [16]:
# Evaluate every unigram pipeline (CountVectorizer and TF-IDF variants) with
# validation(); a blank line separates the reports.
cv_pipelines = [lr_cv_pipeline, svm_cv_pipeline, nb_cv_pipeline, rf_cv_pipeline,
                lr_tfidf_pipeline, svm_tfidf_pipeline, nb_tfidf_pipeline, rf_tfidf_pipeline]

for pipeline in cv_pipelines:
    validation(pipeline)
    print()

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6854849382211784
Mean K-Fold Precision: 0.684201785085691
Mean K-Fold Recall: 0.6854849382211784
Mean K-Fold F1-Score: 0.683599645582267
Test Accuracy: 0.657
Test Precision: 0.6656068159600051
Test Recall: 0.657
Test F1-Score: 0.6544575120818843

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.6834831001735934
Mean K-Fold Precision: 0.6829064960651245
Mean K-Fold Recall: 0.6834831001735934
Mean K-Fold F1-Score: 0.6801079130598799
Test Accuracy: 0.665
Test Precision: 0.6787288760743747
Test Recall: 0.665
Test F1-Score: 0.660746355025606

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function word_tokenize at 

### Pipelines using n-grams(1,2) : Unigrams + Bigrams

In [17]:
# Unigram + bigram pipelines (ngram_range=(1, 2)): every combination of
# {CountVectorizer, TfidfVectorizer} with {Logistic Regression, SVM, Naive
# Bayes, Random Forest}. All vectorizers tokenize with NLTK's word_tokenize.
# The Random Forests are seeded (same seed as the K-Fold split in validation)
# so their metrics are reproducible.

# --- CountVectorizer ---
lr_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

svm_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', SVC())
])

nb_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

rf_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(random_state=42))
])

# --- TfidfVectorizer ---
lr_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

svm_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', SVC())
])

nb_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

rf_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [18]:
# Evaluate every unigram + bigram pipeline with validation(); a blank line
# separates the reports.
ngrams_pipelines = [lr_cv_ngram_pipeline, svm_cv_ngram_pipeline, nb_cv_ngram_pipeline, rf_cv_ngram_pipeline,
                   lr_tfidf_ngram_pipeline, svm_tfidf_ngram_pipeline, nb_tfidf_ngram_pipeline, rf_tfidf_ngram_pipeline]

for pipeline in ngrams_pipelines:
    validation(pipeline)
    print()

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6943448381496987
Mean K-Fold Precision: 0.6935730981523014
Mean K-Fold Recall: 0.6943448381496987
Mean K-Fold F1-Score: 0.6924997480130912
Test Accuracy: 0.689
Test Precision: 0.6970358707097526
Test Recall: 0.689
Test F1-Score: 0.6872099322209851

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.6853424895333402
Mean K-Fold Precision: 0.6847024652640417
Mean K-Fold Recall: 0.6853424895333402
Mean K-Fold F1-Score: 0.6817724509816206
Test Accuracy: 0.681
Test Precision: 0.6908324389971868
Test Recall: 0.681
Test F1-Score: 0.6785217850

### Pipelines using n-grams(2,2) : Bigrams (Tried, performs poorly and took a long time)

In [19]:
# # Define CountVectorizer pipeline for Logistic Regression with bigrams only
# lr_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define CountVectorizer pipeline for Support Vector Machine (SVM) with bigrams only
# svm_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', SVC())
# ])

# # Define CountVectorizer pipeline for Naive Bayes with bigrams only
# nb_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', MultinomialNB())
# ])

# # Define CountVectorizer pipeline for Random Forest with bigrams only
# rf_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', RandomForestClassifier())
# ])

# # Define TF-IDF vectorizer pipeline for Logistic Regression with bigrams only
# lr_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with bigrams only
# svm_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', SVC())
# ])

# # Define TF-IDF vectorizer pipeline for Naive Bayes with bigrams only
# nb_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', MultinomialNB())
# ])

# # Define TF-IDF vectorizer pipeline for Random Forest with bigrams only
# rf_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', RandomForestClassifier())
# ])

In [20]:
# ngrams_pipelines = [lr_cv_ngram_pipeline, svm_cv_ngram_pipeline, nb_cv_ngram_pipeline, rf_cv_ngram_pipeline,
#                    lr_tfidf_ngram_pipeline, svm_tfidf_ngram_pipeline, nb_tfidf_ngram_pipeline, rf_tfidf_ngram_pipeline]

# for pipeline in ngrams_pipelines:
#     validation(pipeline)
#     print()

### Pipelines using n-grams(2,3) : Bigrams + Trigrams (Tried, performs poorly and took a long time)

In [21]:
# # Define CountVectorizer pipeline for Logistic Regression with bigrams and trigrams
# lr_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define CountVectorizer pipeline for Support Vector Machine (SVM) with bigrams and trigrams
# svm_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', SVC())
# ])

# # Define CountVectorizer pipeline for Naive Bayes with bigrams and trigrams
# nb_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', MultinomialNB())
# ])

# # Define CountVectorizer pipeline for Random Forest with bigrams and trigrams
# rf_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', RandomForestClassifier())
# ])

# # Define TF-IDF vectorizer pipeline for Logistic Regression with bigrams and trigrams
# lr_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with bigrams and trigrams
# svm_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', SVC())
# ])

# # Define TF-IDF vectorizer pipeline for Naive Bayes with bigrams and trigrams
# nb_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', MultinomialNB())
# ])

# # Define TF-IDF vectorizer pipeline for Random Forest with bigrams and trigrams
# rf_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', RandomForestClassifier())
# ])

In [22]:
# ngrams_pipelines = [lr_cv_ngram_pipeline, svm_cv_ngram_pipeline, nb_cv_ngram_pipeline, rf_cv_ngram_pipeline,
#                    lr_tfidf_ngram_pipeline, svm_tfidf_ngram_pipeline, nb_tfidf_ngram_pipeline, rf_tfidf_ngram_pipeline]

# for pipeline in ngrams_pipelines:
#     validation(pipeline)
#     print()

### Selected three best models
Selected on the best Test Accuracy and Mean K-Fold Accuracy, with comparable Precision and Recall scores:
1. TfidfVectorizer (unigrams) + Logistic Regression
2. CountVectorizer (unigrams + bigrams) + Logistic Regression
3. TfidfVectorizer (unigrams + bigrams) + Logistic Regression

| Measure                                                      | TfidfVectorizer Unigrams + LR  | CountVectorizer Unigrams and Bigrams + LR | TfidfVectorizer Unigrams and Bigrams + LR |
|:-------------------------------------------------------------|:--------------------------------|:--------------------------------------------:|:-------------------------------------------:|
| Mean K-Fold Accuracy                                         | 0.6899                          | 0.6943                                       | 0.6909                                      |
| Mean K-Fold Precision                                       | 0.6892                          | 0.6936                                       | 0.6904                                      |
| Mean K-Fold Recall                                          | 0.6899                          | 0.6943                                       | 0.6909                                      |
| Mean K-Fold F1-Score                                        | 0.6873                          | 0.6925                                       | 0.6901                                      |
| Test Accuracy                                               | 0.688                           | 0.689                                        | 0.682                                       |
| Test Precision                                              | 0.6975                          | 0.6970                                       | 0.6862                                      |
| Test Recall                                                 | 0.688                           | 0.689                                        | 0.682                                       |
| Test F1-Score                                               | 0.6857                          | 0.6872                                       | 0.6813                                      |



In [23]:
# Fresh (unfitted) copies of the three selected Logistic Regression pipelines.

# TF-IDF over unigrams
lr_tfidf_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Raw counts over unigrams + bigrams
lr_cv_ngram_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# TF-IDF over unigrams + bigrams
lr_tfidf_ngram_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [24]:
# Re-run validation on the three selected pipelines, one report per model
validation(lr_tfidf_pipeline)
print()
validation(lr_cv_ngram_pipeline)
print()
validation(lr_tfidf_ngram_pipeline)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6899147350148065
Mean K-Fold Precision: 0.6892420830589006
Mean K-Fold Recall: 0.6899147350148065
Mean K-Fold F1-Score: 0.687304330929749
Test Accuracy: 0.688
Test Precision: 0.6975497188834479
Test Recall: 0.688
Test F1-Score: 0.6857418939783839

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6943448381496987
Mean K-Fold Precision: 0.6935730981523014
Mean K-Fold Recall: 0.6943448381496987
Mean K-Fold F1-Score: 0.6924997480130912
Test Accuracy: 0.689
Test Precision: 0.6970358707097526
Test Recall: 0.689
Test F1-Score: 0.6872099322209851

Pipeline(steps=[('

### Enhancements
1. NER and WSD (WSD was applied using the Lesk algorithm, but it takes too long to train, so it is not used)
2. Ensemble model
3. Grid Search

In [25]:
# Uncomment the download if you have not downloaded
# nltk.download('maxent_ne_chunker')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('words')

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.chunk import ne_chunk
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function for named entity recognition (NER)
def extract_entities(text):
    """Return the named entities that NLTK's chunker finds in ``text``.

    The text is word-tokenized, POS-tagged and chunked with ``ne_chunk``.
    Every chunk that carries a ``label`` attribute is kept, and its leaf
    words are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    list of str
        One string per labelled chunk, in order of appearance.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return [
        ' '.join(leaf[0] for leaf in subtree.leaves())
        for subtree in ne_chunk(tagged_tokens)
        if hasattr(subtree, 'label')  # only labelled chunks are entities
    ]

In [27]:
# Baseline model without innovations: TF-IDF over word tokens + Logistic Regression
baseline_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Model with named entity recognition (NER): the vectorizer's tokens are the
# entities returned by extract_entities instead of individual words. The
# function is passed directly (no lambda wrapper), which keeps the pipeline
# picklable and its printed repr readable.
# NOTE(review): validation() feeds this model the lowercased 'clean_text'
# column; NLTK's chunker presumably relies on capitalisation, so few entities
# may be found -- confirm before drawing conclusions about NER.
ner_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=extract_entities)),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [28]:
# Re-apply the warning filter (already set in the first cell) before the runs
import warnings
warnings.filterwarnings("ignore")
# Compare the TF-IDF baseline against the NER-tokenized model
validation(baseline_model)
print()
validation(ner_model)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6899147350148065
Mean K-Fold Precision: 0.6892420830589006
Mean K-Fold Recall: 0.6899147350148065
Mean K-Fold F1-Score: 0.687304330929749
Test Accuracy: 0.688
Test Precision: 0.6975497188834479
Test Recall: 0.688
Test F1-Score: 0.6857418939783839

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function <lambda> at 0x000002C227D34E00>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.5505871540896559
Mean K-Fold Precision: 0.30343878136133373
Mean K-Fold Recall: 0.5505871540896559
Mean K-Fold F1-Score: 0.3911644741820092
Test Accuracy: 0.486
Test Precision: 0.236196
Test Recall: 0.486
Test F1-Score: 0.3178950201884253


### Results of baseline and NER models.
1. The NER model performs poorly, so it will be excluded from the ensemble model.

### Ensemble model
1. TfidfVectorizer (unigrams) + Logistic Regression
2. CountVectorizer (unigrams + bigrams) + Logistic Regression
3. TfidfVectorizer (unigrams + bigrams) + Logistic Regression

In [29]:
from sklearn.ensemble import VotingClassifier

# TF-IDF (unigrams) + Logistic Regression
baseline_model = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# CountVectorizer (unigrams + bigrams) + Logistic Regression
lr_cv_ngram_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# TF-IDF (unigrams + bigrams) + Logistic Regression
lr_tfidf_ngram_model = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Ensemble of the three pipelines; each member casts one vote per tweet
ensemble_members = [
    ('baseline', baseline_model),
    ('lrcv', lr_cv_ngram_model),
    ('lrtfidf', lr_tfidf_ngram_model),
]
ensemble_model = VotingClassifier(
    estimators=ensemble_members,
    voting='hard',  # can be changed to 'soft' if soft voting is preferred
)

In [30]:
# Report the three member models individually, then the voting ensemble
validation(baseline_model)
print()
validation(lr_cv_ngram_model)
print()
validation(lr_tfidf_ngram_model)
print()
validation(ensemble_model)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6899147350148065
Mean K-Fold Precision: 0.6892420830589006
Mean K-Fold Recall: 0.6899147350148065
Mean K-Fold F1-Score: 0.687304330929749
Test Accuracy: 0.688
Test Precision: 0.6975497188834479
Test Recall: 0.688
Test F1-Score: 0.6857418939783839

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000002C204762340>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.6943448381496987
Mean K-Fold Precision: 0.6935730981523014
Mean K-Fold Recall: 0.6943448381496987
Mean K-Fold F1-Score: 0.6924997480130912
Test Accuracy: 0.689
Test Precision: 0.6970358707097526
Test Recall: 0.689
Test F1-Score: 0.6872099322209851

Pipeline(steps=[('

### Conclusion of ensemble model
1. The ensemble model has the highest test metrics, and its K-Fold metrics are on par with the best single model.
2. This shows a modest improvement over using a single model (test accuracy 0.691 vs. 0.689 for the best single model).

| Measure              | Baseline (LR) | CV LR Ngrams      | TFIDF LR Ngrams     | Ensemble (Voting) |
|:---------------------|:--------------:|:--------:|:--------:|:-----------------:|
| Mean K-Fold Accuracy | 0.6899         | 0.6943   | 0.6909   | 0.6942            |
| Mean K-Fold Precision| 0.6892         | 0.6936   | 0.6904   | 0.6935            |
| Mean K-Fold Recall   | 0.6899         | 0.6943   | 0.6909   | 0.6942            |
| Mean K-Fold F1-Score | 0.6873         | 0.6925   | 0.6901   | 0.6926            |
| Test Accuracy        | 0.688          | 0.689    | 0.682    | 0.691             |
| Test Precision       | 0.6975         | 0.6970   | 0.686    | 0.6988            |
| Test Recall          | 0.688          | 0.689    | 0.682    | 0.691             |
| Test F1-Score        | 0.6857         | 0.6872   | 0.681    | 0.6893            |


### Enhancement by grid search for Logistic Regression
As the grid search for the ensemble model takes too long, we only investigate it for Logistic Regression.

In [31]:
# Features are the cleaned tweet text, targets the 0/1 subjectivity label
X_train, y_train = df_subjectivity['clean_text'], df_subjectivity['subjectivity']
X_test, y_test = subjectivity_test_df['clean_text'], subjectivity_test_df['subjectivity']

In [32]:
from sklearn.model_selection import GridSearchCV

# Baseline: TF-IDF features (default n-gram range) -> Logistic Regression
baseline_model = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Raw token counts over unigrams and bigrams -> Logistic Regression
lr_cv_ngram_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# TF-IDF features over unigrams and bigrams -> Logistic Regression
lr_tfidf_ngram_model = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [33]:
# Search space for the pipeline's 'classifier' step (LogisticRegression):
# candidate C values and solver choices.
lr_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],
}

# 5-fold cross-validated grid search over the baseline TF-IDF pipeline
lr_grid_search = GridSearchCV(
    estimator=baseline_model,
    param_grid=lr_param_grid,
    cv=5,
)
lr_grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score
print("Best Parameters (LR baseline):", lr_grid_search.best_params_)
print("Best Score (LR baseline):", lr_grid_search.best_score_)

# Score the best configuration found by the search on the held-out test data
test_accuracy = lr_grid_search.score(X_test, y_test)
print("Test Accuracy (LR baseline):", test_accuracy)

Best Parameters (LR baseline): {'classifier__C': 1, 'classifier__solver': 'lbfgs'}
Best Score (LR baseline): 0.693770141938119
Test Accuracy (LR baseline): 0.688


In [34]:
# Search space for the pipeline's 'classifier' step (LogisticRegression):
# candidate C values and solver choices.
lr_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],
}

# 5-fold cross-validated grid search over the CountVectorizer (1, 2)-gram pipeline
lr_grid_search = GridSearchCV(
    estimator=lr_cv_ngram_model,
    param_grid=lr_param_grid,
    cv=5,
)
lr_grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score
print("Best Parameters (LR cv):", lr_grid_search.best_params_)
print("Best Score (LR cv):", lr_grid_search.best_score_)

# Score the best configuration found by the search on the held-out test data.
# NOTE(review): this test score is later used to decide whether to keep the
# tuned config, so it should not be read as an unbiased estimate.
test_accuracy = lr_grid_search.score(X_test, y_test)
print("Test Accuracy (LR cv):", test_accuracy)

Best Parameters (LR cv): {'classifier__C': 0.1, 'classifier__solver': 'lbfgs'}
Best Score (LR cv): 0.6944837128561218
Test Accuracy (LR cv): 0.674


In [35]:
# Search space for the pipeline's 'classifier' step (LogisticRegression):
# candidate C values and solver choices.
lr_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],
}

# 5-fold cross-validated grid search over the TF-IDF (1, 2)-gram pipeline
lr_grid_search = GridSearchCV(
    estimator=lr_tfidf_ngram_model,
    param_grid=lr_param_grid,
    cv=5,
)
lr_grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score
print("Best Parameters (LR tfidf):", lr_grid_search.best_params_)
print("Best Score (LR tfidf):", lr_grid_search.best_score_)

# Score the best configuration found by the search on the held-out test data
test_accuracy = lr_grid_search.score(X_test, y_test)
print("Test Accuracy (LR tfidf):", test_accuracy)

Best Parameters (LR tfidf): {'classifier__C': 10, 'classifier__solver': 'lbfgs'}
Best Score (LR tfidf): 0.6927703461656286
Test Accuracy (LR tfidf): 0.693


### Updated Ensemble Model

1. We will update the lr_tfidf_ngram model only. The optimized parameters for the baseline are the same as the original ones. As the test accuracy for lr_cv_ngram decreased with its tuned parameters, we will not use its updated config.

In [36]:
# Baseline: TF-IDF features (default n-gram range) -> Logistic Regression
baseline_model = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Raw token counts over unigrams and bigrams -> Logistic Regression
lr_cv_ngram_model = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# TF-IDF features over unigrams and bigrams -> Logistic Regression,
# using the C=10 / lbfgs setting reported by the grid search for this pipeline
lr_tfidf_ngram_model = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000, C=10, solver='lbfgs')),
])

# Combine the three pipelines with hard voting on their predicted labels
# (voting='soft' is the alternative setting).
ensemble_model = VotingClassifier(
    voting='hard',
    estimators=[
        ('baseline', baseline_model),
        ('lrcv', lr_cv_ngram_model),
        ('lrtfidf', lr_tfidf_ngram_model),
    ],
)

In [37]:
# Evaluate the updated ensemble with the `validation` helper defined in an
# earlier cell (presumably it reports the K-fold and test metrics — confirm).
validation(ensemble_model)

VotingClassifier(estimators=[('baseline',
                              Pipeline(steps=[('vectorizer',
                                               TfidfVectorizer(tokenizer=<function word_tokenize at 0x000002C204762340>)),
                                              ('classifier',
                                               LogisticRegression(max_iter=1000))])),
                             ('lrcv',
                              Pipeline(steps=[('vectorizer',
                                               CountVectorizer(ngram_range=(1,
                                                                            2),
                                                               tokenizer=<function word_tokenize at 0x000002C204762340>)),
                                              ('classifier',
                                               LogisticRegression(max_iter=1000))])),
                             ('lrtfidf',
                              Pipeline(steps=[('vectorizer

| Measure                | Original Ensemble       | Updated Ensemble     |
|------------------------|----------------|---------------|
| Mean K-Fold Accuracy   | 0.6942         | 0.7018        |
| Mean K-Fold Precision  | 0.6935         | 0.7013        |
| Mean K-Fold Recall     | 0.6942         | 0.7018        |
| Mean K-Fold F1-Score   | 0.6926         | 0.7009        |
| Test Accuracy          | 0.691          | 0.698         |
| Test Precision         | 0.6988         | 0.7039        |
| Test Recall            | 0.691          | 0.698         |
| Test F1-Score          | 0.6893         | 0.6969        |


### Conclusion
We will choose the Updated Ensemble, as it improves on all metrics.

In [38]:
import time
import pickle

# Time a full fit of the ensemble on the training split.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals, whereas
# time.time() is wall-clock time that can be coarse on some platforms and
# can jump if the system clock is adjusted mid-measurement.
start_time = time.perf_counter()

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# End the timer
end_time = time.perf_counter()

# Save the fitted model to a file (kept outside the timed region)
with open('subjectivity_model.pkl', 'wb') as f:
    pickle.dump(ensemble_model, f)

# Calculate the training time
training_time = end_time - start_time

print("Training time:", training_time, "seconds")
print("Training size:", len(X_train))
print("Training time per sample:", training_time/len(X_train), "seconds")
print("Train samples per second:", len(X_train) / training_time)
print()

# Start the timer for inference
start_time = time.perf_counter()

# Make predictions on the test data
predictions = ensemble_model.predict(X_test)

# End the timer for inference
end_time = time.perf_counter()

# Calculate the inference time
inference_time = end_time - start_time
print("Inference time:", inference_time, "seconds")
print("Inference size:", len(X_test))
print("Inference time per sample:", inference_time / len(X_test), "seconds")
print("Inference samples per second:", len(X_test) / inference_time)

Training time: 6.108870029449463 seconds
Training size: 6998
Training time per sample: 0.0008729451313874625 seconds
Train samples per second: 1145.5473706699022

Inference time: 0.3749995231628418 seconds
Inference size: 1000
Inference time per sample: 0.0003749995231628418 seconds
Inference samples per second: 2666.670057512992


In [39]:
# Later, when you want to use the model for prediction:
import re
import contractions
import emoji
from nltk.stem import WordNetLemmatizer
import pickle

# Load the fitted ensemble back from the pickle file written by the training cell.
# NOTE: unpickling can execute arbitrary code, so only load a
# 'subjectivity_model.pkl' produced by this notebook, never an untrusted file.
with open('subjectivity_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Decode output:
def decode(predictions):
    """Translate the first predicted label into a readable class name.

    Only ``predictions[0]`` is inspected: a value equal to 0 gives
    'Neutral'; anything else gives 'Opinionated'.
    """
    return 'Neutral' if predictions[0] == 0 else 'Opinionated'
    
# Preprocess tweet    
def preprocess(text):
    """Clean a raw tweet into the normalised string passed to the model.

    Steps, in order: expand contractions, replace emojis with their text
    names, strip mentions / hashtags / digits / links / remaining
    non-word characters, lowercase, then lemmatize every
    whitespace-separated token as a verb. The tokens are returned joined
    by single spaces.

    NOTE(review): presumably this must mirror the cleaning applied to the
    training data — verify against the training-time preprocessing cell.
    """
    expanded = contractions.fix(text)
    demojized = emoji.demojize(expanded)
    # A single pass removes every kind of unwanted span.
    noise_pattern = r'@[A-Za-z0-9_]+|#[A-Za-z0-9]+|\d+|https?://\S+|[^\w\s]'
    stripped = re.sub(noise_pattern, '', demojized).lower()
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token, pos="v") for token in stripped.split()
    )
    
# Sample tweets used to sanity-check the reloaded model
tweet_1 = 'The vaccine is bad, do not take it.'
tweet_2 = 'The vaccine is good for everyone even though it hurts badly, please take it.'
tweet_3 = 'Covid vaccine is available for everyone to take for free.'
tweet_4 = 'Covid vaccine uses mrna technology to combat the virus.'

# Clean each tweet, classify it with the loaded model, and print the raw
# tweet next to its decoded label.
for raw_tweet in (tweet_1, tweet_2, tweet_3, tweet_4):
    tweet = preprocess(raw_tweet)
    predictions = loaded_model.predict([tweet])
    print(raw_tweet, decode(predictions))

The vaccine is bad, do not take it. Opinionated
The vaccine is good for everyone even though it hurts badly, please take it. Opinionated
Covid vaccine is available for everyone to take for free. Neutral
Covid vaccine uses mrna technology to combat the virus. Neutral


### Time metrics for scalability
1. Training time
2. Inference time

| Metric                        | Value                        |
|:------------------------------|------------------------------|
| Training time (s)             | 6.1089                       |
| Training size                 | 6998                         |
| Training time per sample (s)  | 0.0009                       |
| Train samples per second      | 1145.5474                    |
| Inference time (s)            | 0.3750                       |
| Inference size                | 1000                         |
| Inference time per sample (s) | 0.0004                       |
| Inference samples per second  | 2666.6701                    |



The model scales well: training and inference are both extremely fast (about 1,146 and 2,667 samples per second, respectively).