In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# !pip install contractions
# !pip install emoji

In [3]:
df = pd.read_csv("train.csv")
df = df[['full_text', 'Sentiment Analysis (Label)']]
df.rename(columns = {'Sentiment Analysis (Label)': 'label'}, inplace = True)
df = df[df['label'] != 'Neutral']
df.reset_index(drop=True, inplace=True)
df.groupby('label').count()

Unnamed: 0_level_0,full_text
label,Unnamed: 1_level_1
Negative,1376
Positive,1769


In [4]:
label_mapping = {'Negative': 0, 'Positive': 1}
df['polarity'] = df['label'].map(label_mapping)
df_polarity = df[['full_text','polarity']]
df_polarity.groupby('polarity').count()

Unnamed: 0_level_0,full_text
polarity,Unnamed: 1_level_1
0,1376
1,1769


In [5]:
df_polarity.head()

Unnamed: 0,full_text,polarity
0,Thinking about a conspiraboomer paying some CO...,0
1,@aja9696 @civillibertari2 @POTUS That's litera...,1
2,@POTUS I have a question Mr Presidente If I g...,0
3,"@ziggystar18 @corybernardi Ironically, yes, be...",1
4,@dysclinic And many become worse after covid v...,0


In [6]:
test_df = pd.read_csv("eval.csv")
test_df = test_df[['full_text', 'Sentiment Analysis (Label)']]
test_df.rename(columns = {'Sentiment Analysis (Label)': 'label'}, inplace = True)
test_df = test_df[test_df['label'] != 'Neutral']
test_df.dropna(inplace=True)

In [7]:
label_mapping = {'Negative': 0, 'Positive': 1}
test_df['polarity'] = test_df['label'].map(label_mapping)
polarity_test_df = test_df[['full_text','polarity']]
polarity_test_df.groupby('polarity').count()

Unnamed: 0_level_0,full_text
polarity,Unnamed: 1_level_1
0,250
1,264


### Preprocessing Tweets

We will be using these commonly used ways to pre-process tweets

1. Expand contractions
2. Map emojis into its word meaning
3. Remove mentions(@), hashtags(#), punctuations and numbers as we prioritise sentiment from the text
4. Lemmatization

In [8]:
import re
import contractions
import emoji
from nltk.stem import WordNetLemmatizer

def clean_text(text):

    # Step 1: Expand contractions
    text = contractions.fix(text)
    
    # Step 2: Map emojis into its word meaning
    text = emoji.demojize(text)

    # Step 3: Remove mentions, hashtags, numbers and links
    pattern = r'@[A-Za-z0-9_]+|#[A-Za-z0-9]+|\d+|https?://\S+|[^\w\s]'
    text = re.sub(pattern, '', text)
    # put text to lowercase
    text = text.lower() 
    
    # Step 4: Lemmatize words
    wordnet_lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in words]
    text = ' '.join(words)
    
    return text

# Example usage:
tweet = "@username #happybirthday! I can't believe it's 5 o'clock. 123 I'll be home soon https://facebook.com 🎉"
cleaned_tweet = clean_text(tweet)
print(cleaned_tweet)

i cannot believe it be of the clock i will be home soon party_popper


In [9]:
df_polarity['clean_text'] = df_polarity['full_text'].apply(clean_text)
polarity_test_df['clean_text'] = polarity_test_df['full_text'].apply(clean_text)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize

### Subjectivity Classification

In [11]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np


def validation(pipeline):
    
    print(pipeline)
    
    # Initialize lists to store evaluation metrics
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # K-Fold Cross Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(df_polarity)):
            X_train_fold, X_val_fold = df_polarity['clean_text'].iloc[train_index], df_polarity['clean_text'].iloc[val_index]
            y_train_fold, y_val_fold = df_polarity['polarity'].iloc[train_index], df_polarity['polarity'].iloc[val_index]

            pipeline.fit(X_train_fold, y_train_fold)

            # Get predictions on validation fold
            predictions = pipeline.predict(X_val_fold)

            # Compute accuracy for this fold
            fold_accuracy = accuracy_score(y_val_fold, predictions)
            accuracies.append(fold_accuracy)
            
            # Compute precision, recall, and F1-score for this fold
            fold_report = classification_report(y_val_fold, predictions, output_dict=True)

            # Append precision, recall, and F1-score to respective lists
            precisions.append(fold_report['weighted avg']['precision'])
            recalls.append(fold_report['weighted avg']['recall'])
            f1_scores.append(fold_report['weighted avg']['f1-score'])

    # Calculate the mean accuracy, precision, recall, and F1-score across all folds
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1_score = np.mean(f1_scores)
    
    # Test against test set
    pipeline.fit(df_polarity['clean_text'], df_polarity['polarity'])
    
    # Get predictions on test set
    predictions = pipeline.predict(polarity_test_df['clean_text'])
    
    # Compute accuracy for test set
    test_accuracy = accuracy_score(polarity_test_df['polarity'], predictions)

    # Compute precision, recall, and F1-score for test set
    test_report = classification_report(polarity_test_df['polarity'], predictions, output_dict=True)

    print("Mean K-Fold Accuracy:", mean_accuracy)
    print("Mean K-Fold Precision:", mean_precision)
    print("Mean K-Fold Recall:", mean_recall)
    print("Mean K-Fold F1-Score:", mean_f1_score)
    
    print("Test Accuracy:", test_accuracy)
    print("Test Precision:", test_report['weighted avg']['precision'])
    print("Test Recall:", test_report['weighted avg']['recall'])
    print("Test F1-Score:", test_report['weighted avg']['f1-score'])

### Pipelines using unigrams

In [12]:
# Define count vectorizer pipeline for Logistic Regression
lr_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),  # Use NLTK's word_tokenize function
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define count vectorizer pipeline for Support Vector Machine (SVM)
svm_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),  # Use NLTK's word_tokenize function
    ('classifier', SVC())
])

# Define count vectorizer pipeline for Naive Bayes
nb_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),  # Use NLTK's word_tokenize function
    ('classifier', MultinomialNB())
])

# Define count vectorizer pipeline for Random Forest
rf_cv_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize)),  # Use NLTK's word_tokenize function
    ('classifier', RandomForestClassifier())
])

# Define TF-IDF vectorizer pipeline for Logistic Regression
lr_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM)
svm_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', SVC())
])

# Define TF-IDF vectorizer pipeline for Naive Bayes
nb_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', MultinomialNB())
])

# Define TF-IDF vectorizer pipeline for Random Forest
rf_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', RandomForestClassifier())
])


In [13]:
cv_pipelines = [lr_cv_pipeline, svm_cv_pipeline, nb_cv_pipeline, rf_cv_pipeline,
                lr_tfidf_pipeline, svm_tfidf_pipeline, nb_tfidf_pipeline, rf_tfidf_pipeline]

for pipeline in cv_pipelines:
    validation(pipeline)
    print()

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.7036565977742448
Mean K-Fold Precision: 0.7026243155986036
Mean K-Fold Recall: 0.7036565977742448
Mean K-Fold F1-Score: 0.7023988833002877
Test Accuracy: 0.669260700389105
Test Precision: 0.6692723246714242
Test Recall: 0.669260700389105
Test F1-Score: 0.6687685063909293

Pipeline(steps=[('vectorizer',
                 CountVectorizer(tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.6922098569157392
Mean K-Fold Precision: 0.6914550077352666
Mean K-Fold Recall: 0.6922098569157392
Mean K-Fold F1-Score: 0.6871883101623848
Test Accuracy: 0.6575875486381323
Test Precision: 0.6594247063529582
Test Recall: 0.6575875486381323
Test F1-Score: 0.6552323639488311

Pipeline(steps=[('vectorizer',
                 

### Pipelines using n-grams(1,2) : Unigrams + Bigrams

In [14]:
# Define CountVectorizer pipeline for Logistic Regression with n-grams (1, 2)
lr_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define CountVectorizer pipeline for Support Vector Machine (SVM) with n-grams (1, 2)
svm_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', SVC())
])

# Define CountVectorizer pipeline for Naive Bayes with n-grams (1, 2)
nb_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

# Define CountVectorizer pipeline for Random Forest with n-grams (1, 2)
rf_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier())
])
# Define TF-IDF vectorizer pipeline for Logistic Regression with n-grams (1, 2)
lr_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with n-grams (1, 2)
svm_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', SVC())
])

# Define TF-IDF vectorizer pipeline for Naive Bayes with n-grams (1, 2)
nb_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

# Define TF-IDF vectorizer pipeline for Random Forest with n-grams (1, 2)
rf_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier())
])

In [15]:
ngrams_pipelines = [lr_cv_ngram_pipeline, svm_cv_ngram_pipeline, nb_cv_ngram_pipeline, rf_cv_ngram_pipeline,
                   lr_tfidf_ngram_pipeline, svm_tfidf_ngram_pipeline, nb_tfidf_ngram_pipeline, rf_tfidf_ngram_pipeline]

for pipeline in ngrams_pipelines:
    validation(pipeline)
    print()

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.7236883942766295
Mean K-Fold Precision: 0.7226287155551465
Mean K-Fold Recall: 0.7236883942766295
Mean K-Fold F1-Score: 0.7219093869972267
Test Accuracy: 0.7217898832684825
Test Precision: 0.7217965366083615
Test Recall: 0.7217898832684825
Test F1-Score: 0.7215714657412097

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.6966613672496026
Mean K-Fold Precision: 0.6971693511455634
Mean K-Fold Recall: 0.6966613672496026
Mean K-Fold F1-Score: 0.6903543302216981
Test Accuracy: 0.669260700389105
Test Precision: 0.6736851775569656
Test Rec

### Pipelines using n-grams(2,2) : Bigrams (Tried, but perfromance is low so it is not considered)

In [16]:
# # Define CountVectorizer pipeline for Logistic Regression with bigrams only
# lr_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define CountVectorizer pipeline for Support Vector Machine (SVM) with bigrams only
# svm_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', SVC())
# ])

# # Define CountVectorizer pipeline for Naive Bayes with bigrams only
# nb_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', MultinomialNB())
# ])

# # Define CountVectorizer pipeline for Random Forest with bigrams only
# rf_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', RandomForestClassifier())
# ])

# # Define TF-IDF vectorizer pipeline for Logistic Regression with bigrams only
# lr_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with bigrams only
# svm_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', SVC())
# ])

# # Define TF-IDF vectorizer pipeline for Naive Bayes with bigrams only
# nb_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', MultinomialNB())
# ])

# # Define TF-IDF vectorizer pipeline for Random Forest with bigrams only
# rf_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 2))),
#     ('classifier', RandomForestClassifier())
# ])

In [17]:
# ngrams_pipelines = [lr_cv_ngram_pipeline, svm_cv_ngram_pipeline, nb_cv_ngram_pipeline, rf_cv_ngram_pipeline,
#                    lr_tfidf_ngram_pipeline, svm_tfidf_ngram_pipeline, nb_tfidf_ngram_pipeline, rf_tfidf_ngram_pipeline]

# for pipeline in ngrams_pipelines:
#     validation(pipeline)
#     print()

### Pipelines using n-grams(2,3) : Bigrams + Trigrams (Tried, but perfromance is low so it is not considered)

In [18]:
# # Define CountVectorizer pipeline for Logistic Regression with bigrams and trigrams
# lr_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define CountVectorizer pipeline for Support Vector Machine (SVM) with bigrams and trigrams
# svm_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', SVC())
# ])

# # Define CountVectorizer pipeline for Naive Bayes with bigrams and trigrams
# nb_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', MultinomialNB())
# ])

# # Define CountVectorizer pipeline for Random Forest with bigrams and trigrams
# rf_cv_ngram_pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', RandomForestClassifier())
# ])

# # Define TF-IDF vectorizer pipeline for Logistic Regression with bigrams and trigrams
# lr_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with bigrams and trigrams
# svm_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', SVC())
# ])

# # Define TF-IDF vectorizer pipeline for Naive Bayes with bigrams and trigrams
# nb_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', MultinomialNB())
# ])

# # Define TF-IDF vectorizer pipeline for Random Forest with bigrams and trigrams
# rf_tfidf_ngram_pipeline = Pipeline([
#     ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(2, 3))),
#     ('classifier', RandomForestClassifier())
# ])

In [19]:
# ngrams_pipelines = [lr_cv_ngram_pipeline, svm_cv_ngram_pipeline, nb_cv_ngram_pipeline, rf_cv_ngram_pipeline,
#                    lr_tfidf_ngram_pipeline, svm_tfidf_ngram_pipeline, nb_tfidf_ngram_pipeline, rf_tfidf_ngram_pipeline]

# for pipeline in ngrams_pipelines:
#     validation(pipeline)
#     print()

### Selected three best models
Based on Test Accuracy and Mean K-Fold Accuracy, and similar scores in Precision and Recall:
- 1. Tfidfvectorizer(Unigrams) + SVM
- 2. CountVectorizer(Unigrams + Bigrams) + Logistic Regression
- 3. TfidfVectorizer(Unigrams + Bigrams) + SVM

| Measure                | Tfidfvectorizer(Unigrams) + SVM | CountVectorizer(Unigrams + Bigrams) + Logistic Regression) | TfidfVectorizer(Unigrams + Bigrams) + SVM      |
|:-----------------------|:-------------------:|:-----------------------------:|:--------:|
| Mean K-Fold Accuracy   | 0.7202              | 0.7237                        | 0.7103   |
| Mean K-Fold Precision  | 0.7211              | 0.7226                        | 0.7246   |
| Mean K-Fold Recall     | 0.7202              | 0.7237                        | 0.7103   |
| Mean K-Fold F1-Score   | 0.7152              | 0.7219                        | 0.6950   |
| Test Accuracy          | 0.6887              | 0.7218                        | 0.6887   |
| Test Precision         | 0.6894              | 0.7218                        | 0.7025   |
| Test Recall            | 0.6887              | 0.7218                        | 0.6887   |
| Test F1-Score          | 0.6878              | 0.7216                        | 0.6812   |


In [20]:
# Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM)
svm_tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', SVC())
])

# Define CountVectorizer pipeline for Logistic Regression with n-grams (1, 2)
lr_cv_ngram_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with n-grams (1, 2)
svm_tfidf_ngram_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', SVC())
])


In [21]:
validation(svm_tfidf_pipeline)
print()
validation(lr_cv_ngram_pipeline)
print()
validation(svm_tfidf_ngram_pipeline)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.7201907790143084
Mean K-Fold Precision: 0.7211328432169456
Mean K-Fold Recall: 0.7201907790143084
Mean K-Fold F1-Score: 0.7151659876397811
Test Accuracy: 0.688715953307393
Test Precision: 0.6893815277493345
Test Recall: 0.688715953307393
Test F1-Score: 0.6877781938361852

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.7236883942766295
Mean K-Fold Precision: 0.7226287155551465
Mean K-Fold Recall: 0.7236883942766295
Mean K-Fold F1-Score: 0.7219093869972267
Test Accuracy: 0.7217898832684825
Test Precision: 0.7217965366083615
Test Recall: 0.7217898832684825
Test F1-Score: 0.7215714657412

### Enhancements
1. NER is not used as it does not perform well, WSD on every token takes too long to train due to large corpus.
2. Ensemble model
3. Grid Search

In [22]:
# Uncomment the download if you have not downloaded
# nltk.download('maxent_ne_chunker')

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.chunk import ne_chunk
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Ensemble model
- 1. Tfidfvectorizer(Unigrams) + SVM
- 2. TfidfVectorizer(Unigrams + Bigrams) + SVM
- 3. CountVectorizer(Unigrams + Bigrams) + Logistic Regression

In [24]:
# Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM)
baseline_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize)),
    ('classifier', SVC())
])

# Define TF-IDF vectorizer pipeline for Support Vector Machine (SVM) with n-grams (1, 2)
svm_model = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', SVC())
])

# Define CountVectorizer pipeline for Logistic Regression with n-grams (1, 2) (WSD)
lr_model = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

### Recap of Individual Scores

In [25]:
validation(baseline_model)
print()
validation(svm_model)
print()
validation(lr_model)
print()

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.7201907790143084
Mean K-Fold Precision: 0.7211328432169456
Mean K-Fold Recall: 0.7201907790143084
Mean K-Fold F1-Score: 0.7151659876397811
Test Accuracy: 0.688715953307393
Test Precision: 0.6893815277493345
Test Recall: 0.688715953307393
Test F1-Score: 0.6877781938361852

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', SVC())])
Mean K-Fold Accuracy: 0.7103338632750397
Mean K-Fold Precision: 0.7245600872567473
Mean K-Fold Recall: 0.7103338632750397
Mean K-Fold F1-Score: 0.6950501802513942
Test Accuracy: 0.688715953307393
Test Precision: 0.7024501751909613
Test Recall: 0.688715953307393
Test F1-Score: 0.6812102258263938

Pipeline(steps=[('vectori

In [26]:
from sklearn.ensemble import VotingClassifier

# Define the ensemble model using a VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('baseline', baseline_model),
    ('svm', svm_model),
     ('lr', lr_model),
    
], voting='hard')

validation(ensemble_model)

VotingClassifier(estimators=[('baseline',
                              Pipeline(steps=[('vectorizer',
                                               TfidfVectorizer(tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                                              ('classifier', SVC())])),
                             ('svm',
                              Pipeline(steps=[('vectorizer',
                                               TfidfVectorizer(ngram_range=(1,
                                                                            2),
                                                               tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                                              ('classifier', SVC())])),
                             ('lr',
                              Pipeline(steps=[('vectorizer',
                                               CountVectorizer(ngram_range=(1,
                                                                          

### Conclusion of ensemble model

1. The ensemble model is 2nd best in terms of Test metrics but loses to the Logistic Regression model for all metrics.
2. We will use the Logistic Regression model.

| Measure                | Baseline (SVM) | SVM      | Logistic Regression | Ensemble  |
|------------------------|----------------|----------|---------------------|-----------|
| Mean K-Fold Accuracy   | 0.7202         | 0.7103   | 0.7215              | 0.7189    |
| Mean K-Fold Precision  | 0.7211         | 0.7246   | 0.7204              | 0.7224    |
| Mean K-Fold Recall     | 0.7202         | 0.7103   | 0.7215              | 0.7189    |
| Mean K-Fold F1-Score   | 0.7152         | 0.6951   | 0.7193              | 0.7115    |
| Test Accuracy          | 0.6887         | 0.6887   | 0.7335              | 0.7062    |
| Test Precision         | 0.6894         | 0.7025   | 0.7337              | 0.7087    |
| Test Recall            | 0.6887         | 0.6887   | 0.7335              | 0.7062    |
| Test F1-Score          | 0.6878         | 0.6812   | 0.7331              | 0.7045    |




### Enhancement by grid search for CountVectorizer(Unigrams + Bigrams) + Logistic Regression
We will not use the ensemble model.

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
X_train = df_polarity['clean_text']
y_train = df_polarity['polarity']
X_test = polarity_test_df['clean_text']
y_test = polarity_test_df['polarity']

In [30]:
# Define CountVectorizer pipeline for Logistic Regression with n-grams (1, 2)
lr_model = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define the parameter grid for grid search for lr ngram model
lr_param_grid = {
    'classifier__C': [0.1, 1, 10],  # Parameters for logistic regression in lr_model
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag'],  # Solver options for logistic regression in lr_model
}

# Perform grid search for logistic Regression model
lr_grid_search = GridSearchCV(lr_model, lr_param_grid, cv=5)
lr_grid_search.fit(X_train, y_train)

# Get best parameters and score for SVM model
print("Best Parameters (LR):", lr_grid_search.best_params_)
print("Best Score (LR):", lr_grid_search.best_score_)

# Evaluate the best model found by grid search on test data
test_accuracy = lr_grid_search.score(X_test, y_test)
print("Test Accuracy (LR):", test_accuracy)

Best Parameters (LR): {'classifier__C': 10, 'classifier__solver': 'sag'}
Best Score (LR): 0.7100158982511924
Test Accuracy (LR): 0.7237354085603113


### Testing the updated LR model

In [31]:
# Original
lr_model = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])
validation(lr_model)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier', LogisticRegression(max_iter=1000))])
Mean K-Fold Accuracy: 0.7236883942766295
Mean K-Fold Precision: 0.7226287155551465
Mean K-Fold Recall: 0.7236883942766295
Mean K-Fold F1-Score: 0.7219093869972267
Test Accuracy: 0.7217898832684825
Test Precision: 0.7217965366083615
Test Recall: 0.7217898832684825
Test F1-Score: 0.7215714657412097


In [32]:
# Updated
lr_model = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000, C=10, solver='sag'))
])

validation(lr_model)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function word_tokenize at 0x000001D31662AAC0>)),
                ('classifier',
                 LogisticRegression(C=10, max_iter=1000, solver='sag'))])
Mean K-Fold Accuracy: 0.7208267090620032
Mean K-Fold Precision: 0.7200747479463524
Mean K-Fold Recall: 0.7208267090620032
Mean K-Fold F1-Score: 0.7195594034537685
Test Accuracy: 0.7237354085603113
Test Precision: 0.7237179111281777
Test Recall: 0.7237354085603113
Test F1-Score: 0.7235510310268908


### Conclusion
1. The updated model by grid search shows a slight increase performance in Test metrics but a slight decrease in performance for K-Fold metrics , this may be due to different shuffles in K-Fold used by grid search that resulted in different performance.
2. We will use the original model as it produces results with values closer to each other for K-Fold and Test-Metrics.

In [33]:
import time
import pickle

# Original
lr_model = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=word_tokenize, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Start the timer
start_time = time.time()

# Train the ensemble model
lr_model.fit(X_train, y_train)

# End the timer
end_time = time.time()

# Save the model to a file
with open('polarity_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)
    
# Calculate the training time
training_time = end_time - start_time

print("Training time:", training_time, "seconds")
print("Training size:", len(X_train))
print("Training time per sample:", training_time/len(X_train), "seconds")
print("Train samples per second:", len(X_train) / training_time)
print()

# Start the timer for inference
start_time = time.time()

# Make predictions on the test data
predictions = lr_model.predict(X_test)

# End the timer for inference
end_time = time.time()

# Calculate the inference time
inference_time = end_time - start_time
print("Inference time:", inference_time, "seconds")
print("Inference size:", len(X_test))
print("Inference time per sample:", inference_time / len(X_test), "seconds")
print("Inference samples per second:", len(X_test) / inference_time)

Training time: 1.148392915725708 seconds
Training size: 3145
Training time per sample: 0.00036514878083488333 seconds
Train samples per second: 2738.609718793475

Inference time: 0.07500004768371582 seconds
Inference size: 514
Inference time per sample: 0.00014591448965703467 seconds
Inference samples per second: 6853.328976104116


In [34]:
# Later, when you want to use the model for prediction:
import re
import contractions
import emoji
from nltk.stem import WordNetLemmatizer
import pickle

# Load the model from the file
with open('polarity_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Decode output:
def decode(predictions):
    if predictions[0] == 0:
           return 'Negative'
    else:
        return 'Positive'
    
# Preprocess tweet    
def preprocess(text):
    # Step 1: Expand contractions
    text = contractions.fix(text)
    # Step 2: Map emojis into its word meaning
    text = emoji.demojize(text)
    # Step 3: Remove mentions, hashtags, numbers and links
    pattern = r'@[A-Za-z0-9_]+|#[A-Za-z0-9]+|\d+|https?://\S+|[^\w\s]'
    text = re.sub(pattern, '', text)
    # put text to lowercase
    text = text.lower() 
    # Step 4: Lemmatize words
    wordnet_lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [wordnet_lemmatizer.lemmatize(word, pos="v") for word in words]
    text = ' '.join(words)
    return text

# Make predictions using the loaded model
tweet_1 = 'The vaccine is bad, do not take it.'
tweet = preprocess(tweet_1)
predictions = loaded_model.predict([tweet])
print(tweet_1, decode(predictions) )

tweet_2 = 'The vaccine is good for everyone even though it hurts badly, please take it.'
tweet = preprocess(tweet_2)
predictions = loaded_model.predict([tweet])
print(tweet_2, decode(predictions) )

The vaccine is bad, do not take it. Negative
The vaccine is good for everyone even though it hurts badly, please take it. Positive


### Time tests for scalability
1. Training time
2. Inference time

| Metric                        | Value                        |
|:------------------------------|:-----------------------------|
| Training time (s)             | 1.1484                       |
| Training size                 | 3145                         |
| Training time per sample (s)  | 0.0003651                    |
| Train samples per second      | 2738.61                      |
| Inference time (s)            | 0.0750                       |
| Inference size                | 514                          |
| Inference time per sample (s) | 0.0001459                    |
| Inference samples per second  | 6853.33                      |




It is very scalable with extremely fast training and inference.
