# Training a sentiment model based on 1.6 million tweets

For the Mood Lens project. Following guides, they will be cited at the bottom.

Contributors:
- Nicholas Sanaie 3/11/2004

External packages:
- pandas
- scikit-learn
- autocorrect

### Input Annotated Dataset

In [3]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

inputp = 'input'
outputp = 'outputs'

if not os.path.exists(inputp):
    os.mkdir(inputp)
if not os.path.exists(outputp):
    os.mkdir(outputp)

twitter_sentiment = 'input/training.1600000.processed.noemoticon.csv'

columns = ['sentiment', 'id', 'time', 'query', 'user', 'tweet']
df_tweets = pd.read_csv(twitter_sentiment, encoding='UTF', names=columns, encoding_errors='ignore')
df = df_tweets[['sentiment', 'tweet']].sample(n=10000, random_state=0)
df.to_csv("outputs/10000tweets.csv")

x = df.tweet.values
y = df.sentiment.replace(4, 1)

x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.35, random_state=0)

check the count of annotations. 0 = negative, 2 = neutral, 4 = postive

In [4]:
df.sentiment.value_counts()

sentiment
4    5046
0    4954
Name: count, dtype: int64

no neutral annoations! We can make this an binary classification :)

### Data cleaning

In [5]:
import re
from autocorrect import Speller
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
spell = Speller(lang='en')
lemm = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Nicholas
[nltk_data]     Sanaie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nicholas
[nltk_data]     Sanaie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
#Fixing Word Lengthening
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [7]:
def text_preprocess(doc):
    #Lowercasing all the letters
    temp = doc.lower()
    #Removing hashtags and mentions
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    #Removing links
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"www.\S+", "", temp)
    #removing numbers
    temp = re.sub("[0-9]","", temp)
    #Removing '
    temp = re.sub("'"," ",temp)

    #Tokenization
    temp = word_tokenize(temp)
    #Fixing Word Lengthening
    temp = [reduce_lengthening(w) for w in temp]
    #spell corrector
    temp = [spell(w) for w in temp]
    #stem
    temp = [lemm.lemmatize(w) for w in temp]
    #Removing short words
    temp = [w for w in temp if len(w)>2]
    temp = " ".join(w for w in temp)
    
    return temp

In [8]:
text_preprocess("I loooovvvvvvve @unchapelhill very mucchhhh buuuut I also like @dukkke")

'love very much but also like'

## Building the model

attempt using a naive bayes classifier

In [10]:
#naive bayes libraries
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier,accuracy

#build the dataset
def build_dataset(X,y):
    #build the dataset
    words = [text_preprocess(word).split(" ") for word in X]
    dataset = list(zip(words, y))
    
    return dataset

# Define the feature extractor with the 2000 most used words
all_words = FreqDist(sum([w.split(" ") for w in x_train],[]))
word_features = list(all_words)[:2000]
    
def document_features(words):
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in set(words))

    return features

trainset = build_dataset(x_train, y_train)
testset = build_dataset(x_validation, y_validation)
train_set = [(document_features(d), y) for (d,y) in trainset]
test_set = [(document_features(d), y) for (d,y) in testset]

#training
nb_classifier = NaiveBayesClassifier.train(train_set)

# Test the classifier
print("accuracy score on test set:", accuracy(nb_classifier, test_set))

accuracy score on test set: 0.7077142857142857


# BERT!

In [35]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
#load the model
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

#################text cleaning
def preprocess(X):
    import re
    def text_clean(text):
        temp = text.lower()
        temp = re.sub("@[A-Za-z0-9_]+","", temp)
        temp = re.sub("#[A-Za-z0-9_]+","", temp)
        temp = re.sub(r"http\S+", "", temp)
        temp = re.sub(r"www.\S+", "", temp)
        temp = re.sub("[0-9]","", temp)
        return temp
    X_cleaned = [text_clean(text) for text in X]
    return X_cleaned
############transforming raw data to an appropriate format ready to feed into the BERT model
def convert_example_to_feature(text):
    return bert_tokenizer.encode_plus(text,
            add_special_tokens = True, # add [CLS], [SEP]
            max_length = 128, # max length of the text that can go to BERT
            pad_to_max_length = True, # add [PAD] tokens
            return_attention_mask = True, # add attention mask to not focus on pad tokens
          )
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
    }, label
def encode_examples(X,y):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    for text, label in zip(X, y):
        bert_input = convert_example_to_feature(text)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

# train dataset
# x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.25, random_state=0)
ds_train_encoded = encode_examples(preprocess(x_train), y_train).shuffle(100).batch(64).repeat(4)
ds_val_encoded = encode_examples(preprocess(x_validation), y_validation).batch(64)
# test dataset
ds_test_encoded = encode_examples(preprocess(x_test), y_test).batch(64)

######### compiling the model
learning_rate = 3e-5
# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

#############training and evaluating
bert_model.fit(ds_train_encoded, epochs=4, validation_data=ds_val_encoded)

loss, acc = bert_model.evaluate(ds_test_encoded, verbose=0)
print("accuracy: {:5.2f}%".format(100 * acc))

##################Saving the model
bert_model.save_pretrained("outputs/bert_model", saved_model=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/4
 17/472 [>.............................] - ETA: 2:28:10 - loss: 0.6565 - accuracy: 0.6305

KeyboardInterrupt: 

In [1]:
import sys
print('python', sys.version)
import tensorflow as tf
print('tensorflow', tf.__version__)
import transformers
print('transformers', transformers.__version__)

python 3.9.1 (tags/v3.9.1:1e5d33e, Dec  7 2020, 17:08:21) [MSC v.1927 64 bit (AMD64)]
tensorflow 2.10.1
transformers 4.38.2
