In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import string

# download punctuation and stopwords from nltk
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nigel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nigel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nigel\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nigel\AppData\Roaming\nltk_data...


True

In [5]:
tweets_df = pd.read_csv("Resources/Tweets.csv")
# make sure the tweets in column "text" are strings
tweets_df['text'] = tweets_df['text'].astype('str')

# delete the unneccessary columns
tweets_df = tweets_df.drop(columns=["textID", "selected_text"])
tweets_df=tweets_df.rename(columns={'sentiment':'class'})
tweets_df=tweets_df[['text','class']]


In [6]:
def nb_process_tweets(tweet):
    # make the text all lowercase
    tweet = tweet.lower()
    
    # remove punctuation
    tweet = "".join(char for char in tweet if char not in string.punctuation)
  
    # remove urls
    tweet_wo_stop = "".join([i for i in tweet if 'http' not in i])
    
    
    # lemmatization
    lemm = WordNetLemmatizer()
    lemmed = [lemm.lemmatize(word) for word in tweet_wo_stop]
    
    # put string together
    final_tweet = "".join(lemmed)
    
    return final_tweet

In [7]:
tweets_df['text'] = tweets_df['text'].apply(lambda x: nb_process_tweets(x))
tweets_df

Unnamed: 0,text,class
0,id have responded if i were going,neutral
1,sooo sad i will miss you here in san diego,negative
2,my boss is bullying me,negative
3,what interview leave me alone,negative
4,sons of why couldnt they put them on the rel...,negative
...,...,...
27476,wish we could come see u on denver husband l...,negative
27477,ive wondered about rake to the client has ma...,negative
27478,yay good for both of you enjoy the break you...,positive
27479,but it was worth it,positive


In [8]:
dict_sentiment = {'positive': 2, 'neutral': 0, 'negative': 1}
tweets_df['class'] = tweets_df['class'].apply(lambda x: dict_sentiment.get(x))
targets=pd.get_dummies(tweets_df,prefix="",prefix_sep='',columns=['class'])

In [9]:
train = targets.sample(frac=0.8)
test=targets.drop(train.index)

In [12]:
from simpletransformers.classification import ClassificationModel
model = ClassificationModel('bert', 'bert-base-uncased', use_cuda=False,args={
                                                                                             'reprocess_input_data': True,
                                                                                             'overwrite_output_dir': True,
                                                                                             'fp16': False,
                                                                                             'do_lower_case': False,
                                                                                             'num_train_epochs': 2,
                                                                                             
                                                                                             'regression': False,
                                                                                      
                                                                                             "learning_rate":4e-5,
                                                                                             'weight_decay':0.0,
                                                                                             "save_eval_checkpoints": False,
                                                                                             "save_model_every_epoch": False,
                                                                                             "silent": False})

model.train_model(train)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]



  0%|          | 0/21985 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/2749 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/2749 [00:00<?, ?it/s]

(5498, 0.4416525525763555)

In [21]:
from sklearn.metrics import classification_report, accuracy_score
test_result, test_model_outputs, test_wrong_predictions = model.eval_model(test,acc=accuracy_score)



  0%|          | 0/5496 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/687 [00:00<?, ?it/s]

In [22]:
print(test_result)

{'mcc': 0.5807640123331936, 'tp': 1630, 'tn': 2762, 'fp': 515, 'fn': 589, 'auroc': 0.8743084628646844, 'auprc': 0.8265223972774384, 'acc': 0.7991266375545851, 'eval_loss': 0.5038491542704587}


# Optimized
Removing neutral sentiment

In [24]:
optimized_df = pd.read_csv("Resources/Tweets.csv")
optimized_df=optimized_df.loc[optimized_df['sentiment']!="neutral"]
# optimized_df=optimized_df.dropna()
optimized_df
optimized_df['text'] = optimized_df['text'].astype('str')

# delete the unneccessary columns
optimized_df = optimized_df.drop(columns=["textID", "selected_text"])
optimized_df=optimized_df.rename(columns={'sentiment':'class'})
optimized_df=optimized_df[['text','class']]
optimized_df['text'] = optimized_df['text'].apply(lambda x: nb_process_tweets(x))
optimized_df
dict_sentiment = {'positive': 0,'negative': 1}
optimized_df['class'] = optimized_df['class'].apply(lambda x: dict_sentiment.get(x))
opt_targets=pd.get_dummies(optimized_df,prefix="",prefix_sep='',columns=['class'])

In [25]:
opt_train = opt_targets.sample(frac=0.8)
opt_test=opt_targets.drop(opt_train.index)

In [26]:
from simpletransformers.classification import ClassificationModel
opt_model = ClassificationModel('bert', 'bert-base-uncased', use_cuda=False,args={
                                                                                             'reprocess_input_data': True,
                                                                                             'overwrite_output_dir': True,
                                                                                             'fp16': False,
                                                                                             'do_lower_case': False,
                                                                                             'num_train_epochs': 2,
                                                                                             
                                                                                             'regression': False,
                                                                                      
                                                                                             "learning_rate":4e-5,
                                                                                             'weight_decay':0.0,
                                                                                             "save_eval_checkpoints": False,
                                                                                             "save_model_every_epoch": False,
                                                                                             "silent": False})

opt_model.train_model(opt_train)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/13090 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/1637 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1637 [00:00<?, ?it/s]

(3274, 0.23010731098924736)

In [29]:
from sklearn.metrics import classification_report, accuracy_score
opt_test_result, opt_test_model_outputs, opt_test_wrong_predictions = opt_model.eval_model(opt_test,acc=accuracy_score)



  0%|          | 0/3273 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/410 [00:00<?, ?it/s]

In [30]:
opt_test_result

{'mcc': 0.861698050255382,
 'tp': 1598,
 'tn': 1449,
 'fp': 102,
 'fn': 124,
 'auroc': 0.9793138591789343,
 'auprc': 0.9811517095005018,
 'acc': 0.9309501985945615,
 'eval_loss': 0.29111145737314054}