# Download training dataset

https://github.com/cardiffnlp/tweeteval/tree/main/datasets/sentiment

In [None]:
import pandas as pd

In [3]:
# Load the TweetEval sentiment splits that are stored as CSV, one DataFrame per file.
# The tuple order on the left matches the path order on the right.
(test_labels, test_text, train_labels, val_labels, val_text) = map(
    pd.read_csv,
    (
        "../raw_data/cardiffnlp_tweeteval/test_labels.csv",
        "../raw_data/cardiffnlp_tweeteval/test_text.csv",
        "../raw_data/cardiffnlp_tweeteval/train_labels.csv",
        "../raw_data/cardiffnlp_tweeteval/val_labels.csv",
        "../raw_data/cardiffnlp_tweeteval/val_text.csv",
    ),
)

In [None]:
# The training text is a raw .txt file. The three-character separator `" "`
# is treated by pandas as a regular expression, which only the Python parsing
# engine supports; requesting that engine explicitly avoids the ParserWarning
# pandas emits when it has to fall back from the C engine on its own.
# NOTE(review): the separator looks chosen so that it never matches and each
# line lands in a single column — confirm against the file.
train_text = pd.read_csv(
    "../raw_data/cardiffnlp_tweeteval/train_text.txt",
    sep="\" \"",
    header=None,
    engine="python",
)
train_text

# Import some pre-trained models

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSeq2SeqLM, AutoConfig, pipeline

In [10]:
def get_model_elements(model_path: str, model_type: str):
    """
    Load the tokenizer, the model and (for classifiers) the config of a
    pretrained checkpoint.
    Args:
        model_path: model identifier or path, passed unchanged to every
            `from_pretrained` call
        model_type: 'classifier' loads `AutoModelForSequenceClassification`
            together with its `AutoConfig`; 'generator' loads
            `AutoModelForSeq2SeqLM` and leaves the config as None
    Returns: tuple `(tokenizer, model, config)`
    Raises:
        ValueError: if `model_type` is neither 'classifier' nor 'generator'
    """
    config = None
    if model_type == 'classifier':
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        config = AutoConfig.from_pretrained(model_path)
    elif model_type == 'generator':
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    else:
        # Fail loudly: callers unpack three values, so returning a bare None
        # would only surface later as an unrelated "cannot unpack" TypeError.
        raise ValueError(f'ERROR: model type {model_type} not supported')
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return tokenizer, model, config

In [11]:
# Sentiment classifier: tokenizer, model and label config.
sentiment_path = 'cardiffnlp/twitter-roberta-base-sentiment'
(
    sentiment_tokenizer,
    sentiment_model,
    sentiment_config,
) = get_model_elements(sentiment_path, 'classifier')

# Emotion classifier. Per the original note, its pipeline returns e.g.
# {'sadness': ..., 'joy': ..., 'anger': ..., 'disgust': ..., 'fear': ..., 'surprise': ...}
emotions_path = 'Emanuel/bertweet-emotion-base'
(
    emotions_tokenizer,
    emotions_model,
    emotions_config,
) = get_model_elements(emotions_path, 'classifier')

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/515M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [15]:
from typing import List, Dict

In [16]:
def get_scores(text: str, tokenizer, model, config, keys: List[str]) -> Dict[str, float]:
    """
    Get the scores (sentiment or emotions) of a single text.
    Args:
        text: text to score
        tokenizer: Tokenizer applied to `text` (called with `return_tensors='pt'`)
        model: Classification model for the tokenized text
        config: Classification config (output label mapping) from `AutoConfig`
        keys: List containing "better" names than the `config` labels, aligned
            with those labels taken in alphabetical order
    Returns: dict mapping each entry of `keys` to `text`'s softmax score for
        the corresponding `model` output label
    Raises:
        ValueError: if `keys` does not contain exactly one name per output label
    """
    # Local import so the function works on a fresh kernel, regardless of
    # whether a later cell has already imported numpy.
    import numpy as np

    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    logits = output[0][0].detach().numpy()

    # Softmax over the labels; subtracting the max keeps exp() from overflowing.
    exp_logits = np.exp(logits - np.max(logits))
    scores = exp_logits / exp_logits.sum()

    # Score per config label, then read the scores back in alphabetical label
    # order, which is the order `keys` is documented to follow.
    by_label = {config.id2label[i]: float(scores[i]) for i in range(len(scores))}
    ordered_scores = [by_label[label] for label in sorted(by_label)]

    if len(keys) != len(ordered_scores):
        # zip() would silently drop labels or keys on a length mismatch.
        raise ValueError(
            f'expected {len(ordered_scores)} keys (one per output label), got {len(keys)}'
        )
    return dict(zip(keys, ordered_scores))

Trying the model with some tweets:

In [31]:
import pandas as pd
import re
import string
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def clean_text(text):
    '''Normalise a tweet for tokenisation.

    Steps, in order: lowercase, remove text in square brackets, remove links,
    remove <...> tags, remove punctuation, turn newlines into spaces, remove
    words containing numbers, remove any remaining digit characters.

    Args:
        text: value to clean; it is converted with `str()` first, so
            non-string input is accepted.
    Returns: the cleaned string. Runs of spaces left behind by removed
        tokens are not collapsed.
    '''
    text = str(text).lower()
    # Raw strings keep regex escapes such as \[ and \S from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space: deleting them would glue the last word
    # of one line to the first word of the next.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    # str.isdigit() also covers digit characters that \d does not match
    # (e.g. superscripts), so this pass is not redundant.
    text = ''.join(ch for ch in text if not ch.isdigit())
    return text

In [56]:
# Load the scraped tweets and give the two data columns readable names.
# NOTE(review): the current column names look like a data row (a timestamp and
# a tweet), i.e. the file may have no real header line — confirm against the CSV.
X_pred = pd.read_csv("../raw_data/tweets.csv").rename(
    columns={
        '2022-05-31 15:38:40.948508': 'date',
        "@Apple Pricey. That's iPhone.": "text",
    }
)
X_pred

Unnamed: 0,date,text
0,2022-05-31 15:38:40.948508,No there is nothing you can immutably claim at...
1,2022-05-31 15:38:40.948508,@Brueck1988 @Apple Might need to call for a tu...
2,2022-05-31 15:38:40.948508,Guys I have a question. If @Apple states that ...
3,2022-05-31 15:38:40.948508,@Apple is also happy to give access to your ac...
4,2022-05-31 15:38:40.948508,"//End of Thread// If you enjoyed this story, ..."
...,...,...
1098,2022-05-31 15:38:40.948508,@Apple needs to buy @Mullen_USA NOW! @Tesla
1099,2022-05-31 15:38:40.948508,@rfog42 @tim_cook @Apple 🤣🤣🤣 https://t.co/A8Bj...
1100,2022-05-31 15:38:40.948508,@JudiciaryGOP This country is bathing in guns ...
1101,2022-05-31 15:38:40.948508,Why doesn’t the #iphone autocorrect “constipat...


In [57]:
# Run every tweet through `clean_text`, overwriting the raw "text" column.
X_pred["text"] = X_pred["text"].map(clean_text)
X_pred

Unnamed: 0,date,text
0,2022-05-31 15:38:40.948508,no there is nothing you can immutably claim at...
1,2022-05-31 15:38:40.948508,apple might need to call for a tutorial on th...
2,2022-05-31 15:38:40.948508,guys i have a question if apple states that th...
3,2022-05-31 15:38:40.948508,apple is also happy to give access to your acc...
4,2022-05-31 15:38:40.948508,end of thread if you enjoyed this story make ...
...,...,...
1098,2022-05-31 15:38:40.948508,apple needs to buy mullenusa now tesla
1099,2022-05-31 15:38:40.948508,timcook apple 🤣🤣🤣
1100,2022-05-31 15:38:40.948508,judiciarygop this country is bathing in guns a...
1101,2022-05-31 15:38:40.948508,why doesn’t the iphone autocorrect “constipate...


In [60]:
# Keep the cleaned tweets as plain strings for later inspection.
ls_train = X_pred["text"].to_list()
# Split each tweet into its word tokens.
# NOTE: this rebinds X_pred from a DataFrame to a list of token lists, so the
# cell cannot be re-run without first re-running the cells that build X_pred.
X_pred = list(map(text_to_word_sequence, ls_train))
X_pred

[['no',
  'there',
  'is',
  'nothing',
  'you',
  'can',
  'immutably',
  'claim',
  'at',
  'apple',
  'you',
  'are',
  'permanently',
  'at',
  'the',
  'visitor',
  'center',
  'nothing',
  'else',
  'johnny',
  'ives',
  'told',
  'you',
  'you',
  'have',
  'your',
  'prostitute',
  'you',
  'are',
  'immutably',
  'done',
  'thanks'],
 ['apple',
  'might',
  'need',
  'to',
  'call',
  'for',
  'a',
  'tutorial',
  'on',
  'that',
  'laterbout',
  'to',
  'lose',
  'my',
  'damn',
  'mind'],
 ['guys',
  'i',
  'have',
  'a',
  'question',
  'if',
  'apple',
  'states',
  'that',
  'the',
  'reason',
  'why',
  'they',
  'are',
  'not',
  'including',
  'any',
  'charging',
  'bricks',
  'on',
  'their',
  'iphones',
  'but',
  'for',
  'ipads',
  'they',
  'are',
  'including',
  'one',
  'just',
  'because',
  'its',
  'so',
  'expensive',
  'so',
  'where',
  'is',
  'saving',
  'the',
  'environment',
  'in',
  'that',
  'anyone',
  'thank',
  'you'],
 ['apple',
  'is',
  'a

In [65]:
# Show the first cleaned tweet as a single string (the form it had before word-splitting).
ls_train[0]

'no there is nothing you can immutably claim at apple you are permanently at the visitor center nothing else johnny ives told you you have your  prostitute you are immutably done thanks'