In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [4]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [5]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def run_model(text):

  text = preprocess(text)
  encoded_input = tokenizer(text, return_tensors='pt')
  output = model(**encoded_input)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)

  # Print labels and scores
  ranking = np.argsort(scores)
  ranking = ranking[::-1]
  return scores

In [11]:
import pandas as pd
def extract_from_file(filename):
  print(f'{filename} starting')
  df = pd.read_json(filename, lines=True)
  df[['negative','neutral','positive']] =  df.apply(lambda x: run_model(x["text"]), axis=1, result_type="expand")
  df.to_csv(f'{filename}.csv')
  print(f'{filename} done')
  # print(df)

In [12]:
import os

a_directory = "tweets"
incr = 0
for filename in os.listdir(a_directory):
    try:
        filepath = os.path.join(a_directory, filename)
        extract_from_file(filepath)
    except Exception as e:
        print(e)
        
    incr += 1
    print(incr)

tweets/2014-10-14 starting
Expected object or value
1
tweets/2015-06-20 starting
Expected object or value
2
tweets/2016-03-03 starting
tweets/2016-03-03 done
3
tweets/2015-02-10 starting
tweets/2015-02-10 done
4
tweets/2015-05-07 starting
tweets/2015-05-07 done
5
tweets/2015-01-08 starting
tweets/2015-01-08 done
6
tweets/2014-08-20 starting
tweets/2014-08-20 done
7
tweets/2015-02-17 starting
tweets/2015-02-17 done
8
tweets/2016-03-04 starting
tweets/2016-03-04 done
9
tweets/2014-08-18 starting
tweets/2014-08-18 done
10
tweets/2015-06-27 starting
tweets/2015-06-27 done
11
tweets/2014-10-13 starting
tweets/2014-10-13 done
12
tweets/2015-01-30 starting
tweets/2015-01-30 done
13
tweets/2014-08-27 starting
tweets/2014-08-27 done
14
tweets/2015-02-28 starting
tweets/2015-02-28 done
15
tweets/2015-06-18 starting
tweets/2015-06-18 done
16
tweets/2015-02-21 starting
tweets/2015-02-21 done
17
tweets/2015-06-11 starting
tweets/2015-06-11 done
18
tweets/2014-10-25 starting
tweets/2014-10-25 done
1