# Project Tweet Analysis

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
from scipy.special import softmax
import csv
import urllib.request

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []


    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [4]:
# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

In [5]:
tweets = pd.read_csv('tweets.csv')

In [6]:
tweets.columns = ['user', 'datetime', 'content']

In [7]:
tweets

Unnamed: 0,user,datetime,content
0,repdonyoung,2020-12-07 16:46:19,"Today, we remember the courageous souls lost 7..."
1,repdonyoung,2020-12-04 20:52:29,I will continue working with my colleagues and...
2,repdonyoung,2020-12-04 20:52:29,"As co-founder of the House Cannabis Caucus, I ..."
3,repdonyoung,2020-12-04 20:52:29,"Since Alaska legalized marijuana, I have heard..."
4,repdonyoung,2020-12-04 20:52:29,I am a passionate supporter of a states’ right...
...,...,...,...
61533,RepLizCheney,2020-04-09 00:52:31,Click here for the latest updates on the feder...
61534,RepLizCheney,2020-04-08 22:44:33,Wishing everyone observing tonight a happy Pas...
61535,RepLizCheney,2020-04-07 21:39:30,Since the launch of the Paycheck Protection Pr...
61536,RepLizCheney,2020-04-07 20:12:41,We need to help Americans who are hurting and ...


In [9]:
tweets = tweets[tweets.user != 'housedemocrats']
tweets = tweets[tweets.user != 'housegop']

In [10]:
tweets

Unnamed: 0,user,datetime,content
0,repdonyoung,2020-12-07 16:46:19,"Today, we remember the courageous souls lost 7..."
1,repdonyoung,2020-12-04 20:52:29,I will continue working with my colleagues and...
2,repdonyoung,2020-12-04 20:52:29,"As co-founder of the House Cannabis Caucus, I ..."
3,repdonyoung,2020-12-04 20:52:29,"Since Alaska legalized marijuana, I have heard..."
4,repdonyoung,2020-12-04 20:52:29,I am a passionate supporter of a states’ right...
...,...,...,...
61533,RepLizCheney,2020-04-09 00:52:31,Click here for the latest updates on the feder...
61534,RepLizCheney,2020-04-08 22:44:33,Wishing everyone observing tonight a happy Pas...
61535,RepLizCheney,2020-04-07 21:39:30,Since the launch of the Paycheck Protection Pr...
61536,RepLizCheney,2020-04-07 20:12:41,We need to help Americans who are hurting and ...


In [31]:
def classify(row):
    text = preprocess(row['content'])
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        if l == 'positive':
            positive = scores[ranking[i]]
        elif l == "neutral":
            neutral = scores[ranking[i]]
        elif l == "negative":
            negative = scores[ranking[i]]
    return positive, neutral, negative 

In [28]:
n = 500  #chunk row size
tweets_df = [tweets[i:i+n] for i in range(0,tweets.shape[0],n)]

In [36]:
tweets_final = pd.DataFrame()
for chunk in tweets_df:
    chunk['positive'], chunk['neutral'], chunk['negative'] = zip(*chunk.apply(classify, axis=1))
    #tweets_final[['positive', 'neutral', 'negative']] = tweets_final.apply(lambda row: classify(row), axis=1)
    tweets_final = tweets_final.append(chunk)
    print("Processed chunk of 500")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed chunk of 500
Processed c

In [37]:
tweets_final.to_csv(r'./tweets_final.csv',index=False)

In [38]:
tweets_final

Unnamed: 0,user,datetime,content,positive,neutral,negative
0,repdonyoung,2020-12-07 16:46:19,"Today, we remember the courageous souls lost 7...",0.680861,0.277483,0.041656
1,repdonyoung,2020-12-04 20:52:29,I will continue working with my colleagues and...,0.800538,0.195412,0.004050
2,repdonyoung,2020-12-04 20:52:29,"As co-founder of the House Cannabis Caucus, I ...",0.586395,0.393374,0.020230
3,repdonyoung,2020-12-04 20:52:29,"Since Alaska legalized marijuana, I have heard...",0.015326,0.282598,0.702076
4,repdonyoung,2020-12-04 20:52:29,I am a passionate supporter of a states’ right...,0.220732,0.544757,0.234510
...,...,...,...,...,...,...
61533,RepLizCheney,2020-04-09 00:52:31,Click here for the latest updates on the feder...,0.073239,0.907484,0.019278
61534,RepLizCheney,2020-04-08 22:44:33,Wishing everyone observing tonight a happy Pas...,0.980894,0.018106,0.001000
61535,RepLizCheney,2020-04-07 21:39:30,Since the launch of the Paycheck Protection Pr...,0.443877,0.550937,0.005186
61536,RepLizCheney,2020-04-07 20:12:41,We need to help Americans who are hurting and ...,0.550615,0.416058,0.033327


In [4]:
tweets_final = pd.read_csv('./tweets_final.csv')

In [5]:
us_reps

Unnamed: 0,user,datetime,content,positive,neutral,negative
0,repdonyoung,2020-12-07 16:46:19,"Today, we remember the courageous souls lost 7...",0.680861,0.277483,0.041656
1,repdonyoung,2020-12-04 20:52:29,I will continue working with my colleagues and...,0.800538,0.195412,0.004050
2,repdonyoung,2020-12-04 20:52:29,"As co-founder of the House Cannabis Caucus, I ...",0.586395,0.393374,0.020230
3,repdonyoung,2020-12-04 20:52:29,"Since Alaska legalized marijuana, I have heard...",0.015326,0.282598,0.702076
4,repdonyoung,2020-12-04 20:52:29,I am a passionate supporter of a states’ right...,0.220732,0.544757,0.234510
...,...,...,...,...,...,...
52515,RepLizCheney,2020-04-09 00:52:31,Click here for the latest updates on the feder...,0.073239,0.907484,0.019278
52516,RepLizCheney,2020-04-08 22:44:33,Wishing everyone observing tonight a happy Pas...,0.980894,0.018106,0.001000
52517,RepLizCheney,2020-04-07 21:39:30,Since the launch of the Paycheck Protection Pr...,0.443877,0.550936,0.005186
52518,RepLizCheney,2020-04-07 20:12:41,We need to help Americans who are hurting and ...,0.550615,0.416058,0.033327


In [27]:
tweets_small

Unnamed: 0,user,datetime,content,positive,neutral,negative
0,repdonyoung,2020-12-07 16:46:19,"Today, we remember the courageous souls lost 7...",0.680861,0.277483,0.041656
1,repdonyoung,2020-12-04 20:52:29,I will continue working with my colleagues and...,0.800538,0.195412,0.004050
2,repdonyoung,2020-12-04 20:52:29,"As co-founder of the House Cannabis Caucus, I ...",0.586395,0.393374,0.020230
3,repdonyoung,2020-12-04 20:52:29,"Since Alaska legalized marijuana, I have heard...",0.015326,0.282598,0.702076
4,repdonyoung,2020-12-04 20:52:29,I am a passionate supporter of a states’ right...,0.220732,0.544757,0.234510
...,...,...,...,...,...,...
495,RepMikeRogersAL,2020-09-02 13:25:37,Montgomery County! Have you completed your cen...,0.174953,0.813216,0.011831
496,RepMikeRogersAL,2020-09-02 00:38:28,“Mostly Peaceful” https://t.co/nyjywv836v,0.433050,0.545009,0.021941
497,RepMikeRogersAL,2020-09-02 00:37:45,Thank you @realdonaldtrump for showing some re...,0.971366,0.027205,0.001429
498,RepMikeRogersAL,2020-09-01 17:56:50,Exciting stuff coming tomorrow! #AlabamaCounts...,0.985020,0.014078,0.000903


In [None]:
tweets

In [None]:
tweets.to_csv(r'./tweets_annotated.csv',index=False)

## Model Testing

In [25]:

text = """



"""
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]


# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)


array([0, 1, 2])

In [None]:
scores[ranking[2]]
if labels[ranking[1]] == 'neutral':
    print(labels[ranking[1]])

In [None]:
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        print(f"{i+1}) {l} {np.round(float(s), 4)}")