In [1]:
# Uncomment to install the Hugging Face `transformers` package into the kernel's environment:
# %pip install transformers

In [2]:
import re
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
from scipy.special import softmax
import csv
import urllib.request

# Matches an http:// or https:// URL up to the next whitespace character.
# NOTE(review): in the visible cells this pattern is only referenced from a
# commented-out line — confirm whether it is still needed.
url_pattern = re.compile(r'https?://\S+')

# Merge all the labeled data

In [6]:
# Load the full filtered tweet table and discard the two leftover
# index-like columns in the same expression, then inspect the schema.
total = pd.read_csv('filtered.csv').drop(columns=['Unnamed: 0', 'index'])
total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127181 entries, 0 to 127180
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   tweet_id          127181 non-null  float64
 1   created_datetime  127181 non-null  object 
 2   content           127181 non-null  object 
 3   author_id         127180 non-null  float64
 4   place_id          122938 non-null  object 
 5   location          122904 non-null  object 
 6   longitude         127180 non-null  float64
 7   latitude          127180 non-null  float64
 8   county            127180 non-null  object 
dtypes: float64(4), object(5)
memory usage: 8.7+ MB


In [7]:
# Hand-labeled workbooks, read in the order they are stacked below.
labeled_workbooks = [
    'CDFW_labeled_1.xlsx',
    'CDFW_labeled_2.xlsx',
    'Xin_labeled_3.xlsx',
    'Xin_labeled_4.xlsx',
    'Xin_labeled_5.xlsx',
]
df_1, df_2, df_3, df_4, df_5 = (pd.read_excel(path) for path in labeled_workbooks)

df = pd.concat([df_1, df_2, df_3, df_4, df_5])

# Fold the upper-case 'A' detail code into its lower-case spelling;
# every other value (including missing ones) passes through unchanged.
df['Label_detail'] = df['Label_detail'].map(lambda code: code if code != 'A' else 'a')

# Keep only rows whose basic label is 1 or 2 and renumber them from zero.
has_basic_label = df['Label_basic'].isin([1, 2])
df = df[has_basic_label].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2682 entries, 0 to 2681
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          2682 non-null   float64
 1   created_datetime  2682 non-null   object 
 2   county            2682 non-null   object 
 3   Label_basic       2682 non-null   float64
 4   Label_detail      319 non-null    object 
 5   content           2682 non-null   object 
 6   index             908 non-null    float64
 7   author_id         2462 non-null   float64
 8   place_id          2440 non-null   object 
 9   location          2439 non-null   object 
 10  longitude         2462 non-null   float64
 11  latitude          2462 non-null   float64
 12  tokenized_tweets  224 non-null    object 
 13  tokenized         224 non-null    object 
 14  merged            224 non-null    object 
 15  Unnamed: 0        1330 non-null   float64
dtypes: float64(7), object(9)
memory usage: 335

In [17]:
# df.to_csv('tt.csv')

In [11]:
# Load the sixth labeled batch, keep rows labeled 1 or 2, drop the
# bookkeeping columns and expose the basic label under the name 'label'.
df_6 = pd.read_csv('Xin_labeled_6.csv')
batch_6_labeled = df_6['Label_basic'].isin([1, 2])
df_6 = (
    df_6[batch_6_labeled]
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'index', 'merged'])
    .rename(columns={'Label_basic': 'label'})
)
df_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7658 entries, 0 to 7657
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          7658 non-null   float64
 1   created_datetime  7658 non-null   object 
 2   content           7658 non-null   object 
 3   author_id         7658 non-null   float64
 4   place_id          7586 non-null   object 
 5   location          7579 non-null   object 
 6   longitude         7658 non-null   float64
 7   latitude          7658 non-null   float64
 8   county            7658 non-null   object 
 9   label             7658 non-null   float64
dtypes: float64(5), object(5)
memory usage: 598.4+ KB


In [12]:
# A row of `df` is dropped when either its text or its tweet id already
# appears in `df_6`; everything else is kept.
already_in_df_6 = df['content'].isin(df_6['content']) | df['tweet_id'].isin(df_6['tweet_id'])
df_filtered = df[~already_in_df_6]
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1258 entries, 0 to 2679
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          1258 non-null   float64
 1   created_datetime  1258 non-null   object 
 2   county            1258 non-null   object 
 3   Label_basic       1258 non-null   float64
 4   Label_detail      99 non-null     object 
 5   content           1258 non-null   object 
 6   index             480 non-null    float64
 7   author_id         1091 non-null   float64
 8   place_id          1080 non-null   object 
 9   location          1080 non-null   object 
 10  longitude         1091 non-null   float64
 11  latitude          1091 non-null   float64
 12  tokenized_tweets  84 non-null     object 
 13  tokenized         84 non-null     object 
 14  merged            84 non-null     object 
 15  Unnamed: 0        527 non-null    float64
dtypes: float64(7), object(9)
memory usage: 167

In [13]:
# For each remaining labeled tweet, look up its record in `total` (same
# text OR same tweet id) and carry the human-assigned basic label across.
final, labels = [], []
for _, labeled_row in df_filtered.iterrows():
    same_content = total['content'] == labeled_row['content']
    same_tweet_id = total['tweet_id'] == labeled_row['tweet_id']
    candidates = total.loc[same_content | same_tweet_id]
    if candidates.empty:
        # No counterpart in `total`: this labeled tweet is left out.
        continue
    # When several rows match, only the first one is kept.
    final.append(candidates.iloc[0])
    labels.append(labeled_row['Label_basic'])

# Attach the labels positionally, then append the sixth batch underneath.
final_df = pd.DataFrame(final).assign(label=labels)
final_df = pd.concat([final_df, df_6]).reset_index(drop=True)
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8516 entries, 0 to 7657
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          8516 non-null   float64
 1   created_datetime  8516 non-null   object 
 2   content           8516 non-null   object 
 3   author_id         8516 non-null   float64
 4   place_id          8432 non-null   object 
 5   location          8425 non-null   object 
 6   longitude         8516 non-null   float64
 7   latitude          8516 non-null   float64
 8   county            8516 non-null   object 
 9   label             8516 non-null   float64
dtypes: float64(5), object(5)
memory usage: 731.8+ KB


In [14]:
# Two rows count as duplicates only when both the tweet id and the text
# agree; the first occurrence is the one that survives.
dedupe_keys = ['tweet_id', 'content']
final_df = final_df.drop_duplicates(subset=dedupe_keys, keep='first')
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8488 entries, 0 to 7657
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          8488 non-null   float64
 1   created_datetime  8488 non-null   object 
 2   content           8488 non-null   object 
 3   author_id         8488 non-null   float64
 4   place_id          8405 non-null   object 
 5   location          8398 non-null   object 
 6   longitude         8488 non-null   float64
 7   latitude          8488 non-null   float64
 8   county            8488 non-null   object 
 9   label             8488 non-null   float64
dtypes: float64(5), object(5)
memory usage: 729.4+ KB


In [16]:
# Eyeball a random sample of the merged data. The fixed seed makes the same
# 50 rows appear on every Restart & Run All, so the displayed output is
# reproducible.
final_df.sample(n=50, random_state=42)

Unnamed: 0,tweet_id,created_datetime,content,author_id,place_id,location,longitude,latitude,county,label
6018,7.29e+17,5/8/16,Love this bear so much @ California Academy of...,2600977000.0,5a110d312052166f,"San Francisco, CA",-122.466354,37.769936,SAN FRANCISCO,2.0
552,5.88851e+17,2015-04-16,Don't sleep on Grizzly Bear,2724594000.0,3b77caf94bfc81fe,"Los Angeles, CA",-118.36518,33.99941,LOS ANGELES,2.0
55,1.485202e+17,2011-12-18,Basement bear just wants to hibernate in peace...,7214022.0,866269c983527d5a,"Ashbury Heights, San Francisco",-122.44995,37.76596,SAN FRANCISCO,2.0
6547,7.86e+17,10/12/16,Bear Attack Victim Might Have Interrupted Mati...,325165000.0,1927193c57f35d51,"West Hollywood, CA",-118.36331,34.089542,LOS ANGELES,2.0
6987,8.67e+17,5/24/17,@campstake My grandmother and a bear 🐻 surpris...,3315540000.0,7d27a1f115a32664,"Ojai, CA",-119.226685,34.459187,VENTURA,1.0
6525,7.84e+17,10/5/16,"Ledger hoped the Jeep’s smell of metal, oil an...",43370190.0,a409256339a7c6a1,"Redwood City, CA",-122.25409,37.500957,SAN MATEO,1.0
3431,3.08e+17,3/2/13,Polar bear! #sfzoo http://t.co/JUVAB1fiOb,14695820.0,5a110d312052166f,"San Francisco, CA",-122.47957,37.766644,SAN FRANCISCO,1.0
1923,1.38e+18,4/20/21,@DEvanAltman That's a bear,1190000000.0,aa30747001a23f03,"Danville, CA",-121.921234,37.776703,CONTRA COSTA,1.0
3371,2.9e+17,1/12/13,•x• Cute little bear,824236600.0,3134f9d2892d2685,"Hawthorne, CA",-118.36669,33.909264,LOS ANGELES,1.0
29,1.469177e+18,2021-12-10,I do not blame the bear at all. https://t.co/6...,881361300.0,3b77caf94bfc81fe,"Los Angeles, CA",-118.36518,33.99941,LOS ANGELES,2.0


In [15]:
# Non-null counts of every column, split by label value.
final_df.groupby('label').count()

Unnamed: 0_level_0,tweet_id,created_datetime,content,author_id,place_id,location,longitude,latitude,county
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,6054,6054,6054,6054,6004,5997,6054,6054,6054
2.0,2434,2434,2434,2434,2401,2401,2434,2434,2434


# Add sentiment labels

In [18]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Any token that starts with '@' and
    is longer than one character becomes '@user'; any token that starts with
    'http' becomes 'http'. All other tokens, and the spacing between tokens,
    are left exactly as they were.
    """
    def _mask(token):
        # A bare '@' is not a mention and is left untouched.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

### Method 1: emotion classifier (`michellejieli/emotion_text_classifier`)

In [19]:
classifier_1 = pipeline("sentiment-analysis", model="michellejieli/emotion_text_classifier")

# Spot-check the classifier on the first 20 tweets: print the masked text
# followed by the pipeline's prediction for it. Stripping links with
# `url_pattern` is left disabled; preprocess() already masks them.
for raw_tweet in final_df.content.head(20):
    line = preprocess(raw_tweet)
    print(line)
    print(classifier_1(line))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


11.8.21 
Mama Bear and her cub. Welcome to the family, Herbie. (Also #adoptdontshop)

#instadog #newpuppyparents #dogsofinstagram @ Palm Springs, California http
[{'label': 'joy', 'score': 0.8302091956138611}]
@user The media is going back to the polar bear lies. Despicable
[{'label': 'anger', 'score': 0.6893523335456848}]
If you ever see me fighting in the forest with a grizzly bear. HELP THE BEAR! cause that bitch gone need it.
[{'label': 'anger', 'score': 0.988049328327179}]
@user Her current obsession is Masha and the Bear so HELL YEAH!
[{'label': 'joy', 'score': 0.3551543056964874}]
Fighting the urge to disappear into the forest and getting eaten by a bear, by looking fucking cute as all hell. http
[{'label': 'disgust', 'score': 0.47347891330718994}]
What do they say about what a bear does on the woods? Now you can have it on you mantel. http
[{'label': 'neutral', 'score': 0.95692378282547}]
Sometimes you eat the bear and sometimes the bear eats you. 

Pandemic has been unbearable

### Method 2: `cardiffnlp/twitter-roberta-base-sentiment` (not used)

In [None]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: tab-separated lines whose second field is the
# class name. Lines with fewer than two fields (e.g. a trailing blank line)
# are skipped.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) creates a local directory with the same relative path
# as the hub id. On later runs from_pretrained(MODEL) resolves to that
# directory, so the tokenizer must be saved there too; otherwise reloading the
# tokenizer fails because the directory holds only the model files.
# If a directory from an earlier run already exists without tokenizer files,
# delete it once before re-running this cell.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)
labels

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Display the class names currently held in `labels`.
labels

['negative', 'neutral', 'positive']

In [None]:
# For each of the first 20 tweets: print the raw text, then every class
# with its softmax probability, most probable first.
for text in final_df.content.head(20):
    print(text)
    cleaned = preprocess(text)
    encoded_input = tokenizer(cleaned, return_tensors='pt')
    output = model(**encoded_input)
    # The first element of the output is passed through softmax to obtain
    # one probability per class for this single tweet.
    scores = softmax(output[0][0].detach().numpy())

    # Class indices ordered from highest to lowest probability.
    ranking = np.argsort(scores)[::-1]
    for position, class_idx in enumerate(ranking, start=1):
        print(f"{position}) {labels[class_idx]} {np.round(float(scores[class_idx]), 4)}")

Follow my YouTube WILD CHARLES for longer vids! 

*Never feed a wild bear this one is at a rescue* https://t.co/Tn8y7b10TG
1) neutral 0.5986
2) positive 0.2319
3) negative 0.1694
Polar bear investigating a camera, Churchill, Manitoba, Canada https://t.co/VKCfn9z6BA
1) neutral 0.8891
2) positive 0.0739
3) negative 0.0371
So today, #Petaluma has a bear wandering around backyards on west side. @PetalumaCityGov police need to send a current update for residents to read. #nixel https://t.co/GVfwLilb5Z
1) neutral 0.6385
2) negative 0.3195
3) positive 0.0421
Bear is up in a tree. #Petaluma https://t.co/X2OIZJyZWf
1) neutral 0.752
2) negative 0.1827
3) positive 0.0652
@Paaauuuliinaa the polar bears are eating each other! so sad :( http://t.co/CynLZR5S
1) negative 0.9695
2) neutral 0.0279
3) positive 0.0026
@KTVU Bear is down from tree
1) neutral 0.7573
2) negative 0.1908
3) positive 0.0519
@tlhicks713 I know. And there's some good ones out there, too. 
Btw I wouldn't want to be anywhere near a

### Label the final dataset with Method 1

In [20]:
# Classify every tweet with the Method 1 pipeline and keep only the
# predicted label of the first returned result.
sentiment_1 = [
    classifier_1(preprocess(tweet))[0]['label']
    for tweet in final_df.content
]

# A second column from the Method 2 model (`sentiment_2`) is not produced here.

# The list is in row order, so it is attached positionally.
final_df['sentiment_1'] = sentiment_1
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8488 entries, 0 to 7657
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tweet_id          8488 non-null   float64
 1   created_datetime  8488 non-null   object 
 2   content           8488 non-null   object 
 3   author_id         8488 non-null   float64
 4   place_id          8405 non-null   object 
 5   location          8398 non-null   object 
 6   longitude         8488 non-null   float64
 7   latitude          8488 non-null   float64
 8   county            8488 non-null   object 
 9   label             8488 non-null   float64
 10  sentiment_1       8488 non-null   object 
dtypes: float64(5), object(6)
memory usage: 795.8+ KB


In [21]:
# Preview the first rows of the final table.
final_df.head()

Unnamed: 0,tweet_id,created_datetime,content,author_id,place_id,location,longitude,latitude,county,label,sentiment_1
0,1.458125e+18,2021-11-09,11.8.21 \nMama Bear and her cub. Welcome to th...,2660056000.0,4265ece9285a2872,"Palm Springs, CA",-116.5279,33.8443,RIVERSIDE,2.0,joy
1,1.458291e+18,2021-11-10,@CNN The media is going back to the polar bear...,1.333958e+18,f95304ef80fecc3f,"Temecula, CA",-117.083496,33.522667,RIVERSIDE,2.0,anger
2,1.459411e+18,2021-11-13,If you ever see me fighting in the forest with...,1.126902e+18,3b77caf94bfc81fe,"Los Angeles, CA",-118.36518,33.99941,LOS ANGELES,2.0,anger
3,1.459424e+18,2021-11-13,@mashaslamovich Her current obsession is Masha...,360405000.0,5ecbd073f39c00fa,"Hayward, CA",-122.032455,37.68489,CONTRA COSTA,2.0,joy
4,1.459562e+18,2021-11-13,Fighting the urge to disappear into the forest...,461353000.0,a592bd6ceb1319f7,"San Diego, CA",-117.10973,32.801037,SAN DIEGO,2.0,disgust


In [22]:
# Write the final table to disk.
# NOTE(review): no `index=False` is passed, so the DataFrame index is written
# as an unnamed first column and will reload as 'Unnamed: 0' — confirm that
# downstream readers expect it before changing this.
final_df.to_csv('final_db.csv')