### Quick and dirty sentiment analysis of tweets by Finnish political party leaders (as of November 2021)
Author: [Oguzhan (Ouz) Gencoglu](https://www.linkedin.com/posts/ogencoglu_nlp-activity-6861189201119989760-P2-n)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from glob2 import glob
from collections import Counter
from itertools import compress
from pprint import pprint
from tqdm import tqdm
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn
import seaborn as sns
import matplotlib.pyplot as plt

### Fetch the data

First run *twint* from the command line once for each Twitter user, e.g.:
>twint -u marinsanna --timeline -o marinsanna.csv --csv

Save the csv files under *'data/'*

In [None]:
# Lookup table: Twitter handle (as it appears in the "username" column of the
# CSV files) -> display name used in the printed summary and the plot legend.
name_map = {
    "turtiainenano": "Ano Turtiainen",
    "mariaohisalo": "Maria Ohisalo",
    "annikasaarikko": "Annika Saarikko",
    "sariessayah": "Sari Essayah",
    "petteriorpo": "Petteri Orpo",
    "anna_maja": "Anna-Maja Henriksson",
    "hjallisharkimo": "Harry Harkimo",
    "liandersson": "Li Andersson",
    "ir_rkp": "Riikka Purra",
    "marinsanna": "Sanna Marin",
}

In [None]:
# Collect the tweets of every account into a single dataframe.
top = 500  # maximum number of (non-retweet) rows kept per CSV file
filepaths = glob(pathname="data/*.csv")
wanted_columns = ["username", "retweet_id", "tweet"]

frames = []
for csv_path in filepaths:
    tweets = pd.read_csv(csv_path, delimiter="\t", usecols=wanted_columns)
    # Rows with an empty "retweet_id" are kept; everything else is treated
    # as a retweet and dropped before taking the first `top` rows.
    originals = tweets[tweets["retweet_id"].isna()]
    frames.append(originals.iloc[:top])

data = pd.concat(frames)
data = data.drop(columns=["retweet_id"])  # only needed for the filter above
# Replace Twitter handles with display names; handles that are missing from
# name_map become NaN.
data["username"] = data["username"].map(name_map)
data = data.reset_index(drop=True)

# Sanity check: number of tweets kept per person.
pprint(Counter(data["username"]))

### Load pre-trained Finnish sentiment classification model

In [None]:
# Checkpoint identifier handed to `from_pretrained`; the tokenizer and the
# classifier are both loaded from the same checkpoint.
model_name = "fergusq/finbert-finnsentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Classify sentiment of tweets

In [None]:
def get_sentiment(tokenizer, model, batch_of_text):
    """Return class probabilities for a batch of texts.

    Args:
        tokenizer: Callable that turns a list of strings into a mapping of
            PyTorch tensors (called with ``return_tensors="pt"``).
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute.
        batch_of_text: List of strings to classify together.

    Returns:
        A NumPy array with the softmax of ``logits`` taken over the last
        axis, i.e. one probability distribution per input text.
    """
    # Local import: the file only imports `nn` from torch at module level.
    import torch

    # Pad to the longest text in the batch and cut anything above 512 tokens.
    batch = tokenizer(batch_of_text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    # Pure inference: disable autograd so no gradient graph is built and kept
    # alive for every batch (saves memory and time).
    with torch.no_grad():
        outputs = model(**batch)
        # Softmax is what turns the raw logits into probabilities that sum to 1.
        predictions = nn.functional.softmax(outputs.logits, dim=-1)
    return predictions.cpu().detach().numpy()

def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
        
# Classify every tweet, a fixed-size batch at a time.
batch_size = 50
tweets = list(data["tweet"])
batches = chunks(lst=tweets, n=batch_size)
# `chunks` is a generator and has no length, so pass the number of batches to
# tqdm explicitly; without it the progress bar cannot show a total or an ETA.
n_batches = len(range(0, len(tweets), batch_size))

preds = []  # one probability array per batch; columns: neg | neut | pos
for batch in tqdm(batches, total=n_batches):
    pred = get_sentiment(tokenizer=tokenizer, model=model, batch_of_text=batch)
    preds.append(pred)
# Stack the per-batch arrays into a single array with one row per tweet.
preds = np.vstack(preds)
print(preds.shape)

In [None]:
# Add the per-class probabilities to the dataframe.
# NOTE(review): the column order neg | neut | pos is assumed from the comment
# where `preds` is collected -- confirm against the model's label mapping.
data["neg"] = preds[:, 0]
data["neut"] = preds[:, 1]
data["pos"] = preds[:, 2]
# The predicted class is the name of the column with the highest probability.
data["class"] = data[["neg", "neut", "pos"]].idxmax(axis=1)

# Number of tweets per (person, predicted class).
counts = data.groupby(["username", "class"]).count().reset_index()
# Turn the counts into percentages of each person's own tweets. Dividing by
# the per-person total (instead of a fixed 5, i.e. 500 / 100) keeps the value
# correct for accounts that have fewer than `top` tweets after filtering.
tweets_per_user = counts.groupby("username")["tweet"].transform("sum")
counts["tweet"] = 100 * counts["tweet"] / tweets_per_user

### Visualize

In [None]:
# Grouped bar chart: one group per sentiment class, one bar per person.
sns.set_theme(style="whitegrid")

catplot_options = dict(
    kind="bar",
    x="class",
    y="tweet",
    hue="username",
    ci=None,
    palette="tab10",
    alpha=.8,
    height=9,
    aspect=1.4,
    legend_out=False,
)
g = sns.catplot(data=counts, **catplot_options)

# Cosmetics: drop the left spine, label only the y axis, blank legend title.
g.despine(left=True)
g.set_axis_labels("", "% of last 500 tweets")
g.legend.set_title("")
g.set_yticklabels(size=15)
plt.legend(fontsize="x-large", title_fontsize="40")
plt.show()

# Save the figure to disk as a high-resolution image.
g.fig.savefig("plot.png", dpi=600, bbox_inches="tight")