# Imports

In [2]:
!pip install vaderSentiment
!pip install transformers
!pip install cryptography

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m102.4/126.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import os
import glob
import pandas as pd
from google.colab import drive, userdata
from typing import Protocol
from cryptography.fernet import Fernet
from tqdm.auto import tqdm
from scipy.special import softmax

In [4]:
# Mount Google Drive under /content/gdrive so the input and output folders
# configured below are reachable; force_remount=True asks for a fresh mount
# even if the drive is already mounted in this session.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


# Post Analyzer

In [None]:
# Folder on the mounted Drive that is scanned for the input *.csv files.
gdrive_directory = '/content/gdrive/MyDrive/.../'
# Folder on the mounted Drive where save_file writes the merged result CSVs.
output_directory = '/content/gdrive/MyDrive/.../'
# Fernet cipher used to decrypt the stored post text. The key is read from the
# Colab secret 'TYPEKINDLY_KEY' instead of being hard-coded in the notebook.
cipher = Fernet(userdata.get('TYPEKINDLY_KEY'))

In [6]:
# Register tqdm with pandas so DataFrame.progress_apply (used when scoring the
# posts) is available.
tqdm.pandas()

## Models

In [7]:
def save_file(filename, dfs: list[pd.DataFrame]):
    """Merge result frames side by side and write them out as a single CSV.

    Args:
        filename: Base name of the output file, without the ``.csv`` extension.
        dfs: Frames to concatenate column-wise. When a column label appears
            more than once in the merged frame, only its first occurrence is
            kept.

    The file is created inside the module-level ``output_directory``,
    overwriting any existing file, and is written without the index column.
    """
    destination = os.path.join(output_directory, f'{filename}.csv')
    print(f"Saving file {destination}")
    merged = pd.concat(dfs, axis=1)
    # Boolean mask that is True for the first occurrence of every column label.
    first_occurrence = ~merged.columns.duplicated()
    merged.loc[:, first_occurrence].to_csv(destination, mode='w', index=False)

In [8]:
class SentimentAnalyzerModel():
    """Base class for sentiment models that score encrypted posts stored in CSVs.

    Subclasses set ``self.name`` (used as the prefix of the output columns)
    and override ``get_sentiment``.
    """

    def __init__(self):
        self.name = ""

    @staticmethod
    def _dataset_name(csv_file: str) -> str:
        """Return the file name of ``csv_file`` without its final extension."""
        # splitext removes only the last extension, so "posts.v1.csv" and
        # "posts.v2.csv" keep distinct names instead of both becoming "posts".
        return os.path.splitext(os.path.basename(csv_file))[0]

    def interpret_csv_results(self):
        """Score every ``*.csv`` file found in ``gdrive_directory``.

        For each file, rows containing missing values are dropped, every
        remaining row is passed to ``get_sentiment``, and the result is stored
        in four ``<name>_...`` columns. The ``post_text``,
        ``translated_post_text`` and ``poster`` columns are removed from the
        returned frames.

        Returns:
            dict mapping each file's base name to its scored DataFrame.
        """
        score_columns = [
            f'{self.name}_sentiment',
            f'{self.name}_negative_score',
            f'{self.name}_neutral_score',
            f'{self.name}_positive_score',
        ]
        # glob returns files in arbitrary order; sort for a repeatable run.
        csv_files = sorted(glob.glob(os.path.join(gdrive_directory, '*.csv')))
        saved_dfs: dict[str, pd.DataFrame] = {}
        with tqdm(total=len(csv_files), desc="CSV files to interpret") as csv_pbar:
            for csv_file in csv_files:
                df = pd.read_csv(csv_file).dropna()
                if df.empty:
                    # With no rows there are no per-row results to expand into
                    # the score columns, so create them empty instead.
                    df[score_columns[0]] = pd.Series(dtype="object")
                    for column in score_columns[1:]:
                        df[column] = pd.Series(dtype="float64")
                else:
                    df[score_columns] = df.progress_apply(
                        self.get_sentiment, axis=1, result_type="expand"
                    )
                df = df.drop(['post_text', 'translated_post_text', 'poster'], axis=1)
                saved_dfs[self._dataset_name(csv_file)] = df
                csv_pbar.update(1)
        return saved_dfs

    def decrypt_post_text(self, text: str) -> str:
        """Decrypt a Fernet token string with the module-level ``cipher``."""
        return cipher.decrypt(text.encode()).decode()

    def get_sentiment(self, row: pd.Series) -> tuple[str, float, float, float]:
        """Return ``(label, negative, neutral, positive)`` for one CSV row.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must override get_sentiment"
        )


In [9]:
class VaderModel(SentimentAnalyzerModel):
    """Sentiment model backed by VADER's ``SentimentIntensityAnalyzer``."""

    def __init__(self):
        self.name = "vader"
        self.sid = SentimentIntensityAnalyzer()

    def examine_compound(self, compound_score):
        """Turn a compound score into a label using the +/-0.05 cut-offs.

        Returns "positive" at or above 0.05, "negative" at or below -0.05 and
        "neutral" strictly in between. Falls through (returning None) when no
        comparison holds.
        """
        if compound_score >= 0.05:
            return "positive"
        if -0.05 < compound_score < 0.05:
            return "neutral"
        if compound_score <= -0.05:
            return "negative"

    def get_sentiment(self, row: pd.Series) -> tuple[str, float, float, float]:
        """Decrypt the row's ``post_text`` and score it with VADER.

        Returns:
            ``(label, neg, neu, pos)`` where the label is derived from the
            compound score and the other three are VADER's polarity scores.
        """
        plaintext = self.decrypt_post_text(row['post_text'])
        scores = self.sid.polarity_scores(plaintext)
        label = self.examine_compound(scores['compound'])
        return (label, scores['neg'], scores['neu'], scores['pos'])

In [10]:
class XLMRobertaModel(SentimentAnalyzerModel):
    """Sentiment model backed by the ``twitter-xlm-roberta-base-sentiment`` checkpoint."""

    # Longest token sequence passed to the model; longer posts are truncated.
    # NOTE(review): 512 is the usual XLM-R input limit - confirm for this checkpoint.
    MAX_TOKENS = 512

    def __init__(self):
        self.MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.MODEL)
        self.name = "xlmroberta"
        # Build the pipeline from the objects loaded above so the checkpoint is
        # not loaded a second time.
        self.sentiment_pipeline = pipeline(
            "sentiment-analysis", model=self.model, tokenizer=self.tokenizer
        )
        # Label for each position of the model's output scores.
        self.labels = ['negative', 'neutral', 'positive']

    def get_sentiment(self, row: pd.Series) -> tuple[str, float, float, float]:
        """Decrypt the row's ``post_text`` and classify it with the model.

        Returns:
            ``(label, negative, neutral, positive)`` where the three scores are
            the softmax of the model output and the label is the highest one.
        """
        text = self.decrypt_post_text(row['post_text'])
        # Without truncation an over-long post would exceed the model's input
        # limit and fail inside the forward pass.
        encoded_text = self.tokenizer(
            text, return_tensors='pt', truncation=True, max_length=self.MAX_TOKENS
        )
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output = self.model(**encoded_text)
        scores = softmax(output[0][0].detach().numpy())
        max_score = scores.argmax()
        return (self.labels[max_score], scores[0], scores[1], scores[2])


## Analyze the posts now

In [11]:
# One instance of every sentiment model to run. The elements are
# SentimentAnalyzerModel subclasses (not VADER's SentimentIntensityAnalyzer).
models: list[SentimentAnalyzerModel] = [VaderModel(), XLMRobertaModel()]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Device set to use cpu


In [12]:
def run_all_models(models: "list[SentimentAnalyzerModel]"):
    """Run every model over the input CSVs and save one merged CSV per file.

    Args:
        models: Objects exposing ``interpret_csv_results()``, which returns a
            dict mapping a file's base name to that model's result frame.

    Results that different models produce for the same file name are collected
    in model order and passed to ``save_file`` together.
    """
    all_dfs: dict[str, list[pd.DataFrame]] = {}
    for model in models:
        for filename, df_output in model.interpret_csv_results().items():
            all_dfs.setdefault(filename, []).append(df_output)

    for filename, dfs in all_dfs.items():
        save_file(filename, dfs)

In [13]:
# Score the input CSVs with every model in `models` and write the merged
# results to the output folder.
run_all_models(models)

CSV files to interpret:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

CSV files to interpret:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

Saving file /content/gdrive/MyDrive/CS192/final-results/00_1.csv
