In [2]:
#TODO: load the base model using tensorflow

# Imports for loading the dataset, the base model, the tokenizer and the generation config
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig


In [4]:

# Hugging Face checkpoint names: `model_name` is the one loaded below,
# `xl_model_name` is kept for the (currently disabled) larger variant.
model_name = 'google/flan-t5-base'
xl_model_name = 'google/flan-t5-xl'

# Seq2seq model used for the prompting experiments further down.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# `batched` is an argument of `datasets.Dataset.map`, not of the tokenizer
# loader, so it is no longer passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Larger checkpoint, disabled for now:
#model_xl = AutoModelForSeq2SeqLM.from_pretrained(xl_model_name)
#tokenizer_xl = AutoTokenizer.from_pretrained(xl_model_name, use_fast=True)

In [8]:
import pandas as pd
import csv

# File names of the Reddit CSV data; only the comments file is read in this cell.
submissions_filename = "reddit_submissions_data.csv"
comments_filename = "reddit_comments_data.csv"
comments_df = pd.read_csv(f"../redditanalysis/{comments_filename}")

In [9]:
# Preview the first five rows of the loaded comments
comments_df.head(5)

Unnamed: 0,ID,Comment,Unixtime,Upvotes,SubmissionID
0,kwdnu7z,Ethereum [pros](/r/CryptoCurrency/comments/1bm...,1711308000.0,1,1bmshtm
1,kwdqtko,Nobody knows shit,1711309000.0,331,1bmshtm
2,kwdpxif,Ethereum could also top 20k. Or go back to 700...,1711308000.0,549,1bmshtm
3,kwdsan3,Wtf is the point of following this sub every p...,1711309000.0,137,1bmshtm
4,kwdqa7o,Could.,1711309000.0,15,1bmshtm


In [10]:
### Basic cleaning ###

# NOTE(review): an earlier note here said emojis were converted to word
# representations and newlines/tabs removed. None of that happens in this
# cell - presumably it was done when the CSV was produced. TODO confirm.

# drop rows that have a missing value in any column
comments_df = comments_df.dropna(axis=0, how='any')
# remove removed comments
comments_df = comments_df[comments_df["Comment"] != "[removed]"]
# remove admin bot disclaimer comments
# Matched as a plain substring (regex=False): the previous pattern was a
# non-raw string containing invalid escape sequences ("\[", "\("), which
# newer Python versions warn about. The rows matched are the same.
comments_df = comments_df[
    ~comments_df["Comment"].str.contains("[pros](/r/CryptoCurrency/", regex=False)
]

# TODO: "/s" on Reddit marks sarcasm - how to best represent this? Important for sentiment analysis
#comments_df["Comment"].str.split(expand=True).stack().value_counts()
#comments_df["Comment"].str.replace('/s', '*SARCASM*', regex=False)

In [None]:
# Prompt variants trialled so far, kept for reference only.
# The block below is a plain string literal: nothing in it is executed, and
# `text` / `options_` are never interpolated here.
"""
prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
Sentiment options: {options_}.
Output format: [#Cryptocurrency#, sentiment]'''

prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
Sentiment options: {options_}.
Output format: [#Cryptocurrency#, sentiment].
Example output ["Etherium", negative]'''

prompt = f'''\n{text}What is the sentiment of this review?\n{options_}'''

"""

In [39]:
# Work on a small sample (the first 20 comments) to keep inference quick
comment_list = comments_df["Comment"].head(20)

# Zero-shot inference - sentiment only.
# The prompt layout follows the imdb-style sentiment templates from FLAN:
# https://github.com/google-research/FLAN/blob/main/flan/v2/flan_templates_branched.py
options_ = {"positive", "negative", "neutral"}

# Coins and sentiment

# TODO: background/context + question
def _build_prompt(text, get_coins=False):
    """Return the prompt string for a single comment.

    Args:
        text: The raw comment text to embed in the prompt.
        get_coins: When True, build the zero-shot prompt that only asks which
            cryptocurrency the comment is about. When False (default), build
            the one-shot prompt asking for both the coin and the sentiment.

    Returns:
        The prompt as a plain string.

    Notes from earlier experiments with this notebook:
        * zero-shot performed sufficiently well for sentiment extraction
          (qualitative analysis);
        * the zero-shot prompts asking for coin and sentiment together
          performed poorly and did not identify the currency.
    """
    if get_coins:
        # Zero-shot, coin only.
        return (
            f'\n{text}"Which #Cryptocurrency# is this Reddit comment about?\n'
            '    Example: "Etherium"'
        )

    # One-shot, coin + sentiment. The text is spelled out piece by piece so the
    # whitespace that reaches the model is explicit; it mirrors the prompt this
    # function produced before the `get_coins` switch existed.
    return (
        "\n"
        "        \nExample:\n"
        '        "I hate bitcoin and how slow it is."\n'
        '        Output: ["bitcoin", "negative"]\n'
        "        \n"
        f'        \n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?\n'
        "        Output -> ?\n"
        "        "
    )


def tokenize_prompt(data, tokenizer, get_coins=False):
    """Wrap every comment in a prompt and tokenize it.

    Args:
        data: Iterable of comment strings.
        tokenizer: Callable invoked as ``tokenizer(prompt, return_tensors='pt')``.
        get_coins: Forwarded to the prompt builder; True selects the coin-only
            prompt, False (default) the one-shot coin + sentiment prompt.

    Returns:
        A list with one ``[tokenizer_output, original_comment]`` pair per
        comment, in input order.
    """
    return [
        # keep the original comment next to its tokenized prompt
        [tokenizer(_build_prompt(text, get_coins), return_tensors='pt'), text]
        for text in data
    ]


def get_sentiment(model, tokenizer, comment_list, get_coins=False):
    """Run the model on every comment and collect the decoded answers.

    Args:
        model: Object exposing ``generate(input_ids)``.
        tokenizer: Tokenizer used both to encode the prompts and to decode the
            generated ids (``decode(ids, skip_special_tokens=True)``).
        comment_list: Iterable of comment strings.
        get_coins: When True, ask only for the cryptocurrency; when False
            (default), use the one-shot coin + sentiment prompt.

    Returns:
        A list of ``[original_comment, decoded_model_output]`` pairs, in input
        order. Empty input yields an empty list.
    """
    processed_comments = []
    for inputs, text in tokenize_prompt(comment_list, tokenizer, get_coins=get_coins):
        generated = model.generate(inputs['input_ids'])
        # only the first generated sequence is decoded
        output = tokenizer.decode(generated[0], skip_special_tokens=True)
        processed_comments.append([text, output])
    return processed_comments



In [37]:
# Run the prompt pipeline over the sampled comments; each entry is [comment, model output]
comments_sentiment = get_sentiment(
    model=model, tokenizer=tokenizer, comment_list=comment_list
)
comments_sentiment



[['Nobody knows shit', 'negative'],
 ['Ethereum could also top 20k. Or go back to 700$. Or 7k. Or anything else',
  'negative'],
 ['Wtf is the point of following this sub every post is the exact same unfounded prediction.',
  'negative'],
 ['Could.', 'negative'],
 ['It will either go up or down says my crystal ball :crystal_ball: with 100% accuracy',
  'negative'],
 ["tldr; Standard Chartered predicts Ethereum could reach $8,000 by the end of this year and possibly $14,000 by 2025, contingent on the approval of spot Ethereum exchange-traded funds (ETFs) by the SEC. The bank's optimism is partly due to Ethereum's recent upgrade, which is expected to significantly reduce transaction fees, making the network more competitive. Additionally, Standard Chartered also forecasts Bitcoin could hit $150,000 per coin by year-end, following the approval and popularity of Bitcoin ETFs. *This summary is auto generated by a bot and not meant to replace reading the original article. As always, DYOR.",


In [23]:
# Coin extraction only.
# NOTE(review): this call needs a `get_sentiment` that accepts a `get_coins`
# keyword - confirm the definition in scope does before running.
comments_coins_sentiment = get_sentiment(
    model=model, tokenizer=tokenizer, comment_list=comment_list, get_coins=True
)
comments_coins_sentiment

[['Nobody knows shit', 'crypto'],
 ['Ethereum could also top 20k. Or go back to 700$. Or 7k. Or anything else',
  'ethereum'],
 ['Wtf is the point of following this sub every post is the exact same unfounded prediction.',
  '@sad_sad_sad_sad_sa'],
 ['Could.', 'crypto'],
 ['It will either go up or down says my crystal ball :crystal_ball: with 100% accuracy',
  'Cryptocurrency'],
 ["tldr; Standard Chartered predicts Ethereum could reach $8,000 by the end of this year and possibly $14,000 by 2025, contingent on the approval of spot Ethereum exchange-traded funds (ETFs) by the SEC. The bank's optimism is partly due to Ethereum's recent upgrade, which is expected to significantly reduce transaction fees, making the network more competitive. Additionally, Standard Chartered also forecasts Bitcoin could hit $150,000 per coin by year-end, following the approval and popularity of Bitcoin ETFs. *This summary is auto generated by a bot and not meant to replace reading the original article. As alw

In [None]:
# TODO: a different dataset: post text + top level comments
# Model gets post text as context, then a top-level comment