In [2]:
#TODO: load the base model using tensorflow

# Loading the base model, the tokenizer and the config
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig


In [4]:

model_name='google/flan-t5-base'
xl_model_name = 'google/flan-t5-xl'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
#model_xl = AutoModelForSeq2SeqLM.from_pretrained(xl_model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, batched = True)
#tokenizer_xl = AutoTokenizer.from_pretrained(xl_model_name, use_fast=True, batched = True)
#xl_model = AutoModelForSeq2SeqLM.from_pretrained(xl_model_name)

In [8]:
import pandas as pd
import csv

submissions_filename = "reddit_submissions_data.csv"
comments_filename = "reddit_comments_data.csv"
comments_df = pd.read_csv("../redditanalysis/"+comments_filename)

In [9]:
comments_df[:5]

Unnamed: 0,ID,Comment,Unixtime,Upvotes,SubmissionID
0,kwdnu7z,Ethereum [pros](/r/CryptoCurrency/comments/1bm...,1711308000.0,1,1bmshtm
1,kwdqtko,Nobody knows shit,1711309000.0,331,1bmshtm
2,kwdpxif,Ethereum could also top 20k. Or go back to 700...,1711308000.0,549,1bmshtm
3,kwdsan3,Wtf is the point of following this sub every p...,1711309000.0,137,1bmshtm
4,kwdqa7o,Could.,1711309000.0,15,1bmshtm


In [10]:
### Basic cleaning ###

# Converted emojis to word representations, removed newlines and tabs

comments_df = comments_df.dropna(axis=0, how='any')
# remove removed comments
comments_df=comments_df[comments_df["Comment"]!="[removed]"]
# remove admin bot disclaimer comments 
comments_df = comments_df[~comments_df["Comment"].str.contains("\[pros\]\(/r/CryptoCurrency/")]

# \s on reddit means sarcasm - how to best represent this? Important for sentiment analysis
#comments_df["Comment"].str.split(expand=True).stack().value_counts()
#comments_df["Comment"].replace('\s', '*SARCASM*')

In [None]:
# Various prompts trialed
"""
prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
Sentiment options: {options_}.
Output format: [#Cryptocurrency#, sentiment]'''

prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
Sentiment options: {options_}.
Output format: [#Cryptocurrency#, sentiment].
Example output ["Etherium", negative]'''

prompt = f'''\n{text}What is the sentiment of this review?\n{options_}'''

"""

In [90]:
# take 20 comments to speed up the process
comment_list = comments_df["Comment"][:20]

# Zero-shot inference

# Sentiment only

# Using the prompting technique from the imdb dataset sentiment type tasks described here https://github.com/google-research/FLAN/blob/main/flan/v2/flan_templates_branched.py
options_ = {"positive", "negative", "neutral"}

# Coins and sentiment

# TODO: background/context + question
def tokenize_prompt(data,  tokenizer, sentiment_only=False):
    dataset = []
    coins_list = ["Bitcoin/BTC", "Etherium/ETH", "Solana/SOL", "Avalanche/AVA", "Other"]
    options_ = {"positive", "negative", "neutral"}
    processed_comments = []
    for text in data:
        #### comment out the desired prompt ###
        
        ###################### ZERO SHOT ################################
        
        # Note: zero-shot performs sufficiently well for sentiment extraction based on qualitative analysis #
        
        """prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about?
    Example: "Etherium"'''"""
    
        # adding the coin options isn't helpful, need to reengineer this prompt structure
        # removing the below bitcoin example from the prompt gives better preformance (otherwise it classifies everything as bitcoin
        """Comment:\n"I hate bitcoin and how slow it is."
        Which #Cryptocurrency# is this Reddit comment about?\nOptions:{coins_list}
        Answer: "Bitcoin/BTC"""
        
        prompt = f'''
        Comment:\n"ETH is the future, no other coin compares."
        Which #Cryptocurrency# is this Reddit comment about?\nOptions:{coins_list}
        Answer: "Etherium/ETH"
        
        Comment:\n"No one knows anything in this market."
        Which #Cryptocurrency# is this Reddit comment about?\nOptions:{coins_list}
        Answer: "Other"
        
        Comment:\n{text}"Which #Cryptocurrency# is this Reddit comment about?\n{coins_list}
        Answer: ?"
        '''
    
        # this prompt performs poorly, does not identify the currency
        """
        prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
        Sentiment options: {options_}.
        Output format: [#Cryptocurrency#, sentiment]'''
        """
        
        # this prompt performs poorly, does not identify the currency
        
        """prompt = f'''\n{text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
        Sentiment options: {options_}.
        Output format: [#Cryptocurrency#, sentiment].
        Example output ["Etherium", negative]'''"""
        
        if sentiment_only:
            prompt = f'''\n{text}What is the sentiment of this review?\n{options_}'''
    
        
        ###################### FEW SHOT ################################
        
        # Performs poorly on sentiment, tends to extract the coin correctly
        """prompt = f'''
        \nExample 1:
        "I hate bitcoin and how slow it is."
        Output: ["Bitcoin", "negative"]
        
        Example 2:
        "Top crypto analyst predicts Etherium can go as high as USD 10000 in 2025."
        Output: ["Etherium", "positive"]
        
        Example 3:
        "ETH is the future, I am very bullish on ETH."
        Output: ["ETH", "positive"]
        
        {text}"Which #Cryptocurrency# is this Reddit comment about and what is the sentiment?
        Output -> ?
        '''"""
        
        # worst performance so far #
        """prompt = f'''
        Comment:\n"I hate bitcoin and how slow it is."
        What #Cryptocurrency# does this comment mention and what is the sentiment?
        Answer: ["Bitcoin", "negative"]
        
        Comment:\n"Top crypto analyst predicts Etherium can go as high as USD 10000 in 2025."
        What #Cryptocurrency# does this comment mention and what is the sentiment?
        Answer: ["Etherium", "positive"]
        
        Comment:\n"ETH is the future, I am very bullish on ETH."
        What #Cryptocurrency# does this comment mention and what is the sentiment?
        Answer: ["ETH", "positive"]
        
        Comment:\n{text}
        What #Cryptocurrency# does this comment mention and what is the sentiment?
        Answer -> ?
        '''"""
        
        inputs = tokenizer(prompt, return_tensors='pt')
        # save the prompt and the original comment
        dataset.append([inputs, text])
    return dataset

def get_sentiment(model, tokenizer, comment_list, sentiment_only):
    data = tokenize_prompt(comment_list, tokenizer, sentiment_only)
    options_ = {"positive", "negative", "neutral"}
    processed_comments = []
    
    for inputs in data:
        output = tokenizer.decode(
            model.generate(
                inputs[0]['input_ids'], max_new_tokens = 50
                )[0], 
                skip_special_tokens=True
        )
        processed_comments.append([inputs[1], output])
    return processed_comments



In [89]:
comments_sentiment = get_sentiment(model, tokenizer, comment_list, sentiment_only=False)
pd.DataFrame(comments_sentiment)

Unnamed: 0,0,1
0,Nobody knows shit,Bitcoin/BTC
1,Ethereum could also top 20k. Or go back to 700...,Ethereum
2,Wtf is the point of following this sub every p...,Etherium/ETH
3,Could.,Bitcoin/BTC
4,It will either go up or down says my crystal b...,Etherium/ETH
5,tldr; Standard Chartered predicts Ethereum cou...,Ethereum/ETH
6,Source: trust me bro,Etherium/ETH
7,"I think $37k ETH, $272k BTC",Etherium/ETH
8,Coin stock is probably the shining star for cr...,Etherium/ETH
9,Pretty much everyone I know says eth $10k targ...,Bitcoin/BTC


In [23]:
# extract only coin
comments_coins_sentiment = get_sentiment(model, tokenizer, comment_list, get_coins=True)
comments_coins_sentiment

[['Nobody knows shit', 'crypto'],
 ['Ethereum could also top 20k. Or go back to 700$. Or 7k. Or anything else',
  'ethereum'],
 ['Wtf is the point of following this sub every post is the exact same unfounded prediction.',
  '@sad_sad_sad_sad_sa'],
 ['Could.', 'crypto'],
 ['It will either go up or down says my crystal ball :crystal_ball: with 100% accuracy',
  'Cryptocurrency'],
 ["tldr; Standard Chartered predicts Ethereum could reach $8,000 by the end of this year and possibly $14,000 by 2025, contingent on the approval of spot Ethereum exchange-traded funds (ETFs) by the SEC. The bank's optimism is partly due to Ethereum's recent upgrade, which is expected to significantly reduce transaction fees, making the network more competitive. Additionally, Standard Chartered also forecasts Bitcoin could hit $150,000 per coin by year-end, following the approval and popularity of Bitcoin ETFs. *This summary is auto generated by a bot and not meant to replace reading the original article. As alw

In [75]:
# TODO: run zero shot sentiment and zero shot coin extraction separately and merge the two datasets
sentiment = get_sentiment(model, tokenizer, comment_list, sentiment_only=True)
coins = get_sentiment(model, tokenizer, comment_list, sentiment_only=False)

In [76]:
df1=pd.DataFrame(sentiment)
df2=pd.DataFrame(coins).drop(0, axis=1)
result = pd.concat([df1, df2], axis=1, join="inner")
result


Unnamed: 0,0,1,1.1
0,Nobody knows shit,negative,crypto
1,Ethereum could also top 20k. Or go back to 700...,negative,ethereum
2,Wtf is the point of following this sub every p...,negative,@sad_sad_sad_sad_sad_sad_sad_sad_sad_sad_sad_s...
3,Could.,negative,crypto
4,It will either go up or down says my crystal b...,positive,Cryptocurrency
5,tldr; Standard Chartered predicts Ethereum cou...,positive,Ethereum
6,Source: trust me bro,positive,crypto
7,"I think $37k ETH, $272k BTC",positive,ETH
8,Coin stock is probably the shining star for cr...,positive,coin stock
9,Pretty much everyone I know says eth $10k targ...,negative,eth


In [None]:
# TODO: add a list of coins
# TODO: a different dataset: post text + top level comments
# Model gets post text as context, then a top-level comment
# TODO: try the xl model