# Capstone Project Notebook

### Generating more negative class labels using Chat GPT 

In [20]:
import os
import time

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

import openai

import seaborn as sns
import matplotlib.pyplot as plt

In [21]:
openai.api_key = os.getenv('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')

In [22]:
# Read in data
df_all = pd.read_csv('../data/reddit_cleaned_slim.csv')
print(df_all.shape)
# df = df.head(9)
#print(df.shape)

(1260374, 18)


### Eliminating rows and columns that are not relevant to investing and stock trading

In [23]:
# Getting frequency counts for 'FolderName'
folder_name_counts = df_all['FolderName'].value_counts()

# Displaying the frequency counts
print(folder_name_counts)

FolderName
wallstreetbets          719899
gme                     268376
stocks                   68524
pennystocks              50712
stockmarket              41388
investing                38162
options                  27965
robinhoodpennystocks     21235
robinhood                17564
finance                   6549
Name: count, dtype: int64


In [24]:
# Filter out rows where 'link_flair_text' is irrelevant

# List of irrelevant flair texts
irrelevant_flairs = ['Credit', 'Taxes', 'Other', 'Housing', 'Retirement', 'Planning', 'Saving', 'Debt', 'Auto', 'Employment', 'Insurance', "Budgeting", "Advice", 'Advice Request']

# Filter out rows where 'link_flair_text' is in the list of irrelevant flairs
df_all = df_all[~df_all['link_flair_text'].isin(irrelevant_flairs)].copy()
print(df_all.shape)

(1246161, 18)


In [25]:
# Convert 'title' column to string
df_all['title'] = df_all['title'].astype(str)

# Filter out rows where 'title' starts with 'Daily '
df_all = df_all[~df_all['title'].str.startswith('Daily ')].copy()
print(df_all.shape)

(1245087, 18)


### Optional filters (untoggle as needed)

In [275]:
# Filter out rows where 'selftext' is '[removed]'
df_all = df_all[df_all['selftext'] != '[removed]'].copy()
df_all = df_all[df_all['selftext'] != '[deleted]'].copy()
print(df_all.shape)

(618703, 18)


In [26]:
# Convert 'date' column to datetime
df_all['date'] = pd.to_datetime(df_all['date'])

In [27]:
# Filter rows where 'date' is before April 1, 2021
df_all = df_all[df_all['date'] > '2021-04-01'].copy()
print(df_all.shape)

(489325, 18)


In [261]:
# Select rows containing "GME", "Gamestop", or "$GME" in the 'Text' column
#df = df[df['Text'].str.contains(r'GME|Gamestop|\$GME', case=False, regex=True)].copy()

### Only keeping the text fields in a smaller dataframe df

In [28]:
# Selecting only the 'id', 'title', and 'selftext' columns
df = df_all[['id', 'title', 'selftext']].copy()

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 489325 entries, 46 to 1048980
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        489325 non-null  object
 1   title     489325 non-null  object
 2   selftext  296880 non-null  object
dtypes: object(3)
memory usage: 14.9+ MB


In [30]:
df.sample(5)

Unnamed: 0,id,title,selftext
473088,rcykn5,Why invest in Liberty Oilfield Services?,[removed]
401438,qfmagt,Blackberry shares thoughts?,[removed]
792572,nuhd4n,To the moon!!,
197229,mx0jcm,It’s going to happen,
990785,mkbmic,Easter & Egg Hunting with the Fam was fun.. Ok...,


In [31]:
# Concatenating 'title' and 'selftext' with a comma separator
df['Text'] = df.apply(lambda row: f"{row['title']}, {row['selftext']}", axis=1)

# Displaying the first few rows of the new concatenated column
print(df[['id', 'Text']].head())


         id                                               Text
46   msblc3  GME YOLO update — Apr 16 2021 — final update, nan
69   pu7l07  What Are Your Moves Tomorrow, September 24, 20...
87   mqp6lv  COIN IPO Megathread 4/14/2021, This is a megat...
155  pk8tne  For anyone who doesn’t understand why Hedgefon...
158  nuxr2t  r/GME Megathread for Tuesday - June 08, 2021, ...


### Eliminating lengthy messages (too expensive and lengthy for chat GPT processing)

In [32]:
from transformers import GPT2Tokenizer

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Function to estimate tokens
def estimate_tokens(text):
    return len(tokenizer.encode(text))

# Applying the function to your dataframe
df['token_count'] = df['Text'].apply(estimate_tokens)

Token indices sequence length is longer than the specified maximum sequence length for this model (1187 > 1024). Running this sequence through the model will result in indexing errors


In [33]:
long_rows = df[df['token_count'] > 300]
long_rows.sample(5)

Unnamed: 0,id,title,selftext,Text,token_count
23441,nlhcci,Accounting 101 - Part 1: The Income Statement,"Hey everyone, here's the first part to a serie...","Accounting 101 - Part 1: The Income Statement,...",3035
378880,mr9gvc,"4/16 Option Scraper Data: Total Calls, ITM Cal...",Total Calls:\n\n&#x200B;\n\nhttps://preview.re...,"4/16 Option Scraper Data: Total Calls, ITM Cal...",334
144692,os3ca6,Newbie trade -- what do you think?,I own about 350 shares of $WFC. Because of all...,"Newbie trade -- what do you think?, I own abou...",350
1037366,mw7gvc,$ZKIN. Another one to the moon?,Since the last 15m outlook the price of ZKIN g...,"$ZKIN. Another one to the moon?, Since the las...",360
163123,nv2atm,What's the best way to simulate leveraged buy ...,"Hi all, I've learned a lot from this sub so th...",What's the best way to simulate leveraged buy ...,461


In [34]:
short_rows = df[df['token_count'] < 20]
short_rows.sample(5)

Unnamed: 0,id,title,selftext,Text,token_count
93073,o5bjl4,Dave Lauer Speaking The Truth,,"Dave Lauer Speaking The Truth, nan",8
367092,pjfr63,I want everyone’s honest opinions on CLOV,[removed],"I want everyone’s honest opinions on CLOV, [re...",16
720139,opqoz8,Thinking,[removed],"Thinking, [removed]",7
7706,ppb0hn,"Probably should have asked in the daily, but i...",,"Probably should have asked in the daily, but i...",18
350772,nrha48,"In 2021, WSB had some fun 🚀",,"In 2021, WSB had some fun 🚀, nan",13


In [287]:
# Dropping rows where token_count is greater than 400
df = df[df['token_count'] <= 400].reset_index(drop=True)
print(df.shape)

(241354, 5)


In [288]:
# Dropping rows where token_count is smaller than 20
df = df[df['token_count'] > 20].reset_index(drop=True)
print(df.shape)

(111183, 5)


In [289]:
token_total = df['token_count'].agg("sum")
print(token_total)

9751341


In [290]:
token_average = df['token_count'].agg("median")
print(token_average)

47.0


In [291]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111183 entries, 0 to 111182
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           111183 non-null  object
 1   title        111183 non-null  object
 2   selftext     47707 non-null   object
 3   Text         111183 non-null  object
 4   token_count  111183 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 4.2+ MB


In [224]:
#df.drop(df.index[0:3], inplace=True)

In [292]:
df.shape

(111183, 5)

In [None]:
# Function to send text to GPT-4 and receive sentiment tag (you need to implement this)
openai.api_key = "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"

def ask_gpt(text):
    try:
        # Sending the prompt to the ChatGPT model and getting the response
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  # Adjust the model name as needed
            messages=[{"role": "system", "content": "You are a helpful assistant."}, 
                      {"role": "user", "content": text}]
        )
        # Returning the text of the response
        return response.choices[0].message['content'].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error"

def throttled_request(text, delay=1.0):
    response = ask_gpt(text)
    time.sleep(delay)
    return response

# Function to process a batch and append results to a CSV file
def process_batch(df, batch_size, csv_file):
    start = 0
    while start < len(df):
        end = min(start + batch_size, len(df))
        batch = df.iloc[start:end].copy()  # Use .copy() to avoid SettingWithCopyWarning
        batch['Sentiment_Tag'] = batch['Text'].apply(lambda text: throttled_request(f"""You will be presented with a Reddit message and your job is to provide in return a sentiment tag: choose either the “Positive” OR “Negative” OR “Unclear” tag but NOT several). 
Assess the sentiment of a Reddit message concerning shares, identified by company names or stock tickers. 
Focus exclusively on sentiments about the shares' performance, not the overall business performance. Evaluate the entire post. 
Classify the sentiment as 'Positive' for actions like buying calls, holding call options, or selling put options; 
'Negative' for buying puts, holding put options, or selling call options; 
and 'Unclear' for neutral, mixed, conflicting sentiments, or unclear mentions. 
Return back a tag with a sentiment concerning the shares. Choose tags ONLY from the list of tags provided above. 
Provide in response just a tag without an explanation.
Message: {text}"""))
        # Append batch results to CSV file
        if start == 0:
            batch.to_csv(csv_file, mode='w', header=True, index=False)  # Write mode with header for the first batch
        else:
            batch.to_csv(csv_file, mode='a', header=False, index=False)  # Append mode without header for subsequent batches
        
        start += batch_size
        print(f"Processed batch {start // batch_size} of {len(df) // batch_size}")

# Parameters
batch_size = 3  # Adjust this based on your needs
csv_file = '../data/wsb_sentiment_results.csv'

# Process the DataFrame in batches
process_batch(df, batch_size, csv_file)

print("Processing complete.")

### Re-checking the chat GPT rating by sending a portion of the negative ratings again back to chat GPT, for a repeat rating 

In [314]:
# Read in data
df_check = pd.read_csv('wsb_sentiment_results.csv')
print(df_check.shape)
# df = df.head(9)
#print(df.shape)

(1602, 6)


As a resut of partial processing of the Reddit unlabelled messages with Chat GPT 3.5 API I have accumulated 1602 rows.

In [315]:
# Set maximum column width to None to display full text content
pd.set_option('display.max_colwidth', None)
df_check.head()

Unnamed: 0,id,title,selftext,Text,token_count,Sentiment_Tag
0,mqp6lv,COIN IPO Megathread 4/14/2021,"This is a megathread for the $COIN IPO. We will allow discussion of the COIN stock, but do note that the no crypto rule still applies. If you use COIN as a proxy to discuss crypto, instead of discussing the stock, you will be banned. It’s a bit of a gray line, so ensure that you keep discussion focused on the stock.\n\n\nEdit: Direct Listing*","COIN IPO Megathread 4/14/2021, This is a megathread for the $COIN IPO. We will allow discussion of the COIN stock, but do note that the no crypto rule still applies. If you use COIN as a proxy to discuss crypto, instead of discussing the stock, you will be banned. It’s a bit of a gray line, so ensure that you keep discussion focused on the stock.\n\n\nEdit: Direct Listing*",103,Unclear
1,pk8tne,"For anyone who doesn’t understand why Hedgefonds lost, this ape explained it well.",,"For anyone who doesn’t understand why Hedgefonds lost, this ape explained it well., nan",21,Unclear
2,oxwuiw,It absolutely blows my mind that so many people and especially people on this sub are STILL using Robinhood despite what happened with the whole gamestop fiasco.,"There's still so many people using Robinhood after all the GME crap. And what really blows my mind is that I see people here on this very sub using Robinhood everyday? It absolutely baffles the living shit out of me as to why you are still using something that screwed you over, especially knowing very well that it could happen again and screw you over AGAIN. I don't want to see anyone who gets screwed over by Robinhood yet again complaining about it. You asked for it buddy.\n\nI know you stock market types only care about profit but godamn what kind of abusive relationship is this. This is mainly why I switched to the other market and projects like Merrymen who aim to undo this shit. What kind of abusive relationship do you people have with Robinhood. They give you a slick UI and you forget about all the wrongs they did to you? bruh.","It absolutely blows my mind that so many people and especially people on this sub are STILL using Robinhood despite what happened with the whole gamestop fiasco., There's still so many people using Robinhood after all the GME crap. And what really blows my mind is that I see people here on this very sub using Robinhood everyday? It absolutely baffles the living shit out of me as to why you are still using something that screwed you over, especially knowing very well that it could happen again and screw you over AGAIN. I don't want to see anyone who gets screwed over by Robinhood yet again complaining about it. You asked for it buddy.\n\nI know you stock market types only care about profit but godamn what kind of abusive relationship is this. This is mainly why I switched to the other market and projects like Merrymen who aim to undo this shit. What kind of abusive relationship do you people have with Robinhood. They give you a slick UI and you forget about all the wrongs they did to you? bruh.",214,Negative
3,mjy92g,"i'm about to YOLO my $800k life savings on starbucks gift cards, what are the tax implications ??","hey wsb i'm going to invest my life savings in starbucks gift cards cause i think the dollar is going to go down, i plan to sell them in a couple years and make an absolute killing\n\nwhat are the tax implications of doing this??\n\nwhat kind of investment vehicle are starbucks gift cards anyway? my polyamorous girlfriend says that they're most similar to bearer bonds, which makes sense; does that tie their value to starbucks' capitalization?","i'm about to YOLO my $800k life savings on starbucks gift cards, what are the tax implications ??, hey wsb i'm going to invest my life savings in starbucks gift cards cause i think the dollar is going to go down, i plan to sell them in a couple years and make an absolute killing\n\nwhat are the tax implications of doing this??\n\nwhat kind of investment vehicle are starbucks gift cards anyway? my polyamorous girlfriend says that they're most similar to bearer bonds, which makes sense; does that tie their value to starbucks' capitalization?",123,Unclear
4,p72imy,Ruined My Financial Future… This is 99% of my savings and basically everything that me and many family own… Need Baba back up to 243 to break even…,,"Ruined My Financial Future… This is 99% of my savings and basically everything that me and many family own… Need Baba back up to 243 to break even…, nan",36,Negative


In [316]:
# Getting frequency counts for 'Sentiment_Tag'
tag_counts = df_check['Sentiment_Tag'].value_counts()

# Displaying the frequency counts
print(tag_counts)

Sentiment_Tag
Unclear     565
Positive    484
Negative    464
Error        87
Neutral       2
Name: count, dtype: int64


In [317]:
df_check = df_check[df_check['Sentiment_Tag'] != 'Error'].copy()
df_check = df_check[df_check['Sentiment_Tag'] != 'Unclear'].copy()
df_check = df_check[df_check['Sentiment_Tag'] != 'Positive'].copy()
df_check = df_check[df_check['Sentiment_Tag'] != 'Neutral'].copy()
df_check.shape


(464, 6)

We have 464 negative ratings, or 29% of the data given to Chat GPT. 

When I review the quality of the GPT 3.5 ratings, I notice that while many of them are correct, not all of them are reasonable.  Some ratings I disagree with. In order to increae my confidence in having good quality ratings, I decided to screen them again, using a different GPT model. 

In the code below I am re-checking the negative ratings from the previous setp with Chat GPT-4, a more advanced model (also more expensive to process tokens). I essentially treat Chat GPT-4 as a second set of eyes (another human) to confirm the ratings given to me by Chat GPT 3.5 model.  I record the results of GPT-4 model processing in a new column Sentiment_Tag_2, in batches, in a new csv file. 

For my negative ratings upsampling I will select only those that were rated as "negative" by both models. 

In [243]:
# Function to send text to GPT-4 and receive sentiment tag (you need to implement this)
openai.api_key = "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"

def ask_gpt(text):
    try:
        # Sending the prompt to the ChatGPT model and getting the response
        response = openai.ChatCompletion.create(
            model="gpt-4",  # Adjust the model name as needed
            messages=[{"role": "system", "content": "You are a helpful assistant."}, 
                      {"role": "user", "content": text}]
        )
        # Returning the text of the response
        return response.choices[0].message['content'].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error"

def throttled_request(text, delay=5.0):
    response = ask_gpt(text)
    time.sleep(delay)
    return response

# Function to process a batch and append results to a CSV file
def process_batch(df, batch_size, csv_file):
    start = 0
    while start < len(df):
        end = min(start + batch_size, len(df))
        batch = df.iloc[start:end].copy()  # Use .copy() to avoid SettingWithCopyWarning
        batch['Sentiment_Tag_2'] = batch['Text'].apply(lambda text: throttled_request(f"""You will be presented with a Reddit message and your job is to provide in return a sentiment tag: choose either the “Positive” OR “Negative” OR “Unclear” tag but NOT several). 
Assess the sentiment of a Reddit message concerning shares, identified by company names or stock tickers. 
Focus exclusively on sentiments about the shares' performance, not the overall business performance. Evaluate the entire post. 
Classify the sentiment as 'Positive' for actions like buying calls, holding call options, or selling put options; 
'Negative' for buying puts, holding put options, or selling call options; 
and 'Unclear' for neutral, mixed, conflicting sentiments, or unclear mentions. 
Return back a tag with a sentiment concerning the shares. Choose tags ONLY from the list of tags provided above. 
Provide in response just a tag without an explanation.
Message: {text}"""))
        # Append batch results to CSV file
        if start == 0:
            batch.to_csv(csv_file, mode='w', header=True, index=False)  # Write mode with header for the first batch
        else:
            batch.to_csv(csv_file, mode='a', header=False, index=False)  # Append mode without header for subsequent batches
        
        start += batch_size
        print(f"Processed batch {start // batch_size} of {len(df) // batch_size}")

# Parameters
batch_size = 3  # Adjust this based on your needs
csv_file = '../data/wsb_sentiment_results_check.csv'

# Process the DataFrame in batches
process_batch(df_check, batch_size, csv_file)

print("Processing complete.")

Processed batch 1 of 99
Processed batch 2 of 99
Processed batch 3 of 99
Processed batch 4 of 99
Processed batch 5 of 99
Processed batch 6 of 99
Processed batch 7 of 99
Processed batch 8 of 99
Processed batch 9 of 99
Processed batch 10 of 99
Processed batch 11 of 99
Processed batch 12 of 99
Processed batch 13 of 99
Processed batch 14 of 99
Processed batch 15 of 99
Processed batch 16 of 99
Processed batch 17 of 99
Processed batch 18 of 99
Processed batch 19 of 99
Processed batch 20 of 99
Processed batch 21 of 99
Processed batch 22 of 99
Processed batch 23 of 99
Processed batch 24 of 99
Processed batch 25 of 99
Processed batch 26 of 99
Processed batch 27 of 99
Processed batch 28 of 99
Processed batch 29 of 99
Processed batch 30 of 99
Processed batch 31 of 99
Processed batch 32 of 99
Processed batch 33 of 99
Processed batch 34 of 99
Processed batch 35 of 99
Processed batch 36 of 99
Processed batch 37 of 99
Processed batch 38 of 99
Processed batch 39 of 99
Processed batch 40 of 99
Processed

### Re-checking the chat GPT-4 ratings and comparing two GPT model outputs

In [14]:
# Read in data
df_check_gpt4 = pd.read_csv('../data/wsb_sentiment_results_check.csv')
print(df_check_gpt4.shape)
# df = df.head(9)
#print(df.shape)

(464, 7)


In [15]:
df_check_gpt4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 464 entries, 0 to 463
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               464 non-null    object
 1   title            464 non-null    object
 2   selftext         337 non-null    object
 3   Text             464 non-null    object
 4   token_count      464 non-null    int64 
 5   Sentiment_Tag    464 non-null    object
 6   Sentiment_Tag_2  464 non-null    object
dtypes: int64(1), object(6)
memory usage: 25.5+ KB


Having a look at the new ratings (see new column Sentriment_Tag_2),  anc comparing the output of two GPT models:

In [16]:
df_check_gpt4.sample(5)


Unnamed: 0,id,title,selftext,Text,token_count,Sentiment_Tag,Sentiment_Tag_2
56,njizhs,When did this Subreddit become therapy?,I swear if I see one more “I invested in GME b...,"When did this Subreddit become therapy?, I swe...",136,Negative,Unclear
294,r3i8oq,Be prepared for a drawdown in pie stocks,Pi will be the next COVID variant. When the p...,"Be prepared for a drawdown in pie stocks, Pi w...",184,Negative,Negative
376,loiz1r,Buying long term puts on Netflix.,After having access to Netflix ( NFLX ) in Ame...,"Buying long term puts on Netflix., After havin...",170,Negative,Negative
348,rs41ko,RobinHood - Sell jan23 puts?,A lot of hedge funds have jumped into Robinhoo...,"RobinHood - Sell jan23 puts?, A lot of hedge f...",62,Negative,Positive
139,qthc16,Who's Shorting Rivian this week?,It's now well know that Rivian is going to hit...,"Who's Shorting Rivian this week?, It's now wel...",128,Negative,Negative


In [17]:
# Getting frequency counts for 'Sentiment_Tag'
tag_counts = df_check_gpt4['Sentiment_Tag_2'].value_counts()

# Displaying the frequency counts
print(tag_counts)
df_check_gpt4.shape

Sentiment_Tag_2
Negative    342
Unclear      87
Positive     35
Name: count, dtype: int64


(464, 7)

342 rows weer rated as "negative" by both GPT models.  This is 74% of the starting number of rowsrated as negative by Chat GPT 3.5 (464 rows).  In 8% of cases Chat GPT 4 disagreed with the negative rating given by Chat GPT 3.5 and assigned "positive" rating to the text. 

**Conclusion**:  I will treat those instances where both Chat GPT models agree that the rating was "negative" as the additional "ground truth" for my model training.  This will be very helpful in training my models as this will help to address the deficit of negative ratings in the labelled dataset. 

In [18]:
# Deleting rows where two GPT models disagree with each other. Keeping only those rows where both ratings are "negative"
df_check_gpt4 = df_check_gpt4[df_check_gpt4['Sentiment_Tag_2'] != 'Unclear'].copy()
df_check_gpt4 = df_check_gpt4[df_check_gpt4['Sentiment_Tag_2'] != 'Positive'].copy()
df_check_gpt4.shape

(342, 7)

In [19]:
# Save the DataFrame to a CSV file
df_check_gpt4.to_csv('../data/reddit_gpt4_negative_only.csv', index=False)