In [12]:
#PACKAGES

import pandas as pd
import yfinance as yf
import talib
import numpy as np
import requests
from io import StringIO
from datetime import date, datetime, timedelta
import praw
import emoji
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt

In [13]:
#DOWNLOADS

# Fetch the NLTK resources used by the cleaning step further down.
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead of
# the older pickled 'punkt' models, so both are requested to keep a fresh
# environment working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word list consumed by stopwords.words('english').
nltk.download('stopwords')
# WordNet data consumed by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pedroalexleite/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pedroalexleite/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pedroalexleite/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
#TICKERS

# Download the S&P 500 constituents table from Wikipedia and build a sorted
# list of ticker symbols.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0"}
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error
# page reach pd.read_html and fail with an unrelated parsing message.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html = response.text
# The first table on the page is the constituents list.
sp500 = pd.read_html(StringIO(html))[0]
# Symbols listed with a dot on Wikipedia are rewritten to their dashed form.
ticker_map = {
    "BF.B": "BF-B",
    "BRK.B": "BRK-B"
}
tickers = sorted(ticker_map.get(t, t) for t in sp500['Symbol'].tolist())

In [16]:
#SCRAPE

def scrape(tickers):
    """Search r/wallstreetbets for each ticker and collect the post bodies.

    Parameters
    ----------
    tickers : sequence of str
        Ticker symbols to search for, one Reddit search per symbol.

    Returns
    -------
    list of list of dict
        One inner list per ticker, in the same order as ``tickers``. Each
        dict has a single ``"text"`` key holding the submission's selftext.
    """
    # NOTE(review): the credentials below are placeholders in this notebook;
    # confirm how real values are supplied before running.
    client = praw.Reddit(
        client_id='...',
        client_secret='...',
        user_agent='...'
    )
    wsb = client.subreddit("wallstreetbets")

    list_posts = []
    for position, ticker in enumerate(tickers, start=1):
        # One-letter symbols are searched only in their "$X" form; longer
        # symbols are searched both bare and with the "$" prefix.
        query = f"${ticker}" if len(ticker) == 1 else f"{ticker} OR ${ticker}"

        results = wsb.search(
            query=query,
            sort="new",
            limit=None,
            syntax="lucene",
            time_filter="year"
        )
        list_posts.append([{"text": submission.selftext} for submission in results])
        print(f"[{position}/{len(tickers)}] Collected posts for {ticker}")

    return list_posts

# Run the scraper over every ticker; posts[i] is the list of raw post dicts
# collected for tickers[i].
posts = scrape(tickers)

[1/503] Collected posts for A
[2/503] Collected posts for AAPL
[3/503] Collected posts for ABBV
[4/503] Collected posts for ABNB
[5/503] Collected posts for ABT
[6/503] Collected posts for ACGL
[7/503] Collected posts for ACN
[8/503] Collected posts for ADBE
[9/503] Collected posts for ADI
[10/503] Collected posts for ADM
[11/503] Collected posts for ADP
[12/503] Collected posts for ADSK
[13/503] Collected posts for AEE
[14/503] Collected posts for AEP
[15/503] Collected posts for AES
[16/503] Collected posts for AFL
[17/503] Collected posts for AIG
[18/503] Collected posts for AIZ
[19/503] Collected posts for AJG
[20/503] Collected posts for AKAM
[21/503] Collected posts for ALB
[22/503] Collected posts for ALGN
[23/503] Collected posts for ALL
[24/503] Collected posts for ALLE
[25/503] Collected posts for AMAT
[26/503] Collected posts for AMCR
[27/503] Collected posts for AMD
[28/503] Collected posts for AME
[29/503] Collected posts for AMGN
[30/503] Collected posts for AMP
[31/503] 

In [17]:
#CLEAN

# Cache for the NLTK objects that clean_post needs; filled on first use.
_CLEAN_POST_CACHE = {}


def _clean_post_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _CLEAN_POST_CACHE:
        # Build both before storing so a failure leaves the cache empty and
        # the next call retries cleanly.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_POST_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_POST_CACHE["stop_words"], _CLEAN_POST_CACHE["lemmatizer"]


def clean_post(post):
    """Normalise one post body into a space-separated string of lemmas.

    Steps, in order: lowercase, strip emoji, strip URLs, drop backslashes,
    replace each newline (plus any digits immediately following it) with a
    space, remove every character that is neither a word character nor
    whitespace, tokenize, drop English stop-words, lemmatize, and collapse
    whitespace.

    Parameters
    ----------
    post : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text; may be empty if nothing survives the filtering.
    """
    #lower
    post = post.lower()

    #excess
    post = emoji.replace_emoji(post, '')
    post = re.sub(r'http\S+|www\S+', '', post)
    # Backslashes are removed before the newline rule on purpose: the order
    # affects what directly follows a newline.
    post = post.replace("\\", "")
    post = re.sub(r'\n\d*', ' ', post)
    # Brackets, quotes and hyphens are already non-word, non-space characters,
    # so this single pass removes them along with all other punctuation.
    post = re.sub(r'[^\w\s]', '', post)

    #stop-words
    # The stop-word set and lemmatizer are shared across calls instead of
    # being rebuilt for every post.
    stop_words, lemmatizer = _clean_post_resources()
    # The text is already lowercase, so tokens are compared as-is.
    words = [word for word in nltk.word_tokenize(post) if word not in stop_words]

    #lemmatization
    post = ' '.join(lemmatizer.lemmatize(word) for word in words)

    #trim
    return re.sub(r'\s+', ' ', post).strip()

def clean_all(posts):
    """Apply ``clean_post`` to every post, preserving the per-ticker nesting.

    Parameters
    ----------
    posts : list of list of dict
        One inner list per ticker; each dict may carry a ``"text"`` entry.

    Returns
    -------
    list of list of dict
        Same outer length as ``posts``. Entries whose text is blank, or
        becomes empty after cleaning, are dropped.
    """
    cleaned_posts = []
    for ticker_posts in posts:
        raw_texts = (entry.get("text", "") for entry in ticker_posts)
        # Skip blank bodies before cleaning, then skip empty results after.
        cleaned_texts = (clean_post(raw) for raw in raw_texts if raw.strip())
        cleaned_posts.append([{"text": cleaned} for cleaned in cleaned_texts if cleaned])

    return cleaned_posts

# Replace the raw posts with their cleaned versions (same per-ticker nesting).
# NOTE(review): this rebinds `posts`, so re-running the cell cleans text that
# has already been cleaned.
posts = clean_all(posts)

In [21]:
#SENTIMENT

def extract_sentiment(post):
    """Return the TextBlob polarity score for one piece of text.

    Parameters
    ----------
    post : str
        Text to score.

    Returns
    -------
    The ``sentiment.polarity`` value TextBlob computes for ``post``.
    """
    return TextBlob(post).sentiment.polarity

def sentiment_all(posts):
    """Score every non-blank post, preserving the per-ticker nesting.

    Parameters
    ----------
    posts : list of list of dict
        One inner list per ticker; each dict may carry a ``"text"`` entry.

    Returns
    -------
    list of list
        One inner list of scores per ticker, in input order. Posts with
        missing or blank text contribute no score.
    """
    all_sentiments = []

    for ticker_posts in posts:
        bodies = (entry.get("text", "") for entry in ticker_posts)
        all_sentiments.append(
            [extract_sentiment(body) for body in bodies if body.strip()]
        )

    return all_sentiments

def sentiment_avg(all_sentiments):
    """Average each ticker's list of scores.

    Parameters
    ----------
    all_sentiments : list of list of numbers
        One list of scores per ticker.

    Returns
    -------
    list
        The arithmetic mean of each inner list, in input order; an empty
        inner list yields 0.
    """
    return [
        sum(scores) / len(scores) if scores else 0
        for scores in all_sentiments
    ]
    
# Score every cleaned post, then collapse each ticker's scores into a single
# average (0 for tickers that have no scored posts).
sentiments = sentiment_all(posts)
avg_sentiments = sentiment_avg(sentiments)

NameError: name 'df' is not defined

In [24]:
#DATAFRAME

# One row per ticker with its average sentiment score.
sentimental = pd.DataFrame({
    "Symbol": tickers,
    "Sentimental": avg_sentiments
})
# A score of 0 is what sentiment_avg emits for tickers without any scored
# posts, so those rows are filled with the mean of the remaining rows.
# The mean is taken over the non-zero rows only: including the placeholder
# zeros would drag the fill value toward 0.
no_signal = sentimental['Sentimental'].eq(0)
if not no_signal.all():
    observed_mean = sentimental.loc[~no_signal, 'Sentimental'].mean()
    sentimental['Sentimental'] = sentimental['Sentimental'].mask(no_signal, observed_mean)

In [25]:
#SAVE

# Write the per-ticker sentiment table to CSV without the index column.
# NOTE(review): the destination is an absolute path under one user's home
# directory; it will need adjusting on any other machine.
sentimental.to_csv("/Users/pedroalexleite/Desktop/Portfolio-Construction-1/Data/sentimental.csv", index=False)