In [6]:
import praw

In [7]:
from dotenv import load_dotenv
import os

# Load variables from the dotenv file into the process environment
# before reading them below.
load_dotenv()

# Reddit API settings, read in one pass; a name that is not set in the
# environment is left as None.
CLIENT_ID, CLIENT_SECRET, USER_AGENT, USERNAME, PASSWORD = (
    os.environ.get(env_name)
    for env_name in (
        "R_CLIENT_ID",
        "R_CLIENT_SECRET",
        "R_USER_AGENT",
        "R_USERNAME",
        "R_PASSWORD",
    )
)

In [8]:
# Build the PRAW client from the settings read from the environment.
reddit_instance = praw.Reddit(
    # Identifies this script to the Reddit API.
    user_agent=USER_AGENT,
    # Application credentials.
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    # Account credentials.
    username=USERNAME,
    password=PASSWORD,
)

In [9]:
# Handle for the subreddit whose posts are collected in the cells below.
subreddit = reddit_instance.subreddit('portugal2')
# Bare expression: the notebook displays the object's repr as the cell output.
subreddit

Subreddit(display_name='portugal2')

In [11]:
import pandas as pd
from datetime import datetime, timezone

# Top posts of the past year, requesting at most 1000 items.
top_posts = subreddit.top(limit=1000, time_filter="year")

# Collect one plain dict per post and build the DataFrame once at the end.
# The list has its own name so `posts_df` only ever refers to a DataFrame.
post_records = []
for post in top_posts:
    # Convert the Unix timestamp to a readable date (DD-MM-YYYY, in UTC).
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware conversion below formats to the same string.
    post_date = datetime.fromtimestamp(
        post.created_utc, tz=timezone.utc
    ).strftime('%d-%m-%Y')

    # Add this post's data to the list.
    post_records.append({
        'Title': post.title,
        'Content': post.selftext,
        'Number of Comments': post.num_comments,
        'Created': post_date  # Creation date
    })

# Pin the column set explicitly so that an empty listing still produces a
# frame with the expected columns (later cells index posts_df['Content']).
posts_df = pd.DataFrame(
    post_records,
    columns=['Title', 'Content', 'Number of Comments', 'Created'],
)
posts_df.to_excel('reddit.xlsx', index=False)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Portuguese stopwords, built once as a set so the membership test inside
# clean_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('portuguese'))


def clean_text(text):
    """Lowercase ``text``, strip digits and drop Portuguese stopwords.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces. Tokens that are not
        in ``stop_words`` (punctuation included) are kept.
    """
    # Lowercase first so the stopword comparison below is case-insensitive.
    text = text.lower()

    # Remove every run of digits.
    text = re.sub(r'\d+', '', text)

    # Tokenize with the Portuguese Punkt model: word_tokenize defaults to
    # English, which does not match the language of the posts or of the
    # stopword list above.
    words = word_tokenize(text, language='portuguese')

    # Drop stopwords.
    words_filtered = [word for word in words if word not in stop_words]

    # Join the surviving tokens back into a single string.
    return ' '.join(words_filtered)

# Clean every entry of the 'Content' column, coercing each value to str
# before it is passed to clean_text.
posts_df['Content'] = posts_df['Content'].map(lambda raw: clean_text(str(raw)))