# Proposta de monitoramento do site Reddit

### O Reddit é um agregador social de notícias e um serviço de social bookmarking. O site é dividido em várias comunidades chamadas "subreddits", e é nesses subreddits que reside o conteúdo do site.

In [36]:
from datetime import datetime
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sqlite3
import pandas as pd

## Função que extrai os dados e retorna uma lista com diversos dicionários

### A função abaixo recebe o HTML da página do subreddit e procura a tag 'main' com todos os 'articles' (que contêm o conteúdo de cada post)

In [2]:
# Função para extrair o título, autor, link e tag dos posts
def extrair_reddit_posts(subreddit: str) -> list:
    """Extract post metadata from the HTML of a subreddit listing page.

    Args:
        subreddit: HTML source of the page. Despite the parameter name, this
            is the markup itself, not the subreddit's name.

    Returns:
        A list with one dict per post found inside ``<main id="main-content">``.
        Each dict has the keys 'Author', 'Title', 'Link', 'Upvotes',
        'Comments', 'Post-type', 'Date' and 'Tag'. 'Date' is a
        ``datetime.date``; 'Tag' is ``None`` when the post has no flair
        element; the remaining values are the raw attribute values of the
        ``<shreddit-post>`` element. The list is empty when the page has no
        main-content container.
    """
    soup = BeautifulSoup(subreddit, 'html5lib')

    # find() returns None when the container is absent (e.g. an error or
    # login page); report "no posts" instead of failing on None.find_all.
    main = soup.find('main', id='main-content')
    if main is None:
        return []

    infos_list = []
    for article in main.find_all('article'):
        # Look the post element up once instead of once per attribute.
        post = article.find('shreddit-post')
        if post is None:
            # Article without post metadata: nothing to extract from it.
            continue

        infos = {}
        infos['Author'] = post.get('author')
        infos['Title'] = post.get('post-title').strip()
        infos['Link'] = f"reddit.com{post.get('permalink')}"
        infos['Upvotes'] = post.get('score')
        infos['Comments'] = post.get('comment-count')
        infos['Post-type'] = post.get('post-type')
        infos['Date'] = datetime.strptime(article.find('time').get('datetime'), "%Y-%m-%dT%H:%M:%S.%fZ").date()
        try:
            infos['Tag'] = article.find('a', class_='no-decoration').find('div').text.strip()
        except AttributeError:
            # Either the flair link or its inner <div> is missing.
            infos['Tag'] = None
        infos_list.append(infos)

    return infos_list

## A função abaixo cria a tabela "subreddits" em um banco de dados SQLite cujo arquivo recebe o nome do subreddit digitado

In [3]:
def create_sql_table(subreddit: str) -> None:
    """Create the ``subreddits`` table in the SQLite file ``<subreddit>.db``.

    The database file is created by ``sqlite3.connect`` if it does not exist,
    and the table is created only if it is not already there, so calling this
    function repeatedly is harmless.

    Args:
        subreddit: Base name (without the ``.db`` suffix) of the database file.
    """
    conexao = sqlite3.connect(f'{subreddit}.db')
    try:
        conexao.execute('''CREATE TABLE IF NOT EXISTS subreddits (
                        ID INTEGER PRIMARY KEY,
                        TÍTULO TEXT,
                        AUTOR TEXT,
                        LINK TEXT,
                        TAG TEXT,
                        UPVOTES INTEGER,
                        COMENTÁRIOS INTEGER,
                        POST_TYPE TEXT,
                        DATA DATE
                    )''')
        conexao.commit()
    finally:
        # Close the connection even when the statement fails, so the file
        # handle is never leaked.
        conexao.close()

## Após criar a tabela na função acima, a função abaixo salva os dados na db
### A função recebe 2 argumentos: um dicionário da lista retornada pela primeira função, contendo os detalhes de um post, e o subreddit, usado para abrir a conexão com o banco de dados correspondente

In [4]:
def save_in_sqlite(content: dict, subreddit: str) -> None:
    """Insert one post into ``<subreddit>.db`` unless its link is already stored.

    Args:
        content: Post data with the keys 'Title', 'Author', 'Link', 'Tag',
            'Upvotes', 'Comments', 'Post-type' and 'Date'.
        subreddit: Base name (without the ``.db`` suffix) of the database
            file; its ``subreddits`` table must already exist.
    """
    # Bind dates as ISO text explicitly: sqlite3's implicit date/datetime
    # adapters are deprecated since Python 3.12. The text produced here is
    # the same those adapters stored.
    data = content['Date']
    if isinstance(data, datetime):
        data = data.isoformat(' ')
    elif hasattr(data, 'isoformat'):
        data = data.isoformat()

    conexao = sqlite3.connect(f'{subreddit}.db')
    try:
        cursor = conexao.cursor()

        # The permalink identifies a post; skip the insert when it is known.
        cursor.execute('''SELECT LINK FROM subreddits WHERE LINK = ?''', (content['Link'],))
        if cursor.fetchone() is None:
            cursor.execute(
                '''INSERT INTO subreddits (TÍTULO, AUTOR, LINK, TAG, UPVOTES, COMENTÁRIOS, POST_TYPE, DATA)
                          VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
                (
                    content['Title'],
                    content['Author'],
                    content['Link'],
                    content['Tag'],
                    content['Upvotes'],
                    content['Comments'],
                    content['Post-type'],
                    data,
                ),
            )
            conexao.commit()
    finally:
        # Close the connection even when a query fails.
        conexao.close()

In [5]:
# Função para rolar até o final da página e carregar mais posts
def scroll_to_bottom(driver, pause: float = 1) -> None:
    """Scroll the browser window to the bottom of the page, then wait.

    Args:
        driver: Object exposing ``execute_script`` (the Selenium WebDriver
            created by the caller).
        pause: Seconds to sleep after scrolling, giving the page time to load
            more posts. Defaults to 1, the previously hard-coded value.
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)

In [24]:
import os
from getpass import getpass

# How many times the page is scrolled to the bottom to load more posts.
N_SCROLLS = 120

sub_reddit = input("Digite o subreddit desejado: ").strip()

url = f'https://www.reddit.com/r/{sub_reddit}/'

# Credentials must never be written in the notebook: read them from the
# environment, falling back to an interactive prompt (the password prompt
# does not echo what is typed).
reddit_user = os.environ.get("REDDIT_USERNAME") or input("Usuário do Reddit: ")
reddit_password = os.environ.get("REDDIT_PASSWORD") or getpass("Senha do Reddit: ")

# Start the Selenium driver
driver = webdriver.Chrome()

try:
    # Open the subreddit page
    driver.get(url)

    # Find the login button and click it
    login_button = driver.find_element(By.ID, "login-button")
    login_button.click()

    # Give the login form some time to load completely
    time.sleep(2)

    # Locate the username and password input fields
    username_field = driver.find_element(By.ID, "login-username")
    password_field = driver.find_element(By.ID, "login-password")

    # Fill in the credentials
    username_field.send_keys(reddit_user)
    password_field.send_keys(reddit_password)

    # Submit the form by pressing Enter in the password field
    password_field.send_keys(Keys.RETURN)

    # Wait for the login popup to go away
    time.sleep(2)

    # Scroll to the bottom of the page repeatedly to load more posts
    for _ in range(N_SCROLLS):
        scroll_to_bottom(driver)
        time.sleep(2)

    # Grab the HTML of the fully scrolled page
    html_content = driver.page_source
finally:
    # Always shut the browser down, even when a step above fails
    driver.quit()

# Parse the posts out of the page HTML
posts = extrair_reddit_posts(html_content)
# Make sure the table exists in the subreddit's database
create_sql_table(sub_reddit)

for post in posts:
    # Store each post dict in the database
    save_in_sqlite(post, sub_reddit)

Digite o subreddit desejado: datasets


The chromedriver version (123.0.6312.105) detected in PATH at C:\Users\Loguera\Desktop\chromedriver-win64\chromedriver.exe might not be compatible with the detected chrome version (124.0.6367.63); currently, chromedriver 124.0.6367.91 is recommended for chrome 124.*, so it is advised to delete the driver in PATH and retry


In [25]:
# Connect to the subreddit's database
conexao = sqlite3.connect(f'{sub_reddit}.db')

try:
    # Load the whole table into a DataFrame indexed by the ID column
    query = "SELECT * FROM subreddits"
    df = pd.read_sql_query(query, conexao, index_col='ID')
finally:
    # Close the connection even when the query fails
    conexao.close()

In [26]:
# Print the DataFrame's shape as a (rows, columns) tuple.
print("Dimensões originais: ", df.shape)

Dimensões originais:  (1000, 8)


In [27]:
# Show the column labels of the DataFrame.
df.columns

Index(['TÍTULO', 'AUTOR', 'LINK', 'TAG', 'UPVOTES', 'COMENTÁRIOS', 'POST_TYPE',
       'DATA'],
      dtype='object')

In [28]:
# Show the dtype pandas assigned to each column.
df.dtypes

TÍTULO         object
AUTOR          object
LINK           object
TAG            object
UPVOTES         int64
COMENTÁRIOS     int64
POST_TYPE      object
DATA           object
dtype: object

In [29]:
# Count the missing values in each column.
df.isna().sum()

TÍTULO         0
AUTOR          0
LINK           0
TAG            2
UPVOTES        0
COMENTÁRIOS    0
POST_TYPE      0
DATA           0
dtype: int64

In [30]:
# Show only the rows that have at least one missing value.
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,TÍTULO,AUTOR,LINK,TAG,UPVOTES,COMENTÁRIOS,POST_TYPE,DATA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
303,Large Language Models for Data Annotation: A S...,cavedave,reddit.com/r/datasets/comments/1ax75og/large_l...,,3,4,link,2024-02-22
744,Bible datasets,cavedave,reddit.com/r/datasets/comments/17pfhgy/bible_d...,,2,0,link,2023-11-06


In [31]:
# List the posts from most to least upvoted.
df.sort_values(by='UPVOTES', ascending=False)

Unnamed: 0_level_0,TÍTULO,AUTOR,LINK,TAG,UPVOTES,COMENTÁRIOS,POST_TYPE,DATA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
148,Why use R instead of Python for data stuff?,Nickaroo321,reddit.com/r/datasets/comments/1bo6b2s/why_use...,question,93,77,text,2024-03-26
231,"I made OMDB, the world's largest downloadable ...",OatsCG,reddit.com/r/datasets/comments/1b9ihqa/i_made_...,dataset,73,13,link,2024-03-08
207,Dateno - a new dataset search engine,ivan-begtin,reddit.com/r/datasets/comments/1bdn4om/dateno_...,request,46,13,text,2024-03-13
947,I built a free tool that auto-generates scrape...,madredditscientist,reddit.com/r/datasets/comments/16nq9n6/i_built...,resource,33,9,multi_media,2023-09-20
155,1-Year of Life Data. What makes me happy?,tsawsum1,reddit.com/r/datasets/comments/1bnjzk5/1year_o...,dataset,29,6,multi_media,2024-03-25
...,...,...,...,...,...,...,...,...
371,Does anybody have access to a dataset of black...,The-White-Furry,reddit.com/r/datasets/comments/1ajpekn/does_an...,request,0,4,text,2024-02-05
811,Crypto currency datasets required for performi...,Varc20,reddit.com/r/datasets/comments/17e6kuu/crypto_...,request,0,4,text,2023-10-22
369,Looking for a political compass questionnaire ...,Play4u,reddit.com/r/datasets/comments/1ak4x78/looking...,request,0,0,text,2024-02-06
367,Exploring the spread of pro-Israel and pro-Pal...,cavedave,reddit.com/r/datasets/comments/1akgkbk/explori...,dataset,0,2,link,2024-02-06


In [13]:
# Keep only the posts whose TAG is exactly 'Project', most upvoted first.
df[df['TAG'] == 'Project'].sort_values(by='UPVOTES', ascending=False)

Unnamed: 0_level_0,TÍTULO,AUTOR,LINK,TAG,UPVOTES,COMENTÁRIOS,POST_TYPE,DATA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
835,[P] How I found 8 bugs in Google's Gemma 6T to...,danielhanchen,reddit.com/r/MachineLearning/comments/1bipsqj/...,Project,469,59,multi_media,2024-03-19
894,[P] Paperlib: An open-source and modern-design...,GeoffreyChen,reddit.com/r/MachineLearning/comments/1bh63c1/...,Project,200,89,multi_media,2024-03-17
544,[P] SWE-agent: an open source coding agent tha...,ofirpress,reddit.com/r/MachineLearning/comments/1btwl37/...,Project,170,21,multi_media,2024-04-02
617,[P] Using ML to Annotate Dental Xrays,Responsible-Win3865,reddit.com/r/MachineLearning/comments/1brbaii/...,Project,133,23,image,2024-03-30
643,[P] Jamba: the first production-grade Mamba-ba...,ghosthamlet,reddit.com/r/MachineLearning/comments/1bqfibp/...,Project,131,18,multi_media,2024-03-29
...,...,...,...,...,...,...,...,...
668,[Project] I need to create a Raster image to V...,GodMan6660,reddit.com/r/MachineLearning/comments/1bqmy0l/...,Project,0,19,text,2024-03-29
669,Code base documentation and testing using LLM [P],Soaccer,reddit.com/r/MachineLearning/comments/1bqdkge/...,Project,0,0,multi_media,2024-03-29
677,[P]I turned Elon Musk's face into a decision b...,lildaemon,reddit.com/r/MachineLearning/comments/1bqkcor/...,Project,0,15,multi_media,2024-03-29
192,[Project] AI powered products in stores,Complete-Holiday-610,reddit.com/r/MachineLearning/comments/1c7q6o1/...,Project,0,2,text,2024-04-19


In [32]:
# Show the untruncated LINK value of the row whose index equals 147.
df[df.index == 147].LINK.values

array(['reddit.com/r/datasets/comments/1boy114/video_dataset_for_abnormal_event_detection_in/'],
      dtype=object)