In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [45]:
class RedditData:
    """Load a Reddit scraper CSV export and split it by ``dataType``.

    The export mixes several record kinds in one table, distinguished by the
    ``dataType`` column. On construction the raw table is read, split into one
    frame per ``dataType`` value, and each of the three known frames is
    stripped of columns that hold missing values.

    Attributes set by ``__init__``:
        df: the raw table exactly as returned by ``pandas.read_csv``.
        commentdf, postdf, communitydf: rows whose ``dataType`` is
            ``'comment'``, ``'post'`` or ``'community'``; an empty frame when
            the export contains no row of that type.
        dropna_how: the column-dropping rule used by the ``clean_*`` methods.

    Args:
        csv: anything ``pandas.read_csv`` accepts (path, URL, file-like object).
        dropna_how: ``'any'`` (default, the historical behaviour) drops a
            column as soon as one row of that type lacks a value; ``'all'``
            drops only columns that are empty for every row of that type, so
            partially filled columns survive.

    Raises:
        ValueError: if ``dropna_how`` is neither ``'any'`` nor ``'all'``.
    """

    # Class-level fallback so the clean_* methods keep their original
    # behaviour on an instance whose __init__ was bypassed.
    dropna_how = 'any'

    def __init__(self, csv, dropna_how='any'):
        # Fail before the (potentially slow) read if the option is unusable.
        if dropna_how not in ('any', 'all'):
            raise ValueError(
                "dropna_how must be 'any' or 'all', got {!r}".format(dropna_how)
            )
        self.dropna_how = dropna_how
        self.df = pd.read_csv(csv)
        # Start from empty frames so the attributes exist even when the
        # export contains no row of a given type.
        self.commentdf = pd.DataFrame()
        self.postdf = pd.DataFrame()
        self.communitydf = pd.DataFrame()
        self.split_by_dataType()
        self.clean_comments()
        self.clean_posts()
        self.clean_community()

    def split_by_dataType(self):
        """Store each ``dataType`` group of ``self.df`` as ``self.<dataType>df``.

        The attribute name is derived from the data, so a ``dataType`` value
        other than the three known ones creates an additional attribute.
        Rows with a missing ``dataType`` are not part of any group.
        """
        for name, group in self.df.groupby('dataType'):
            setattr(self, f'{name}df', pd.DataFrame(group))

    def _drop_sparse_columns(self, frame):
        """Return ``frame`` without the columns selected by ``self.dropna_how``."""
        return frame.dropna(axis=1, how=self.dropna_how)

    def clean_comments(self):
        """Replace ``commentdf`` with a copy lacking its sparse columns."""
        self.commentdf = self._drop_sparse_columns(self.commentdf)

    def clean_posts(self):
        """Replace ``postdf`` with a copy lacking its sparse columns."""
        self.postdf = self._drop_sparse_columns(self.postdf)

    def clean_community(self):
        """Replace ``communitydf`` with a copy lacking its sparse columns."""
        self.communitydf = self._drop_sparse_columns(self.communitydf)

    def convert_to_eli5_structure(self):
        """Placeholder: not implemented yet; does nothing and returns None."""
        pass
        

# Read the scraper export (path is relative to the notebook's working
# directory) and build the raw frame plus the per-dataType frames.
r = RedditData('../dataset_reddit-scraper_2023-05-01_22-58-46-575.csv')


In [8]:
# Columns of the raw export, before it is split by dataType.
r.df.columns

Index(['alternativeTitle', 'body', 'categories/0', 'categories/1',
       'categories/2', 'categories/3', 'category', 'categoryLabel',
       'communityName', 'createdAt', 'dataType', 'html', 'id', 'isAd',
       'isVideo', 'numberOfComments', 'numberOfMembers', 'numberOfreplies',
       'parentId', 'parsedCommunityName', 'parsedId', 'scrapedAt', 'title',
       'upVotes', 'url', 'username'],
      dtype='object')

In [9]:
# Number of rows per dataType value; RedditData.split_by_dataType groups on
# this same column.
r.df['dataType'].value_counts()

dataType
comment      14284
post          1837
community        2
Name: count, dtype: int64

In [46]:
# Columns left in each per-type frame once the clean_* methods have run,
# printed in the order: posts, comments, communities.
print(r.postdf.columns)
print(r.commentdf.columns)
print(r.communitydf.columns)

Index(['communityName', 'createdAt', 'dataType', 'id', 'isAd', 'isVideo',
       'numberOfComments', 'parsedCommunityName', 'parsedId', 'scrapedAt',
       'title', 'upVotes', 'url', 'username'],
      dtype='object')
Index(['body', 'category', 'categoryLabel', 'createdAt', 'dataType', 'html',
       'id', 'numberOfreplies', 'parentId', 'parsedId', 'scrapedAt', 'upVotes',
       'url', 'username'],
      dtype='object')
Index(['alternativeTitle', 'categories/0', 'categories/1', 'categories/2',
       'categories/3', 'createdAt', 'dataType', 'numberOfMembers', 'scrapedAt',
       'title', 'url'],
      dtype='object')


In [41]:
# Put the first post's id next to the first comment's parentId.
# NOTE(review): presumably parentId is the key linking a comment to its
# post — confirm; row position alone does not pair the two frames.
r.postdf.iloc[0, :].id, r.commentdf.iloc[0, :].parentId

('t3_134nh3l', 't3_12z6out')

In [24]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel shows every dependency up front.
# NOTE(review): Hugging Face later renamed the `nlp` package to `datasets` —
# confirm which one the environment pins before upgrading.
import nlp
# Load the 'eli5' dataset; per the log below, the first call downloads it and
# later calls reuse the local cache.
eli5 = nlp.load_dataset('eli5')

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 17.8k/17.8k [00:00<00:00, 450kB/s]
Downloading: 100%|██████████| 3.71k/3.71k [00:00<00:00, 2.70MB/s]


Downloading and preparing dataset eli5/LFQA_reddit (download: 6.03 MiB, generated: 1.26 GiB, post-processed: Unknown sizetotal: 1.26 GiB) to /home/parker-alien/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/58e61a99404336f0891b4457a02232489b50131bdca9c1691054aeee2f6f1a6e...


Downloading: 100%|██████████| 3.50k/3.50k [00:00<00:00, 2.34MB/s]
Downloading: 100%|██████████| 576M/576M [00:11<00:00, 51.6MB/s] 
Downloading: 100%|██████████| 21.1M/21.1M [00:00<00:00, 36.7MB/s]
Downloading: 100%|██████████| 53.0M/53.0M [00:01<00:00, 31.0MB/s]
Downloading: 100%|██████████| 286M/286M [00:06<00:00, 42.1MB/s] 
Downloading: 100%|██████████| 9.65M/9.65M [00:00<00:00, 23.3MB/s]
Downloading: 100%|██████████| 17.7M/17.7M [00:00<00:00, 30.0MB/s]
Downloading: 100%|██████████| 330M/330M [00:07<00:00, 44.6MB/s] 
Downloading: 100%|██████████| 18.7M/18.7M [00:00<00:00, 34.7MB/s]
Downloading: 100%|██████████| 36.2M/36.2M [00:00<00:00, 42.0MB/s]


Dataset eli5 downloaded and prepared to /home/parker-alien/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/58e61a99404336f0891b4457a02232489b50131bdca9c1691054aeee2f6f1a6e. Subsequent calls will reuse this data.


In [33]:
# Look at one record of the 'test_eli5' split, list the available splits,
# then print the train / test / validation split sizes.
print(eli5['test_eli5'][12345])
print(eli5.keys())
print(len(eli5['train_eli5']), len(eli5['test_eli5']), len(eli5['validation_eli5']))

{'q_id': '8houtx', 'title': 'Why does water heated to room temperature feel colder than the air around it?', 'selftext': '', 'document': '', 'subreddit': 'explainlikeimfive', 'answers': {'a_id': ['dylcnfk', 'dylcj49'], 'text': ["Water transfers heat more efficiently than air. When something feels cold it's because heat is being transferred from your skin to whatever you're touching. Since water absorbs the heat more readily than air, it feels colder.", "Air isn't as good at transferring heat compared to something like water or steel (sit on a room temperature steel bench vs. a room temperature wooden bench, and the steel one will feel more cold).\n\nWhen you feel cold, what you're feeling is heat being transferred out of you.  If there is no breeze, you feel a certain way.  If there's a breeze, you will get colder faster (because the moving air is pulling the heat away from you), and if you get into water, its quite good at pulling heat from you.   Get out of the water and have a breez

0                                     Personal Finance
1    r/Investments: Sharing and discovering new inv...
Name: title, dtype: object