In [1]:
import os
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw

In [None]:
# Shared PRAW client; credentials are looked up under the 'DEFAULT' site name
# (normally a section of praw.ini) instead of being hard-coded here.
reddit = praw.Reddit(site_name='DEFAULT')

In [7]:
#https://stackoverflow.com/questions/75677839/how-to-scrape-all-the-posts-from-a-subreddit-from-a-specific-period-of-time
def scrape_subreddit_year_2023(subreddit_name, limit = 1000):
    """Collect top-listing posts of a subreddit created from 2021 through 2023 (UTC).

    Despite the ``2023`` in the name, the accepted window runs from
    2021-01-01 00:00:00 to 2023-12-31 23:59:59, both interpreted as UTC.
    Uses the module-level ``reddit`` client.

    Args:
        subreddit_name: Name of the subreddit passed to ``reddit.subreddit``.
        limit: Maximum number of items requested from the ``top`` listing.
            Posts outside the window are discarded afterwards, so the result
            may be shorter than ``limit``.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``subreddit`` and
        ``created_utc``, in listing order.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    # The bounds are compared with post.created_utc, so they must be UTC epochs.
    # A naive datetime's .timestamp() is interpreted in the machine's local
    # timezone, which would shift the window by the local UTC offset.
    date_format = '%d-%m-%y %H:%M:%S'
    start_date = datetime.strptime('01-01-21 00:00:00', date_format).replace(tzinfo=timezone.utc).timestamp()
    end_date = datetime.strptime('31-12-23 23:59:59', date_format).replace(tzinfo=timezone.utc).timestamp()

    for post in subreddit.top(limit=limit):  # top-ranked posts of the subreddit
        created = post.created_utc
        if start_date <= created <= end_date:  # keep only posts inside the window
            posts.append({
                "title": post.title,
                "text": post.selftext,
                "subreddit": post.subreddit.display_name,
                "created_utc": created
            })

    return posts


In [8]:
#for both 2024 and 2025
def scrape_subreddit_year_2025(subreddit_name, limit = 1000): 
    """Collect newest-listing posts of a subreddit created from 2024-01-01 to 2025-03-31 (UTC).

    The accepted window runs from 2024-01-01 00:00:00 to 2025-03-31 23:59:59,
    both interpreted as UTC. Uses the module-level ``reddit`` client.

    Args:
        subreddit_name: Name of the subreddit passed to ``reddit.subreddit``.
        limit: Maximum number of items requested from the ``new`` listing.
            Posts outside the window are discarded afterwards, so the result
            may be shorter than ``limit``.

    Returns:
        A list of dicts with the keys ``title``, ``text``, ``subreddit`` and
        ``created_utc``, in listing order.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    # The bounds are compared with post.created_utc, so they must be UTC epochs.
    # A naive datetime's .timestamp() is interpreted in the machine's local
    # timezone, which would shift the window by the local UTC offset.
    date_format = '%d-%m-%y %H:%M:%S'
    start_date = datetime.strptime('01-01-24 00:00:00', date_format).replace(tzinfo=timezone.utc).timestamp()
    end_date = datetime.strptime('31-03-25 23:59:59', date_format).replace(tzinfo=timezone.utc).timestamp()

    # NOTE(review): filtering happens after the listing is fetched, so posts
    # newer than the end date still count against `limit`.
    for post in subreddit.new(limit=limit):  # most recent posts of the subreddit
        created = post.created_utc
        if start_date <= created <= end_date:  # keep only posts inside the window
            posts.append({
                "title": post.title,
                "text": post.selftext,
                "subreddit": post.subreddit.display_name,
                "created_utc": created
            })

    return posts

In [9]:
# Scrape the Declutter posts for the 2021-2023 window and cache them as CSV.
posts_from_declutter_year_2023 = scrape_subreddit_year_2023('Declutter', limit=1000)

# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('files', exist_ok=True)
# Explicit columns keep the CSV header even when no post matched the window,
# so the file can still be read back with pd.read_csv in the merge step.
df = pd.DataFrame(posts_from_declutter_year_2023, columns=['title', 'text', 'subreddit', 'created_utc'])
df.to_csv('files/declutter_new_year_2023.csv', index = False)

In [10]:
# Scrape the Minimalism posts for the 2021-2023 window and cache them as CSV.
posts_from_minimalism_year_2023 = scrape_subreddit_year_2023('Minimalism', limit=1000)

# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('files', exist_ok=True)
# Explicit columns keep the CSV header even when no post matched the window,
# so the file can still be read back with pd.read_csv in the merge step.
df = pd.DataFrame(posts_from_minimalism_year_2023, columns=['title', 'text', 'subreddit', 'created_utc'])
df.to_csv('files/minimalism_new_year_2023.csv', index = False)

In [11]:
# Scrape the Declutter posts for the 2024 to March 2025 window and cache them as CSV.
posts_from_declutter_year_2025 = scrape_subreddit_year_2025('Declutter', limit=1000)

# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('files', exist_ok=True)
# Explicit columns keep the CSV header even when no post matched the window,
# so the file can still be read back with pd.read_csv in the merge step.
df = pd.DataFrame(posts_from_declutter_year_2025, columns=['title', 'text', 'subreddit', 'created_utc'])
df.to_csv('files/declutter_new_year_2024_2025.csv', index = False)

In [12]:
# Scrape the Minimalism posts for the 2024 to March 2025 window and cache them as CSV.
posts_from_minimalism_year_2025 = scrape_subreddit_year_2025('Minimalism', limit=1000)

# to_csv does not create missing folders, so make sure the output directory exists.
os.makedirs('files', exist_ok=True)
# Explicit columns keep the CSV header even when no post matched the window,
# so the file can still be read back with pd.read_csv in the merge step.
df = pd.DataFrame(posts_from_minimalism_year_2025, columns=['title', 'text', 'subreddit', 'created_utc'])
df.to_csv('files/minimalism_new_year_2024_2025.csv', index = False)

In [14]:
# Merge every cached CSV in files/ whose name starts with 'declutter'
# (final_df.csv is not matched by this prefix).
directory = os.path.join(os.getcwd(), 'files')
# os.listdir returns entries in arbitrary order; sort the names so the merged
# row order does not depend on the filesystem.
files = sorted(file for file in os.listdir(directory) if file.startswith('declutter') and file.endswith('.csv'))
df_list = [pd.read_csv(os.path.join(directory, file)) for file in files]

m_declutter_df = pd.concat(df_list, ignore_index=True)  # concatenate into one frame
merged_declutter_df = m_declutter_df.drop_duplicates(keep='first')  # drop posts present in more than one file
# A stable sort keeps tied timestamps in file order, so the 1000-row cut below is reproducible.
sorted_declutter_df = merged_declutter_df.sort_values(by='created_utc', ascending=True, kind='stable')
data_declutter_df = sorted_declutter_df.head(1000)  # ascending order, so these are the 1000 oldest posts


In [17]:
# Merge every cached CSV in files/ whose name starts with 'minimalism'.
directory = os.path.join(os.getcwd(), 'files')
# os.listdir returns entries in arbitrary order; sort the names so the merged
# row order does not depend on the filesystem.
files = sorted(file for file in os.listdir(directory) if file.startswith('minimalism') and file.endswith('.csv'))
df_list = [pd.read_csv(os.path.join(directory, file)) for file in files]

m_minimalism_df = pd.concat(df_list, ignore_index=True)  # concatenate into one frame
merged_minimalism_df = m_minimalism_df.drop_duplicates(keep='first')  # drop posts present in more than one file
# A stable sort keeps tied timestamps in file order, so the 1000-row cut below is reproducible.
sorted_minimalism_df = merged_minimalism_df.sort_values(by='created_utc', ascending=True, kind='stable')
data_minimalism_df = sorted_minimalism_df.head(1000)  # ascending order, so these are the 1000 oldest posts


In [18]:
# Stack the two per-subreddit selections (Declutter first, then Minimalism)
# under a fresh 0..n-1 index and save the combined dataset.
final_df = pd.concat(
    objs=[data_declutter_df, data_minimalism_df],
    ignore_index=True,
)
final_df.to_csv(path_or_buf='files/final_df.csv', index=False)