## r/iPhone scraper (v2)

##### List of dependencies to be installed

```pip install praw```

```pip install python-dotenv```

In [4]:
# Utility functions
def unixTs_into_datetime(unixTs):
    """Convert a Unix timestamp into a ``datetime`` object.

    Args:
        unixTs: Seconds since the Unix epoch (int or float).

    Returns:
        The naive ``datetime`` built by ``datetime.fromtimestamp``, which
        expresses the moment in the local timezone of the running machine.
    """
    converted = datetime.fromtimestamp(unixTs)
    return converted

def post_in_timelimit(scrapedTs, update_dt):
    """Tell whether a scraped timestamp is at or after a reference moment.

    Args:
        scrapedTs: Timestamp of the scraped item.
        update_dt: Reference moment to compare against.

    Returns:
        The result of ``scrapedTs >= update_dt``: True when the scraped
        timestamp is not earlier than the reference, False otherwise.
    """
    is_within_limit = scrapedTs >= update_dt
    return is_within_limit

In [5]:
# Recursively convert a PRAW comment forest into a list of nested comment dictionaries
def fetch_comments(comment_forest):
    """Turn a forest of Reddit comments into nested plain dictionaries.

    Args:
        comment_forest: Iterable of comment objects (each exposing ``author``,
            ``id``, ``body``, ``created_utc``, ``score`` and ``replies``).
            ``praw.models.MoreComments`` placeholders are ignored.

    Returns:
        A list with one dict per kept comment. Comments whose author is
        missing (reported as "[deleted]") or is "AutoModerator" are dropped
        together with their replies. Each dict carries the comment's replies,
        converted recursively, under the ``'replies'`` key.
    """
    ignored_authors = ("AutoModerator", "[deleted]")
    collected = []

    for node in comment_forest:
        # "Load more comments" placeholders carry no comment data of their own.
        if isinstance(node, praw.models.MoreComments):
            continue

        # A missing author object means the account no longer exists.
        if node.author:
            author_name = node.author.name
        else:
            author_name = "[deleted]"

        if author_name in ignored_authors:
            continue

        entry = {
            'comment_id': node.id,
            'comment_body': node.body,
            'comment_author': author_name,
            'comment_dateTime': str(datetime.fromtimestamp(node.created_utc)),
            'comment_score': node.score,
        }
        # Descend into the reply tree only when there is something to visit.
        if node.replies:
            entry['replies'] = fetch_comments(node.replies)
        else:
            entry['replies'] = []
        collected.append(entry)

    return collected

In [6]:
# Import all the necessary libraries
import praw
import json
import os
import time
from datetime import datetime
from dotenv import load_dotenv, dotenv_values

# Load environment variables (load_dotenv() makes the Reddit credentials
# available through os.getenv)
load_dotenv()
client_id = os.getenv("R_CLIENTID")
client_secret = os.getenv("R_SECRET")
username = os.getenv("R_USERNAME")
password = os.getenv("R_PSW")
user_agent = "ChangeMeClient/0.1 by YourUsername"

# Initialize PRAW
reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     username=username,
                     password=password,
                     user_agent=user_agent)

# Fetch posts from r/iPhone
subreddit = reddit.subreddit('iPhone')
posts = []  # Posts collected since the last JSON file was written
skip_count = 0  # Number of posts skipped so far because they are too recent
limit = 2000  # NOTE: currently unused; the listing below is requested with limit=None

# The parameters you have to edit while scraping.
# Only posts with end_date <= post_time < start_date are stored:
# start_date is the NEWER bound, end_date is the OLDER bound.
file_count = 1
start_date = datetime(2024, 6, 18, 8, 58, 59)  # Posts at or after this moment are skipped
end_date = datetime(2024, 5, 19, 23, 59, 59)  # The first matching post older than this stops the scan

count = 0  # Total number of posts stored during this run
comments = []  # Defined up front so the name exists even if no post is ever processed
seen_ids = set()  # IDs already stored, so a retry does not collect a post twice


while True:
    try:
        for submission in subreddit.new(limit=None):
            # After a rate-limit pause the listing restarts from the top;
            # ignore everything that has already been collected.
            if submission.id in seen_ids:
                continue

            # Retrieving the Author and the Category of the post
            category = submission.link_flair_text
            authorUsername = submission.author.name if submission.author else "[deleted]"

            # Keep only "Support" posts, excluding bot and deleted authors
            if category != "Support" or authorUsername in ("AutoModerator", "[deleted]"):
                continue

            # NOTE(review): fromtimestamp() yields local time, so the bounds
            # above are compared in the machine's local timezone.
            post_time = datetime.fromtimestamp(submission.created_utc)
            if post_time < end_date:
                print("Ending date reached")
                break  # Stop fetching if post time is before the end date
            if post_time >= start_date:
                print(f"Skipping for date: {skip_count}")
                skip_count += 1
                continue  # Skipping recent posts

            # The date check passed: fetch the comments. The (network-bound)
            # expansion of "more comments" is only needed when there are any.
            if submission.num_comments == 0:
                comments = []
            else:
                submission.comments.replace_more(limit=None)  # Ensure all comments are loaded
                print("len: " + str(len(submission.comments.list())))
                comments = fetch_comments(submission.comments)

            print("Post Title: " + submission.title)
            print("Num Comments: " + str(submission.num_comments))
            # Storing post details along with comments
            post = {
                'post_title': submission.title,
                'post_score': submission.score,
                'post_id': submission.id,
                'post_url': submission.url,
                'post_numComments': submission.num_comments,
                'post_dateTime': str(post_time),
                'post_text': submission.selftext,
                'post_comments': comments
            }
            posts.append(post)
            seen_ids.add(submission.id)
            count += 1
            if count % 5 == 0:
                print(f"Fetched {len(posts)} posts")
            if count % 50 == 0:
                # Flush every 50 posts to a new numbered JSON file
                path = f'r_iphone_posts_{file_count}.json'
                with open(path, 'w', encoding='utf-8') as f:
                    json.dump(posts, f, ensure_ascii=False, indent=4)
                print("Data saved to: " + path)
                file_count += 1
                posts = []
                comments = []

        print("Fetched all available posts.")
        break

    except praw.exceptions.RedditAPIException as e:
        if 'RATELIMIT' in str(e):
            # Wait and let the outer loop restart the listing; seen_ids
            # prevents already stored posts from being collected again.
            print("Rate limit exceeded. Sleeping for 60 seconds...")
            time.sleep(60)
        else:
            print(f"RedditAPIException: {e}")
            break
    except Exception as e:
        # Best-effort boundary: report the problem, then fall through so the
        # posts gathered so far are still written to disk below.
        print(f"An error occurred: {e}")
        break

print("Exited from loop..")
# Write whatever is still in memory (fewer than 50 posts since the last flush)
if posts:
    path = f'r_iphone_posts_{file_count}.json'
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(posts, f, ensure_ascii=False, indent=4)
    print("Data saved to: " + path)
print("Scraping completed.")

Skipping for date: 0
Skipping for date: 1
Skipping for date: 2
Skipping for date: 3
Skipping for date: 4
Skipping for date: 5
Skipping for date: 6
Skipping for date: 7
Skipping for date: 8
Skipping for date: 9
Skipping for date: 10
Skipping for date: 11
Skipping for date: 12
Skipping for date: 13
Skipping for date: 14
Skipping for date: 15
Skipping for date: 16
Skipping for date: 17
Skipping for date: 18
Skipping for date: 19
Skipping for date: 20
Skipping for date: 21
Skipping for date: 22
Skipping for date: 23
Skipping for date: 24
Skipping for date: 25
Skipping for date: 26
len: 26
Post Title: Help! Woke up today to the green screen of death.
Num Comments: 26
len: 6
Post Title: help please🙏
Num Comments: 6
Post Title: iPhone crashes when clicking AirPlay button
Num Comments: 0
len: 3
Post Title: my iphone 14 only charges when powered off 
Num Comments: 3
len: 1
Post Title: Upload audio file as my voicemail greeting
Num Comments: 1
Fetched 5 posts
len: 3
Post Title: iphone 12 with li