In [6]:
import requests
import time
import pandas as pd

In [7]:
#Saving the url for the solotravel subreddit into a url variable
url = 'https://www.reddit.com/r/solotravel.json'

In [8]:
#Set up a header
headers = {'User-agent': 'Greymore 0.1' }

In [9]:
#Use requests to ping the server
res = requests.get(url, headers=headers)

In [10]:
#Ensure status_code is 200, which means everything is okay
res.status_code

200

In [11]:
#Saving the json to a json variable
the_json = res.json()


In [12]:
#Get the keys for the json file
sorted(the_json.keys())

['data', 'kind']

'data' contains the necessary information and posts

In [13]:
#Check what keys fall under 'data'
sorted(the_json['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [14]:
#Actually looking at what 'data' contains
the_json['data']

{'modhash': '',
 'dist': 26,
 'children': [{'kind': 't3',
   'data': {'approved_at_utc': None,
    'subreddit': 'solotravel',
    'selftext': "**!!NEW!!**\n\n* **Are you planning your first big trip to Europe? Check out our [brand-new, detailed guide to planning a solo Eurotrip](https://www.reddit.com/r/solotravel/wiki/eurotrip)!**\n\n* **Are you nervous about the Coronavirus outbreak? Have questions? Check out the discussion and online resources in our [Coronavirus megathread](https://www.reddit.com/r/solotravel/comments/gj4gab/coronavirus_faq_megathread_part_5/).**\n\n~\n\nThe purpose of this thread is for new and/or anxious travellers to **introduce themselves, ask 'newbie' questions about solo travel, and receive advice and encouragement**. This is also a thread where it is OK to ask questions that would otherwise be considered vague or repetitive under the normal subreddit rules.\n\nIf you're new to our community, please read the subreddit rules in the sidebar before posting. If y

'children' contains all the posts. It is a list of dictionaries. I will need 
- *'title'*: The main data for classification
- *'selftext'*: The post itself for classification, not to be confused with 'selftext_html')
- *'subreddit'*: The y label
- *'author'*: Optional information, but knowing the author might help with classification.

'after', found at the very bottom, is important because it tells us who the author of the next post is. This will allow us to link to the next 25 posts.

In [15]:
# Iterate a for loop to get all the posts. Set a time.sleep to ensure that Reddit doesn't think I'm ddossing it.

class Reddit_Scraper:
    """
    A Reddit scraper using reddit's API with .json. Simply initiate this scraper with the URL of the reddit you want
    to scrape. Make sure the .json slug is at the end of the url!
    
    The functions are 'scrape' and 'clean_data'. 'scrape' does the scraping, which you can then input into 'clean_data'
    and be let with only the relevant keys required for classification.
    """
    def __init__(self, url):
        self.url = url
    
    def scrape(self, n):
        """
        Input: an integer n counting how many sets of 25 messages you want to scrape
        Output: a list of dictionaries
        
        This scraper works by intitialising an empty list of posts that is then populated with the posts.
        The API can only scrape 25 posts at a time.
        'after' is a pointer referring to the author of the future post, which allows for scraping for the further 25
        posts.
        """
        #Set an empty list for posts to be populated with the posts
        posts = []
        #Initialise 'after' variable that points to the next 25 set of posts. It is first set to None for the first set
        after = None
        #Set up headers
        headers = {'User-agent': 'Greymore 0.1' }

        #"For each set of 25 posts, do..."
        for i in range(n):
            #Print which set of 25 the scraper is working on
            print(f"Currently working on set {i+1}")
            
            #"If there is no pointer for after, do..."
            if after == None:
                #Initialise a set of parameters
                params = {}
            else:
                #Set the 'after' parameter to the after variable that is set under 'if res.status_code == 200'
                params = {'after': after}
            res = requests.get(self.url, params=params, headers=headers)
            
            #"If everything is okay, do..."
            if res.status_code == 200:
                
                #Get the json
                extracted_json = res.json()
                
                #Insert the child into the initialised posts list above
                posts.extend(extracted_json['data']['children'])
                
                #Update 'after' variable to the extracted 'after' variable
                after = extracted_json['data']['after']
        
            #"If something is wrong, print out status code and break the loop"
            else:
                print(res.status_code)
                break
                
            #Set a sleep time to not overwhelm reddit and get locked out
            time.sleep(1)
            
        return posts

    def clean_data(self, list_of_dicts):
        """
        Input: a list of dictionaries. Should be a .json dictionary with relevant keys and values
        Output: a list of dictionaries with only the title, post, the subreddit the post was from, and the author
        """
        
        #Initialise a list to be populated
        relevant_data = []
        
        #For each dictionary in the list of dictionaries, where the dictionaries contains the data we want
        for index, post in enumerate(list_of_dicts):
            
            #Save the prefix into a variable 'data' for easy reference
            data = list_of_dicts[index]['data']

            #Get the title, main body (as selftext), subreddit (the label), and the author (just in case) of the post
            title = data['title']
            selftext = data['selftext']
            subreddit = data['subreddit']
            author = data['author']

            #Save the information into a data dictionary
            data_dict = {'title': title, 'selftext': selftext, 'subreddit': subreddit, 'author': author}

            #Add the data dictionary into the 'relevant_data' list
            relevant_data.append(data_dict)
        
        #Return the populated list
        return relevant_data

In [16]:
#Initialise and run the scraper class for solotravel
solo_scraper = Reddit_Scraper('https://www.reddit.com/r/solotravel.json')
solo_list_posts = solo_scraper.scrape(50)

Currently working on set 1
Currently working on set 2
Currently working on set 3
Currently working on set 4
Currently working on set 5
Currently working on set 6
Currently working on set 7
Currently working on set 8
Currently working on set 9
Currently working on set 10
Currently working on set 11
Currently working on set 12
Currently working on set 13
Currently working on set 14
Currently working on set 15
Currently working on set 16
Currently working on set 17
Currently working on set 18
Currently working on set 19
Currently working on set 20
Currently working on set 21
Currently working on set 22
Currently working on set 23
Currently working on set 24
Currently working on set 25
Currently working on set 26
Currently working on set 27
Currently working on set 28
Currently working on set 29
Currently working on set 30
Currently working on set 31
Currently working on set 32
Currently working on set 33
Currently working on set 34
Currently working on set 35
Currently working on set 36
C

In [17]:
#Initialise and run the scraper class for JapanTravel
japan_scraper = Reddit_Scraper('https://www.reddit.com/r/JapanTravel.json')
japan_list_posts = japan_scraper.scrape(50)

Currently working on set 1
Currently working on set 2
Currently working on set 3
Currently working on set 4
Currently working on set 5
Currently working on set 6
Currently working on set 7
Currently working on set 8
Currently working on set 9
Currently working on set 10
Currently working on set 11
Currently working on set 12
Currently working on set 13
Currently working on set 14
Currently working on set 15
Currently working on set 16
Currently working on set 17
Currently working on set 18
Currently working on set 19
Currently working on set 20
Currently working on set 21
Currently working on set 22
Currently working on set 23
Currently working on set 24
Currently working on set 25
Currently working on set 26
Currently working on set 27
Currently working on set 28
Currently working on set 29
Currently working on set 30
Currently working on set 31
Currently working on set 32
Currently working on set 33
Currently working on set 34
Currently working on set 35
Currently working on set 36
C

In [18]:
#Clean the data for solotravel and JapanTravel
solo_data = solo_scraper.clean_data(solo_list_posts)

japan_data = japan_scraper.clean_data(japan_list_posts)

{'title': 'New to solo travel? Post here for introductions, newbie questions, anxiety and excitement - Week of December 27', 'selftext': "**!!NEW!!**\n\n* **Are you planning your first big trip to Europe? Check out our [brand-new, detailed guide to planning a solo Eurotrip](https://www.reddit.com/r/solotravel/wiki/eurotrip)!**\n\n* **Are you nervous about the Coronavirus outbreak? Have questions? Check out the discussion and online resources in our [Coronavirus megathread](https://www.reddit.com/r/solotravel/comments/gj4gab/coronavirus_faq_megathread_part_5/).**\n\n~\n\nThe purpose of this thread is for new and/or anxious travellers to **introduce themselves, ask 'newbie' questions about solo travel, and receive advice and encouragement**. This is also a thread where it is OK to ask questions that would otherwise be considered vague or repetitive under the normal subreddit rules.\n\nIf you're new to our community, please read the subreddit rules in the sidebar before posting. If you're

{'title': 'Late May to Late June Tentative Itinerary - One Month - Newly Married Couple of First Timers! Would love advice.', 'selftext': "Hello! Reading this posts in this community has been a godsend. Thank you all so much. \n\nMy husband and I are embarking on a one-month honeymoon/trip to Japan this summer. We're getting the JR pass and thought that, since we're flying in and out of Tokyo (found great price), it would be easiest to follow the rails Southwest from Tokyo down to Hiroshima. I'm getting really close to the time I need to start booking accommodations and I have several questions, so here is our itinerary that we've sculpted around some festivals we read about. \n\nI am a teacher and he's a server - we aren't made of money. We are really traveling on a budget (and happy with that!), and in fact are even hoping to couchsurf for certain times, probably when the festivals are. Our biggest expenses will be the flying and JR pass. \n\nWe're really excited about everything fro

In [19]:
#Get the number of posts for solotravel
len(solo_data)

2467

In [20]:
#Get the number of posts for JapanTravel
len(japan_data)

2464

Let's convert the data into a dataframe that we can explore

In [21]:
#Converting the data into dataframes
solo_df = pd.DataFrame(solo_data)

japan_df = pd.DataFrame(japan_data)

Combine the two dataframes into one csv and save them.

In fact, save solo_df and japan_df for reference.

Note that my analysis datasets are saved in the csvs without 'updated'. The files with 'updated' in their name are for others to use when they want to.

In [2]:
#Concatenate the dataframes into one combined dataframe
combined_df = pd.concat([solo_df, japan_df], ignore_index = True)

NameError: name 'pd' is not defined

In [45]:
#Save the files to csv. Note that my initial scrape on 08/01/2021 is saved to '../data/japan_df.csv'.
#I'm now saving the csv to an 'updated' csv which will not affect my initial scrape and data analysis
japan_df.to_csv('../data/japan_df_updated.csv', index=False)
solo_df.to_csv('../data/solo_df_updated.csv', index=False)
combined_df.to_csv('../data/combined_df_updated.csv', index=False)

In [22]:
#Check solotravel dataframe
solo_df.head()

Unnamed: 0,title,selftext,subreddit,author
0,New to solo travel? Post here for introduction...,**!!NEW!!**\n\n* **Are you planning your first...,solotravel,AutoModerator
1,Hi! Can someone tell me how the restrictions w...,"/s\n\nYou can delete this, if this is isn't su...",solotravel,rakahr11
2,Is It Expensive Traveling With A Flexible Itin...,"I'm planning my first solo trip, hopefully, an...",solotravel,Custer_Had_It_Coming
3,Mistakes whilst travelling solo,I have personally made many mistakes whilst tr...,solotravel,aspoonfullofacid
4,Weak passport holders- how do you travel spont...,So I plan to travel once it is safe to travel....,solotravel,ruchigandhi22


In [23]:
#Check JapanTravel dataframe
japan_df.head()

Unnamed: 0,title,selftext,subreddit,author
0,"Japan Travel, COVID-19, And You: Guidelines On...",##**January 2021 - [**Japan has again closed t...,JapanTravel,amyranthlovely
1,Tourism by Prefecture Series: Cities in Japan ...,**Welcome to /r/JapanTravel’s Tourism by Prefe...,JapanTravel,amyranthlovely
2,Question: Car rental in Hokkaido for 6 people,"Hi there fellow travellers,\n\nWe are thinking...",JapanTravel,Plus294
3,Google Reviews Restaurants,Hello! \n\nI have noticed that the ‘google sco...,JapanTravel,johnnynjohnjohn
4,iPhone Screen Repair,My Phone fell down somedays ago and now has a ...,JapanTravel,ToruKurasame


In [1]:
#Check the combined dataframe
combined_df.head()

NameError: name 'combined_df' is not defined