In [70]:
##from pytube import YouTube
import json
import re
from pathlib import Path # For writing videos into the data folder
from urllib.parse import urlsplit # For splitting download urls into host and path

import pandas as pd
import praw
import requests

# Make sure the praw.ini file is in the correct directory
# Also, make sure there is a folder called "data" in the same directory that contains this script

In [71]:
def convert_streamable(url):
    """
    Convert streamable url from reddit to actual video url
    
    Arguments:
    url | string | the streamable link that reddit gives you
                 | Ex. https://streamable.com/u2jzoo
    
    Returns:
    video_url | string | the link to the actual video (the mp4 entry of the api response)
    
    Raises:
    requests.HTTPError | the api answered with a 4xx/5xx status code
    requests.Timeout | the api did not answer within the timeout
    KeyError | the api response does not contain files -> mp4 -> url
    """
    
    # Replace streamable.com with api.streamable.com/videos to get the api link
    # The pattern is a raw string so that "\." reaches the regex engine as an escaped dot
    # instead of being an invalid Python string escape
    api_url = re.sub(string = url,
                     pattern = r'streamable\.com/',
                     repl = 'api.streamable.com/videos/')
    # Always pass a timeout, otherwise a stalled connection blocks the notebook forever
    response = requests.get(api_url, timeout = 30)
    # Fail with the real http error instead of a confusing KeyError on the error payload
    response.raise_for_status()
    # The api link gets us a json file, we use json() to decode it
    json1 = response.json()
    # Access the video url in the json file
    video_url = json1['files']['mp4']['url']
    
    return video_url

In [72]:
def convert_reddit(video_url):
    """
    Convert v.redd.it video url into audio url
    
    Arguments:
    video_url | string | the fallback video url from reddit's secure media
    
    Returns:
    audio_url | string | the audio url inferred from the fallback video url; this can be obtained
                       | in the DASHPlaylist.mpd file, search for the base url and audio sections
                       | an empty string is returned (and a message printed) when the url
                       | matches neither of the two known formats
    
    Example conversions:
    https://v.redd.it/9v2san14was51/DASH_720.mp4?source=fallback to
    https://v.redd.it/9v2san14was51/DASH_audio.mp4?source=fallback
    
    https://v.redd.it/w56rwny74y351/DASH_360?source=fallback to
    https://v.redd.it/w56rwny74y351/audio?source=fallback
    """
    # Raw strings keep "\d" and "\." as regex escapes rather than invalid string escapes
    with_extension = r"/DASH_\d{2,4}\.mp4"
    without_extension = r"/DASH_\d{2,4}\?"
    
    # subn returns the new string and the number of replacements, so one pass per pattern
    # is enough to both detect a match and build the result
    # If there is a .mp4 extension in the url, replace with "DASH_audio.mp4"
    audio_url, replaced = re.subn(string = video_url,
                                  pattern = with_extension,
                                  repl = "/DASH_audio.mp4")
    if replaced:
        return audio_url
    
    # If there is no .mp4 extension in the url, just replace with "audio"
    audio_url, replaced = re.subn(string = video_url,
                                  pattern = without_extension,
                                  repl = "/audio?")
    if replaced:
        return audio_url
    
    print("error, no match detected")
    return ""

In [73]:
def video_url_from_submission(submission):
    """
    Get the video file urls from the given submission
    
    Arguments:
    submission | praw.models.reddit.submission.Submission | the elements in the listing generator
               | you get these when you iterate through a listing generator
               | Ex. for i in reddit.subreddit("PublicFreakout").top(limit = 3):
    
    Returns:
    url_array | string array | array that contains the links to the relevant files
                             | if the link is streamable, it will be a list of size 1
                             | linking to the video because the video contains audio
                             | if the link is reddit, it will be a list of size 2
                             | with the first element linking to the video graphics
                             | and the second element linking to the audio
                             | (the audio element is whatever convert_reddit returns)
                             | for any other domain the list is empty and a message is printed
    
    Example conversions:
    https://streamable.com/u2jzoo into https://api.streamable.com/videos/u2jzoo,
    then retrieve url from json
    
    For reddit, get subreddit post, find the fall back video url
    and the audio url by replacing stuff
    Example: 
    https://v.redd.it/9v2san14was51/DASH_720.mp4?source=fallback to
    https://v.redd.it/9v2san14was51/DASH_audio.mp4?source=fallback
    
    https://v.redd.it/w56rwny74y351/DASH_360?source=fallback to
    https://v.redd.it/w56rwny74y351/audio?source=fallback
    """
    
    # Initialize the string array
    url_array = []
    # I'm using an array because Streamable gives us audio + video in one file
    # but Reddit has two links, one for video (graphics) only and one for audio only
    
    # Check if it's streamable or reddit
    # If streamable
    if submission.domain == "streamable.com":
        # Convert the streamable url to the actual video url and store it
        url_array.append(convert_streamable(submission.url))
    
    # If reddit
    elif submission.domain == "v.redd.it":
        try:
            # Get the url from secure_media instead
            video_url = submission.secure_media['reddit_video']['fallback_url']
        except (AttributeError, KeyError, TypeError):
            # The lookup above fails if it's a crosspost (secure_media is missing or empty).
            # We have to access crosspost_parent_list, 0, secure_media, etc.
            # Only lookup failures are caught here, so that interrupting the kernel
            # or an unrelated error is not silently redirected to the crosspost path.
            # Example:
            # https://www.reddit.com/r/PublicFreakout/comments/hafl7q/cop_chokes_and_punches_teenage_girl_in_the_head.json
            video_url = submission.crosspost_parent_list[0]['secure_media']['reddit_video']['fallback_url']
            
        # Append the video url to the list
        url_array.append(video_url)
        # Get the audio
        url_array.append(convert_reddit(video_url))
        
    else:
        print("error" + submission.domain)
    
    return url_array

In [74]:
# Helper that turns a download url into the name of the local file
def _filename_from_url(url, audio = False):
    """
    Work out the local file name for a download url
    
    Arguments:
    url | string | the link that is going to be downloaded
    audio | boolean | only used for v.redd.it links, True picks .mp3 and False picks .mp4
    
    Returns:
    local_filename | string | file name (no directory part, no query string)
    
    Raises:
    ValueError | no usable file name can be derived from the url
    """
    # urlsplit separates host, path and query, so a '/' inside the query string
    # (for example in a signed link) cannot be mistaken for a path separator
    parts = urlsplit(url)
    host = parts.hostname or ""
    segments = [segment for segment in parts.path.split('/') if segment]
    local_filename = ""
    
    # If this is reddit
    if host == "v.redd.it":
        # Use the second last part of the path, it is the video id and should be unique
        # Ex. /716d4vdxcqu41/audio gives 716d4vdxcqu41
        if len(segments) >= 2:
            local_filename = segments[-2]
            # If the filename doesn't have the '.mp4' extension
            if not local_filename.endswith(".mp4"):
                local_filename += ".mp3" if audio else ".mp4"
    
    # Streamable and everything else: use the last part of the path
    # Ex. /video/mp4/u2jzoo.mp4?Expires=... gives u2jzoo.mp4
    elif segments:
        local_filename = segments[-1]
    
    # Refuse names that are empty or that would point at a directory
    if local_filename in ("", ".", ".."):
        raise ValueError("cannot derive a file name from url: " + repr(url))
    
    return local_filename


# Function for download videos from subreddit urls
def download_video(url, audio = False):
    """
    Download a file from a url into the "data" folder of the current working directory
    
    Arguments:
    url | string | the link that you want to download from
    audio | boolean | True will set the extension to .mp3, False will set the extension to .mp4
                    | (only applies to v.redd.it links, other links keep their own file name)
    
    Returns:
    local_filename | string | name of the downloaded file
    
    Raises:
    ValueError | no file name can be derived from the url
    requests.HTTPError | the server answered with a 4xx/5xx status code
    """
    local_filename = _filename_from_url(url, audio)
    
    # Create a path for the data folder and create the folder if it doesn't exist
    data_path = Path("./data").resolve()
    data_path.mkdir(parents = True, exist_ok = True)
    # This is the file with the path that open will write to
    # It no longer depends on the notebook file being present in the working directory
    target_path = data_path / local_filename
    
    # Using with to automatically close the connection when we are done with it
    # The timeout stops a stalled connection from blocking the notebook forever
    with requests.get(url, stream = True, timeout = 30) as req:
        # Raise an http error if there is one
        req.raise_for_status()
        # Write the file in binary
        with open(target_path, 'wb') as video_file:
            for chunk in req.iter_content(chunk_size = 4000):
                video_file.write(chunk)
                
    return local_filename

In [75]:
# Connect to Reddit's API
# The credentials are expected to come from the praw.ini file (see the note in the import cell);
# presumably "bot1" is the name of the section to read from that file - TODO confirm
reddit = praw.Reddit("bot1", user_agent = "bot1")
# Check that the praw.ini file worked
# NOTE(review): the printed value is stored in the saved notebook output,
# clear the output before sharing the notebook if it identifies the account
print(reddit.user.me())
# Link to json file with streamable and reddit links as of Oct. 13, 2020
# https://www.reddit.com/r/PublicFreakout/top/.json?t=year

98147891230761


In [39]:
### # Select the public freakout subreddit
### sr1 = reddit.subreddit("PublicFreakout")
### # Select the top 3 (it is selecting top 3 of the year by default)
### top1 = sr1.top(limit = 3)
### 
### # Checking the contents of the top posts in "PublicFreakout"
### # Initialize arrays
### urls = []
### # Place the relevant info into the array from the selected top 3
### for i in top1:
###     urls.append(video_url_from_submission(i))

In [None]:
### # Downloading a reddit video graphics
### download_video(urls[1][0], audio = False)
### # Downloading a reddit video's audio
### download_video(urls[1][1], audio = True)
### # Downloading a streamable video
### download_video(urls[0][0])

In [92]:
# Download a dataset from the top posts of the subreddit

# Number of top posts to request from the listing
TOP_LIMIT = 250

# Select the public freakout subreddit
sr1 = reddit.subreddit("PublicFreakout")
# Select the top TOP_LIMIT posts
# NOTE(review): no time filter is passed, so the listing uses praw's default time filter;
# confirm that this is the "year" listing that the link in the connection cell refers to
top1 = sr1.top(limit = TOP_LIMIT)
# Initialize arrays
urls = []
# Place the relevant info into the array from the selected top posts
for submission in top1:
    # Progress counter: number of posts handled so far
    print(len(urls))
    try:
        urls.append( video_url_from_submission(submission) )
    except Exception as err:
        # Report the failure and keep going. Calling video_url_from_submission again here
        # would only raise the same error a second time and abort the whole loop.
        # An empty list is stored so that the position in urls still matches the post number.
        print("Error reading post number " + str(len(urls)) + ": " + repr(err))
        urls.append([])
print("urls appended")
# Download the videos
for post_number, post_urls in enumerate(urls):
    for file_number, file_url in enumerate(post_urls):
        # The first url of a post is the video, any following url is the audio
        try:
            download_video(file_url, audio = file_number > 0)
        except Exception as err:
            # Getting this HTTP error
            # 403 Client Error: Forbidden for url: https://v.redd.it/716d4vdxcqu41/audio?source=fallback
            # Also, some of the posts are not videos. When printing the length of the url, sometimes
            # it says "errorgfycat.com" or "errori.redd.it" for example.
            # Exception (not a bare except) so that interrupting the kernel still stops the loop
            print("Error with post number " + str(post_number) + ": " + repr(err))

print("files downloaded")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
errorgfycat.com
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
errorgfycat.com
73
74
errori.redd.it
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
errorgfycat.com
139
140
141
142
143
144
145
errori.imgur.com
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
erroryoutube.com
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
urls appended

In [94]:
# Inspect the urls collected for post number 23 of the saved run: its audio link is the one
# quoted with the 403 Forbidden error in the comments of the download cell.
# The index refers to that particular run, the listing may differ when the notebook is re-run.
urls[23]

['https://v.redd.it/716d4vdxcqu41/DASH_1080?source=fallback',
 'https://v.redd.it/716d4vdxcqu41/audio?source=fallback']

The code below is for reference only; it is not used directly for downloading video/audio data.

In [187]:
### # This entire code chunk is commented out. To uncomment it easily, move the cursor between the
### # third and fourth '#', after the space. Then, hold option and drag to the bottom of the
### # code block; this should form a straight cursor line that spans all of the rows.
### # delete the '#'. This works for re-commenting the code as well
### 
### # Example of checking the type of a submission within the listing generator
### # Initialize a list to store the submissions
### check1 = []
### # This creates a listing generator; this must be created every time you want to use top1
### top1 = sr1.top(limit = 3)
### # Loop through the submissions
### for i in top1:
###     # and append them to the initialized list above
###     check1.append(i)
### # Check that the initialized list is not empty
### print(check1)
### # Example of checking the type of a reddit submission
### print(type(check1[0]) == praw.models.reddit.submission.Submission)
### 
### # Check the urls
### print(urls)

[Submission(id='gtsaam'), Submission(id='gzvmsr'), Submission(id='gvdl01')]
True
[['https://cdn-cf-east.streamable.com/video/mp4/u2jzoo.mp4?Expires=1602904380&Signature=N3j3TOy0K7F9hAdVuliRJIIehHQCgapiNOwJ8Rffy2AGV3~jc8bjYTvcll~kUMdaMASw1htFNF2NZCpuyQFApnqZCk-arbMJM77JQlQwqWRrM6nwCOdCHzHtbKVB8CpP351Rndp8306q0x~isrQRoSh2BVfswSxmOp0WTfkVBRog~b34xKq0YedZe8aCiYfkbLVNtaIZb5h12HIwc~xVt1L6QnY0u3kh7ow7gstSbcvXapucOj7LkIXmniiWIgenTfF6yWTMlQlcI~~wjqgE2hPJdo8w29UpTH6OiV~vDByvIh7LU6AQ9hyhamDBoyUdXli4fLadS9ygRpfDFwqGMQ__&Key-Pair-Id=APKAIEYUVEN4EVB2OKEQ'], ['https://v.redd.it/w56rwny74y351/DASH_360?source=fallback', 'https://v.redd.it/w56rwny74y351/audio?source=fallback'], ['https://v.redd.it/q356rndwpj251/DASH_720?source=fallback', 'https://v.redd.it/q356rndwpj251/audio?source=fallback']]
