-
Notifications
You must be signed in to change notification settings - Fork 2.3k
/
extract.py
201 lines (159 loc) · 5.63 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# -*- coding: utf-8 -*-
"""This module contains all non-cipher related data extraction logic."""
import json
from collections import OrderedDict
from pytube.compat import HTMLParser
from pytube.compat import quote
from pytube.compat import urlencode
from pytube.exceptions import RegexMatchError
from pytube.helpers import regex_search
class PytubeHTMLParser(HTMLParser):
    """HTML parser that collects the text of the video description.

    Text is accumulated while the parser is inside a ``<p>`` element whose
    ``id`` attribute equals ``eow-description``; everything else is ignored.
    """

    # True while the parser is inside the description paragraph.
    in_vid_descr = False
    # Text gathered from the description paragraph so far.
    vid_descr = ''

    def handle_starttag(self, tag, attrs):
        """Start collecting when the description ``<p>`` element opens."""
        if tag != 'p':
            return
        for pair in attrs:
            if pair[0] == 'id' and pair[1] == 'eow-description':
                self.in_vid_descr = True

    def handle_endtag(self, tag):
        """Stop collecting on a closing ``</p>`` seen while collecting."""
        if tag != 'p' or not self.in_vid_descr:
            return
        self.in_vid_descr = False

    def handle_data(self, data):
        """Append text data to ``vid_descr`` while collecting is active."""
        if not self.in_vid_descr:
            return
        self.vid_descr = self.vid_descr + data
def is_age_restricted(watch_html):
    """Tell whether the watch page marks the content as age restricted.

    The check looks for the ``og:restrictions:age`` marker in the page.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``True`` when the marker is found, ``False`` when the search raises
        :class:`RegexMatchError`.
    """
    try:
        regex_search(r'og:restrictions:age', watch_html, group=0)
    except RegexMatchError:
        restricted = False
    else:
        restricted = True
    return restricted
def video_id(url):
    """Pull the ``video_id`` out of a YouTube url.

    The following url shapes are supported:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id (the first capture group of the pattern).
    """
    # An id is 11 characters from [0-9A-Za-z_-], preceded by ``v=`` or ``/``.
    id_pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    return regex_search(id_pattern, url, group=1)
def watch_url(video_id):
    """Build a clean YouTube watch url for the given video id.

    :param str video_id:
        A YouTube video identifier.
    :rtype: str
    :returns:
        ``https://youtube.com/watch?v=`` followed by ``video_id``.
    """
    prefix = 'https://youtube.com/watch?v='
    return prefix + video_id
def embed_url(video_id):
    """Build the YouTube embed url for the given video id.

    :param str video_id:
        A YouTube video identifier.
    :rtype: str
    :returns:
        ``https://www.youtube.com/embed/`` followed by ``video_id``.
    """
    template = 'https://www.youtube.com/embed/{}'
    return template.format(video_id)
def eurl(video_id):
    """Build the ``youtube.googleapis.com`` url for the given video id.

    :param str video_id:
        A YouTube video identifier.
    :rtype: str
    :returns:
        ``https://youtube.googleapis.com/v/`` followed by ``video_id``.
    """
    template = 'https://youtube.googleapis.com/v/{}'
    return template.format(video_id)
def video_info_url(
    video_id, watch_url, watch_html, embed_html,
    age_restricted,
):
    """Build the ``get_video_info`` request url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page; only read when the video is
        not age restricted.
    :param str embed_html:
        The html contents of the embed page; only read when the video is
        age restricted.
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with the url-encoded GET
        parameters appended.
    """
    if age_restricted:
        # The ``sts`` number is scraped from the embed page.
        sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
        pairs = [
            ('video_id', video_id),
            ('eurl', eurl(video_id)),
            ('sts', sts),
        ]
    else:
        # ``t`` is scraped from the watch page; what it represents is not
        # established by this module.
        # NOTE(review): ``group=0`` presumably selects the whole match
        # rather than the captured value -- confirm against regex_search.
        t = regex_search(
            r'\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]', watch_html,
            group=0,
        )
        pairs = [
            ('video_id', video_id),
            ('el', '$el'),
            ('ps', 'default'),
            ('eurl', quote(watch_url)),
            ('hl', 'en_US'),
            ('t', quote(t)),
        ]
    # ``OrderedDict`` keeps the parameter order stable, so the resulting
    # url is the same on every supported Python version.
    query = urlencode(OrderedDict(pairs))
    return 'https://youtube.com/get_video_info?' + query
def js_url(html, age_restricted=False):
    """Build the url of the base JavaScript file.

    The path is read from the ``['assets']['js']`` entry of the player
    configuration and appended to ``https://youtube.com``. The base
    JavaScript contains the decipher "transforms".

    :param str html:
        The html contents of the page, passed on to
        :func:`get_ytplayer_config`.
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        Url of the base JavaScript file.
    """
    config = get_ytplayer_config(html, age_restricted)
    js_path = config['assets']['js']
    return 'https://youtube.com' + js_path
def mime_type_codec(mime_type_codec):
    """Split a serialized ``type`` value into mime type and codecs.

    The ``type`` key of the manifest holds the mime type and the codecs in
    one string; this separates them.

    **Example**:

    >>> mime_type_codec('audio/webm; codecs="opus"')
    ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs, each codec stripped of
        surrounding whitespace.
    """
    type_pattern = r'(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"'
    mime_type, raw_codecs = regex_search(
        type_pattern, mime_type_codec, groups=True,
    )
    # Codecs arrive as one comma-separated string.
    codec_list = []
    for codec in raw_codecs.split(','):
        codec_list.append(codec.strip())
    return mime_type, codec_list
def get_ytplayer_config(html, age_restricted=False):
    """Extract and decode the YouTube player configuration from the html.

    The configuration is json data embedded within the page and serves as
    the primary source of obtaining the stream manifest data. The pattern
    used to locate it depends on ``age_restricted``.

    :param str html:
        The html contents of the page to search.
    :param bool age_restricted:
        Is video age restricted.
    :returns:
        The value produced by :func:`json.loads` for the matched json text
        (callers in this module index it like a ``dict``).
    """
    # Default: the ``ytplayer.config = {...};`` assignment.
    pattern = r';ytplayer\.config\s*=\s*({.*?});'
    if age_restricted:
        # Age restricted: the config sits inside a ``yt.setConfig`` call.
        pattern = r";yt\.setConfig\(\{'PLAYER_CONFIG':\s*({.*})(,'EXPERIMENT_FLAGS'|;)"  # noqa: E501
    return json.loads(regex_search(pattern, html, group=1))
def get_vid_descr(html):
    """Return the video description text found in the html.

    :param str html:
        The html contents to feed to :class:`PytubeHTMLParser`.
    :rtype: str
    :returns:
        The text the parser collected in ``vid_descr``; empty when no
        description paragraph was seen.
    """
    parser = PytubeHTMLParser()
    parser.feed(html)
    return parser.vid_descr