In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json, csv
from urllib.parse import urlencode

In [2]:
class UdemyScraper(scrapy.Spider):
    """Spider that collects Python course listings and prices from Udemy's JSON API.

    For every search page it requests the course list, then issues one
    follow-up request to the pricing endpoint for the course ids on that
    page, and finally appends one CSV row per course to ``output_path``.
    """

    name = "udemy_scraper"

    # custom headers
    # SECURITY NOTE(review): the bearer tokens and user id below are account
    # credentials committed in source. They should be rotated and loaded from
    # configuration/environment rather than hard-coded.
    headers = {
        "accept": "application/json, text/plain, */*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9,la;q=0.8",
        "authorization": "Bearer 91PeHG06c81yVFSLmxQ8AHblO64wCmawkxhJSSTv",
        "referer": "//www.udemy.com/courses/search/?p=1&q=python&src=ukw",
        "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"",
        "sec-ch-ua-mobile": "?1",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "x-udemy-authorization": "Bearer 91PeHG06c81yVFSLmxQ8AHblO64wCmawkxhJSSTv",
        "x-udemy-cache-brand": "NGen_US",
        "x-udemy-cache-campaign-code": "NEWMT60721",
        "x-udemy-cache-device": "mobile",
        "x-udemy-cache-language": "en",
        "x-udemy-cache-logged-in": "1",
        "x-udemy-cache-marketplace-country": "NG",
        "x-udemy-cache-modern-browser": "1",
        "x-udemy-cache-price-country": "NG",
        "x-udemy-cache-release": "64b2fa9bb42353b4c322",
        "x-udemy-cache-user": "153138494",
        "x-udemy-cache-version": "1"
    }

    # CSV columns, in output order; also the keys of every row dict
    column_names = ['title',
                    'url',
                    'instructors',
                    'objectives_summary',
                    'content_info',
                    'rating',
                    'num_reviews',
                    'price',
                    'list_price',
                    'discount_price',
                    'price_detail'
               ]

    # file that receives the header row and one row per scraped course
    output_path = "udemy_python_courses.csv"

    # courses_url
    courses_url = "https://www.udemy.com/api-2.0/search-courses/?"

    # string_query_parameters (template; 'p' is overridden per request)
    params = {
        'p': 1,
        'q': 'python',
        'skip_price': 'true'
    }

    # init constructor
    def __init__(self, *args, **kwargs):
        """Initialise the spider and (re)create the output CSV with its header.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so the spider can still be built through ``from_crawler``.
        """
        super().__init__(*args, **kwargs)
        # newline="" hands line-ending control to the csv module; without it
        # text-mode translation on Windows yields "\r\r\n" (blank rows).
        with open(self.output_path, "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerow(self.column_names)

    # crawlers_entry_point
    def start_requests(self):
        """Yield one search-API request per result page."""
        # NOTE(review): the referer header uses p=1; confirm whether p=0 is a
        # valid page or merely duplicates the first page of results.
        for page in range(0, 20):
            # build a per-request copy instead of mutating the shared
            # class-level ``params`` dict
            query = {**self.params, 'p': page}

            # crawl_next_page
            yield scrapy.Request(
                url=self.courses_url + urlencode(query),
                headers=self.headers,
                callback=self.parse_courses
            )

    # courses API call response callback function
    def parse_courses(self, response):
        """Parse a search response and request pricing for its courses.

        The parsed course list is passed to ``parse_pricings`` through the
        request ``meta`` under the ``'courses'`` key.
        """
        # parse json response
        courses = json.loads(response.text)['courses']
        if not courses:
            # nothing on this page, so a pricing request would be pointless
            return

        # extract courses id
        ids = [str(course['id']) for course in courses]

        # price_url
        prices_url = "https://www.udemy.com/api-2.0/pricing/?"

        # price API string query params
        price_params = {
            'course_ids': ','.join(ids),
            'fields[pricing_result]': 'price,discount_price,list_price,price_detail,price_serve_tracking_id'
        }
        # fetch course pricing
        yield scrapy.Request(
            url=prices_url + urlencode(price_params),
            headers=self.headers,
            meta={
                'courses': courses
            },
            callback=self.parse_pricings
        )

    @staticmethod
    def _optional_price_string(pricing, key):
        """Return ``pricing[key]['price_string']``, or 0 when it is unavailable."""
        try:
            return pricing[key]['price_string']
        except (KeyError, TypeError):
            # key absent, or the entry is null / not a mapping
            return 0

    # prices API call response callback function
    def parse_pricings(self, response):
        """Merge pricing data into the courses from ``meta`` and append CSV rows.

        Raises ``KeyError`` if a course has no pricing entry or lacks the
        mandatory ``price`` / ``list_price`` fields.
        """
        # get courses from meta container
        courses = response.meta.get('courses') or []

        # parse courses
        course_prices = json.loads(response.text)['courses']

        # open the file once per page rather than once per row;
        # newline="" as required by the csv module
        with open(self.output_path, "a", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, self.column_names)

            # map pricings to courses
            for course in courses:
                pricing = course_prices[str(course['id'])]

                # only the first visible instructor is recorded; tolerate
                # courses that list none instead of raising IndexError
                instructors = course.get('visible_instructors') or []
                if instructors:
                    instructor = "https://www.udemy.com/" + instructors[0]['display_name']
                else:
                    instructor = ''

                try:
                    objectives_summary = course['objectives_summary'][0]
                except (KeyError, IndexError, TypeError):
                    objectives_summary = 'No Objective Summary Available'

                # extracted features
                features = {
                    'title': course['title'],
                    'url': "https://www.udemy.com" + course['url'],
                    'instructors': instructor,
                    'objectives_summary': objectives_summary,
                    'content_info': course['content_info'],
                    'rating': course['rating'],
                    'num_reviews': course['num_reviews'],
                    'price': pricing['price']['price_string'],
                    'list_price': pricing['list_price']['price_string'],
                    'discount_price': self._optional_price_string(pricing, 'discount_price'),
                    'price_detail': self._optional_price_string(pricing, 'price_detail')
                }

                # write row to csv
                writer.writerow(features)

# script entry point
if __name__ == "__main__":
    # create a crawler process with default settings, register the spider
    # class with it, then start the crawl
    crawler_process = CrawlerProcess()
    crawler_process.crawl(UdemyScraper)
    crawler_process.start()

2021-06-11 06:10:31 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-06-11 06:10:31 [scrapy.utils.log] INFO: Versions: lxml 4.6.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1h  22 Sep 2020), cryptography 3.1.1, Platform Windows-10-10.0.17763-SP0
2021-06-11 06:10:31 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2021-06-11 06:10:31 [scrapy.crawler] INFO: Overridden settings:
{}
2021-06-11 06:10:32 [scrapy.extensions.telnet] INFO: Telnet Password: ecbc31a680b7590b
2021-06-11 06:10:32 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2021-06-11 06:10:38 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 's

2021-06-11 06:12:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/search-courses/?p=0&q=python&skip_price=true> (referer: //www.udemy.com/courses/search/?p=1&q=python&src=ukw)
2021-06-11 06:12:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/search-courses/?p=2&q=python&skip_price=true> (referer: //www.udemy.com/courses/search/?p=1&q=python&src=ukw)
2021-06-11 06:12:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/search-courses/?p=6&q=python&skip_price=true> (referer: //www.udemy.com/courses/search/?p=1&q=python&src=ukw)
2021-06-11 06:12:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/search-courses/?p=1&q=python&skip_price=true> (referer: //www.udemy.com/courses/search/?p=1&q=python&src=ukw)
2021-06-11 06:12:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/search-courses/?p=14&q=python&skip_price=true> (referer: //www.udemy.com/co

2021-06-11 06:13:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/pricing/?course_ids=629302%2C692188%2C1547268%2C666914%2C903378%2C1035472%2C1386294%2C836376%2C2563788%2C1641648%2C577248%2C1471348%2C919038%2C2538582%2C1198640%2C775330%2C2400176%2C2167996%2C1554180%2C1034284&fields%5Bpricing_result%5D=price%2Cdiscount_price%2Clist_price%2Cprice_detail%2Cprice_serve_tracking_id> (referer: //www.udemy.com/courses/search/?p=1&q=python&src=ukw)
2021-06-11 06:13:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.udemy.com/api-2.0/pricing/?course_ids=1814470%2C3326842%2C3874064%2C2728500%2C3091956%2C4076292%2C433798%2C1723848%2C2030684%2C1888116%2C636008%2C1874050%2C2333118%2C918364%2C3520854%2C3101866%2C3089442%2C2789636%2C3882292%2C1860738&fields%5Bpricing_result%5D=price%2Cdiscount_price%2Clist_price%2Cprice_detail%2Cprice_serve_tracking_id> (referer: //www.udemy.com/courses/search/?p=1&q=python&src=ukw)
2021-06-11 06:13:12 [scrapy.core.engine] DE