In [1]:
%run utils.ipynb
import threading
import requests
import validators
import time
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import random

# Folder that holds the per-site link CSVs; change DIRECTORY to use another crawler's output.
DIRECTORY = "classifier_crawler/links/"

# HTTP headers passed to requests.get when pages are downloaded.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15',
}

# Full paths of the link CSVs to process. Comment a name in or out to
# include or skip that site.
FILE_NAMES = [
    "{}{}".format(DIRECTORY, csv_name)
    for csv_name in (
        "amazon.csv",
        "avenida.csv",
        # "havan.csv",
        "cissamagazine.csv",
        # "ibyte.csv",
        "kabum.csv",
        # "magazineluiza.csv",
        "ricardoeletro.csv",
        # "taqi.csv",
        # "colombo.csv",
    )
]

# Shared lock: download threads hold it while appending their line to stats.csv.
lock = threading.Lock()

In [2]:
#download all the pages from the .csv
def download_pages(file, timeout=30):
    """Download every page listed in one site's link CSV and record the outcome.

    For each row of the CSV the link is fetched with the module-level
    ``header``. Responses with status 200 are written through ``save_file``
    as ``pages/<site>/page_<id>.html``; every other link (non-200 status,
    or any error while fetching or saving) is collected and written to
    ``invalid_pages/<site>.csv``. A ``site,valid,total`` line is appended to
    ``./stats.csv`` while holding the module-level ``lock``.

    Args:
        file: Path to a CSV file with a "links" column and an "id" column.
            The site name is the file name without directory and extension.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; a
            request that exceeds it is recorded as an invalid link instead
            of blocking forever.

    Returns:
        None. Results are written to disk and a summary is printed.
    """
    LINK_COLUMN = "links"
    ID_COLUMN = 'id'
    PAGE_NAME = "page_{}.html"
    # basename/splitext instead of splitting the whole path on "." so that
    # paths with an earlier dot (e.g. "./links/amazon.csv") still yield "amazon".
    SITE_NAME = os.path.splitext(os.path.basename(file))[0]
    PAGES_DIRECTORY = "pages/" + SITE_NAME
    INVALID_PAGES_DIRECTORY = "invalid_pages/"

    links_df = pd.read_csv(file)
    links = links_df[LINK_COLUMN]
    identifiers = links_df[ID_COLUMN]
    valid_links = 0
    total_links = len(links)
    invalid_pages = []

    print("Starts downloading from: {}".format(file))

    # tqdm wraps the first iterable so the bar is exhausted (and closed)
    # exactly when the loop ends.
    for link, identifier in zip(tqdm(links), identifiers):
        try:
            req = requests.get(link, headers=header, timeout=timeout)
            if req.status_code == 200:
                save_file(req.text, PAGES_DIRECTORY, PAGE_NAME.format(identifier))
                # Count the page only once it has actually been saved, so a
                # failed save is not reported as both valid and invalid.
                valid_links += 1
            else:
                invalid_pages.append((identifier, link))
        except Exception:
            # Best effort: any fetch/save failure marks the link invalid.
            # KeyboardInterrupt/SystemExit are deliberately not caught so the
            # download can still be interrupted.
            invalid_pages.append((identifier, link))

        # Random 1-3 s pause after every request (presumably to avoid
        # hammering the site -- confirm before changing).
        time.sleep(random.randint(1, 3))

    # Save the invalid links to a .csv file. exist_ok avoids a
    # FileExistsError when several download threads reach this point together.
    os.makedirs(INVALID_PAGES_DIRECTORY, exist_ok=True)
    invalid_df = pd.DataFrame(invalid_pages, columns=['id', 'links'])
    invalid_df.to_csv((INVALID_PAGES_DIRECTORY + SITE_NAME + '.csv'), header=True, index=False, encoding='utf-8')

    # Append this site's stats to the shared stats file; the lock keeps
    # concurrent threads from interleaving their writes.
    with lock:
        content = "{},{},{}\n".format(SITE_NAME, valid_links, total_links)
        save_file(content, "./", "stats.csv", mode="a")

    print("Finished downloading from: {}".format(file))
    print("Valid links/Total links: {}/{}".format(valid_links, total_links))

# Parallel downloads

In [3]:
# Start one download thread per site CSV. Starts are staggered by 0.2 s.
threads = []
for file in tqdm(FILE_NAMES):
    thread = threading.Thread(target=download_pages, args=(file,))
    threads.append(thread)
    thread.start()
    time.sleep(0.2)

# Wait for every download to finish before the cell returns, so later cells
# never run while pages and stats.csv are still being written.
for thread in threads:
    thread.join()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Starts downloading from: classifier_crawler/links/amazon.csv


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts downloading from: classifier_crawler/links/avenida.csv


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts downloading from: classifier_crawler/links/cissamagazine.csv


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts downloading from: classifier_crawler/links/kabum.csv


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts downloading from: classifier_crawler/links/ricardoeletro.csv


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Finished downloading from: classifier_crawler/links/cissamagazine.csv
Valid links/Total links: 936/1000
Finished downloading from: classifier_crawler/links/avenida.csv
Valid links/Total links: 1000/1000
Finished downloading from: classifier_crawler/links/kabum.csv
Valid links/Total links: 1000/1000

Finished downloading from: classifier_crawler/links/amazon.csv
Valid links/Total links: 982/1000

Finished downloading from: classifier_crawler/links/ricardoeletro.csv
Valid links/Total links: 772/1000


# Single download

In [None]:
# Download one site only: the first entry of FILE_NAMES.
# Note: despite its name, `url` holds the path of a link CSV, not a URL.
url = FILE_NAMES[0]
download_pages(file=url)