In [1]:
import pandas as pd
import requests  # URL 가져오는 좋은 라이브러리
from bs4 import BeautifulSoup  # HTML에서 원하는 정보 가져오기 좋은 라이브러리
import csv
from flask import Flask, render_template, request, redirect, send_file
from flask_cors import CORS
import os

def SO_get_last_page(URL):
    """Return the page count advertised by a Stack Overflow jobs search page.

    Args:
        URL: Address of the search-results page to download.

    Returns:
        The integer parsed from the second-to-last link inside the
        ``s-pagination`` element, or 0 when the page has no such element.
    """
    response = requests.get(URL)
    document = BeautifulSoup(response.text, 'lxml')
    pagination = document.find("div", {'class': 's-pagination'})
    if not pagination:
        return 0
    anchors = pagination.find_all("a")
    # The second-to-last anchor is read as the highest page number
    # (presumably the last anchor is a "next" link — verify against the page).
    return int(anchors[-2].get_text().strip())

def SO_extract_job(html):
    """Build a job record from one Stack Overflow job-listing element.

    Args:
        html: Parsed listing element. It must carry a ``data-jobid``
            attribute, an ``h2`` heading containing a link with a ``title``
            attribute, and an ``h3`` with exactly two direct ``span``
            children.

    Returns:
        A dict with the keys ``title``, ``company`` and ``link``.
    """
    heading = html.find("h2", {"class": "mb4 fc-black-800 fs-body3"})
    title = heading.find("a")['title']

    subheading = html.find("h3", {'class': 'fc-black-700 fs-body1 mb4'})
    # Exactly two direct spans are required by the unpacking; only the first
    # (company) is used. The second is presumably the location.
    company_span, _second_span = subheading.find_all("span", recursive=False)

    posting = {
        'title': title,
        'company': company_span.get_text(strip=True),
        'link': f"https://stackoverflow.com/jobs/{html['data-jobid']}",
    }
    return posting

def SO_extract_jobs(last_page,URL):
    """Download result pages 1..last_page and collect every job found on them.

    Args:
        last_page: Number of result pages to fetch.
        URL: Base search URL; ``&pg=<n>`` is appended for each page.

    Returns:
        A list of the dicts produced by ``SO_extract_job``, in page order.
    """
    collected = []
    for page_index in range(last_page):
        print(f"Scrapping SO Page: {page_index}")
        # The printed index is zero-based, while the requested page number is one-based.
        response = requests.get(f"{URL}&pg={page_index+1}")
        document = BeautifulSoup(response.text, 'lxml')
        cards = document.find_all("div", {'class': '-job'})
        collected.extend(SO_extract_job(card) for card in cards)
    return collected

def SO_get_jobs(word):
    """Scrape Stack Overflow job listings matching ``word``.

    Args:
        word: Search term inserted into the query string.

    Returns:
        The list of job dicts returned by ``SO_extract_jobs``.
    """
    search_url = f"https://stackoverflow.com/jobs?q={word}&sort=i"
    # NOTE(review): the page count is fetched but its value is discarded;
    # scraping is pinned to a single page by the constant below.
    SO_get_last_page(search_url)
    page_limit = 1
    return SO_extract_jobs(page_limit, search_url)


def remote_get_jobs(word):
    """Scrape remoteok.io job listings for ``word``.

    Args:
        word: Search term inserted into the listing URL path.

    Returns:
        A list of dicts with the keys ``company``, ``title`` and ``link``.
        Table rows that lack one of the expected elements are skipped.
    """
    # Browser-like request headers (presumably needed so the site serves the
    # regular HTML page to a script — verify).
    headers = {
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
    }
    URL = f"https://remoteok.io/remote-{word}-jobs"
    result = requests.get(URL, headers=headers)

    soup = BeautifulSoup(result.text, 'html.parser')
    jobs = []
    for row in soup.find_all('tr', {'class': 'job'}):
        try:
            company = row.select('tr > td > a > h3', limit=1)[0].text
            title = row.select('tr > td > a > h2', limit=1)[0].text
            link = "https://remoteok.io" + row.select('tr > td.source > a', limit=1)[0]['href']
        except (IndexError, KeyError):
            # IndexError: the selector matched nothing; KeyError: the anchor
            # has no href. Such rows are skipped; any other error propagates
            # instead of being silently swallowed.
            continue
        jobs.append({'company': company, 'title': title, 'link': link})
    return jobs

## wework
def wwork_get_jobs(word):
    """Scrape weworkremotely.com search results for ``word``.

    Args:
        word: Search term inserted into the query string.

    Returns:
        A list of dicts with the keys ``company``, ``title`` and ``link``;
        an empty list when the page has no ``jobs-container`` element.
    """
    url_wwr = f'https://weworkremotely.com/remote-jobs/search?term={word}'
    r = requests.get(url_wwr)
    soup = BeautifulSoup(r.text, 'html.parser')

    container = soup.find('div', class_='jobs-container')
    if container is None:
        # No results container on the page: report no jobs rather than
        # failing with AttributeError on None.
        return []

    jobs = []
    for article in container.find_all('article'):
        # The final "li > a" link of each article is dropped (presumably a
        # non-job "view all" link — verify against the page markup).
        for anchor in article.select('li > a')[:-1]:
            jobs.append({
                'company': anchor.find('span', class_='company').text,
                'title': anchor.find('span', class_='title').text,
                'link': 'https://weworkremotely.com' + anchor['href'],
            })
    return jobs

def save_to_file(jobs):
    """Write job records to ``jobs.csv`` in the current working directory.

    Args:
        jobs: Iterable of dicts holding ``title``, ``company`` and ``link``.

    Returns:
        Always 1.
    """
    # Columns are picked by key so every row lines up with the header,
    # whatever order the keys were inserted into each dict. A missing key
    # yields an empty cell.
    columns = ('title', 'company', 'link')
    # The context manager closes the file even if a write fails.
    with open('jobs.csv', mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Company", "Link"])
        for job in jobs:
            writer.writerow([job.get(column, '') for column in columns])
    return 1



# Flask application object; "SuperScrapper" is passed as its import name.
app = Flask("SuperScrapper")
# Enable CORS on the app and expose the custom "x-suggested-filename"
# response header to cross-origin clients.
CORS(app, expose_headers=["x-suggested-filename"])
# In-memory cache of scraped job lists, keyed by the lower-cased search word.
db = {}

@app.route("/")
def home():
    """Serve the landing page by rendering the ``potato.html`` template."""
    landing_page = render_template("potato.html")
    return landing_page

@app.route("/report")
def report():
    """Render the results page for the ``word`` query argument.

    Redirects to ``/`` when no word is given. Otherwise the lower-cased word
    is looked up in ``db``; when no non-empty list is cached, the three
    scrapers are run and their combined result is stored before rendering.
    """
    query = request.args.get("word")
    if not query:
        return redirect("/")

    query = query.lower()
    cached = db.get(query)
    if cached:
        found = cached
    else:
        # An empty cached list is falsy, so words with no results are
        # scraped again on every request.
        found = wwork_get_jobs(query) + SO_get_jobs(query) + remote_get_jobs(query)
        db[query] = found

    return render_template(
        'report.html',
        SearchingBy=query,
        resultNumber=len(found),
        jobs=found,
    )


@app.route('/export')
def export():
    """Send the cached jobs for the ``word`` query argument as a CSV download.

    Redirects to ``/`` when no word is given, when nothing is cached in
    ``db`` for the lower-cased word, or when writing or sending the file
    fails.

    Returns:
        The file response, carrying an ``x-suggested-filename`` header of
        ``<word>.csv``, or a redirect to ``/``.
    """
    word = request.args.get('word')
    if not word:
        return redirect('/')
    word = word.lower()
    jobs = db.get(word)
    if not jobs:
        return redirect('/')

    try:
        save_to_file(jobs)
        # save_to_file always writes "jobs.csv" in the working directory, so
        # that is the file to send; the per-word name travels in the header.
        # NOTE(review): this single file is shared by all requests — confirm
        # concurrent exports are not a concern.
        file_path = os.path.join(os.getcwd(), "jobs.csv")
        result = send_file(file_path,
                           mimetype="text/csv",
                           as_attachment=True,
                           conditional=False)
        result.headers["x-suggested-filename"] = f"{word}.csv"
        return result
    except Exception:
        # Keep the redirect-on-failure behaviour, but record the error so
        # defects are no longer hidden.
        app.logger.exception("Failed to export jobs for %r", word)
        return redirect('/')


# Start Flask's built-in server, listening on all network interfaces (0.0.0.0).
app.run(host="0.0.0.0")

 * Serving Flask app "SuperScrapper" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Dec/2020 00:56:33] "GET / HTTP/1.1" 200 -


Scrapping SO Page: 0


127.0.0.1 - - [28/Dec/2020 00:56:40] "GET /report?word=django HTTP/1.1" 200 -


In [None]:
# Reference snippet: send a file as an attachment and advertise a filename
# through the custom "x-suggested-filename" response header.
# NOTE(review): the bare `return` below is only valid inside a function
# (e.g. a Flask view); run as a standalone cell this is a SyntaxError.
result = send_file("/full/path/to/some/file",
                   mimetype="text/plain", # use appropriate type based on file
                   as_attachment=True,
                   conditional=False)
result.headers["x-suggested-filename"] = "use_this_filename.txt"
return result

In [None]:
!pip install -U flask-cors

In [None]:
import os

In [None]:
# Scratch check: absolute path of "test.csv" inside the current working directory.
os.path.join(os.getcwd(), f"test.csv")