# Job Scraping Test - Using Requests Library

## 1. Download HTML file for job list

In [4]:
# Import the required library
import requests

# Fetch the LinkedIn job-search results page (HTML, not a zip archive).
# NOTE(review): "keywords=20data%20analyst" looks like a mangled percent-encoding
# (a bare "20" where "%20" or a leading search word was probably intended) —
# confirm the intended search terms; the URL is left exactly as it was.
response = requests.get(
    'https://www.linkedin.com/jobs/search?keywords=20data%20analyst&location=Florianopolis&refresh=true',
    timeout=30,  # requests applies no timeout by default; a stalled server would hang the cell
)

# Print the status code
print(response.status_code)

# Abort on a 4xx/5xx answer so an error page is never saved as if it were job data
response.raise_for_status()

# Save the raw response bytes locally
local_path = "job-list.html"
with open(local_path, "wb") as f:
    f.write(response.content)

200


## 2. Download HTML file for a single job

In [23]:
# Fetch one page as HTML (not a zip archive).
# NOTE(review): this URL targets /signup/cold-join with the job search passed as
# session_redirect, so the saved page is presumably LinkedIn's sign-up form rather
# than a job posting — confirm and replace with a real job URL if so.
response = requests.get(
    'https://www.linkedin.com/signup/cold-join?source=jobs_registration&session_redirect=https%3A%2F%2Fwww.linkedin.com%2Fjobs%2Fsearch%3Fkeywords%3Djunior%2520data%2520analyst%26location%3DFlorianopolis%26refresh%3Dtrue&trk=public_jobs_nav-header-join',
    timeout=30,  # requests applies no timeout by default; a stalled server would hang the cell
)

# Print the status code
print(response.status_code)

# Abort on a 4xx/5xx answer so an error page is never saved as if it were job data
response.raise_for_status()

# Save the raw response bytes locally
local_path = "job0001.html"
with open(local_path, "wb") as f:
    f.write(response.content)

200


# ETL Flow Template - DataCamp: ETL in Python

## 1. Download ZIP file
*[Original Database Source: Property Services Regulatory Authority](https://www.propertypriceregister.ie/)*

In [13]:
# Import the required library
import requests

# Get the zip file
response = requests.get(
    'https://assets.datacamp.com/production/repositories/5899/datasets/19d6cf619d6a771314f0eb489262a31f89c424c2/ppr-all.zip',
    timeout=60,  # requests applies no timeout by default; a stalled server would hang the cell
)

# Print the status code
print(response.status_code)

# Abort on a 4xx/5xx answer: otherwise an HTML error body would be written to
# disk under a .zip name and only fail later when the archive is opened
response.raise_for_status()

# Save the raw response bytes locally (plain string: there is nothing to interpolate)
local_path = "PPR-ALL.zip"
with open(local_path, "wb") as f:
    f.write(response.content)

200


## 2. Explore ZIP file

In [14]:
# Import the required method
from zipfile import ZipFile

# Open the downloaded archive read-only, show what it contains,
# then unpack its first member and show the path returned by extract()
with ZipFile('PPR-ALL.zip', mode="r") as archive:
    file_names = archive.namelist()
    print(file_names)

    first_member = file_names[0]
    csv_file_path = archive.extract(first_member)
    print(csv_file_path)

['ppr-all.csv']
/home/marianne/code/web_scraping_jobs/ppr-all.csv


## 3. Read from a CSV file

In [17]:
import csv
from pprint import pprint

# Open the csv file in read mode.
# newline="" hands line-ending handling to the csv module, as its documentation
# requires for file objects; without it, newlines embedded in quoted fields may
# not be interpreted correctly.
with open('ppr-all.csv', mode="r", encoding="windows-1252", newline="") as csv_file:
    # Wrap csv_file so that each row is a dictionary keyed by the header line
    reader = csv.DictReader(csv_file)

    # Print the type and content of the first data row
    row = next(reader)
    print(type(row))
    pprint(row)

<class 'dict'>
{'Address': '16 BURLEIGH COURT, BURLINGTON ROAD, DUBLIN 4',
 'County': 'Dublin',
 'Date of Sale (dd/mm/yyyy)': '03/01/2021',
 'Description of Property': 'Second-Hand Dwelling house /Apartment',
 'Postal Code': 'Dublin 4',
 'Price (€)': '€450,000.00'}


## 4. Write to CSV

In [21]:
# Mapping from each header of the source CSV to its snake_case replacement.
# Built from ordered pairs: the insertion order is kept by dict() and is the
# order in which the columns are written when this mapping is used as fieldnames.
new_column_names = dict(
    [
        ("Date of Sale (dd/mm/yyyy)", "date_of_sale"),
        ("Address", "address"),
        ("Postal Code", "postal_code"),
        ("County", "county"),
        ("Price (€)", "price"),
        ("Description of Property", "description"),
    ]
)

In [22]:
import csv

# Copy 'ppr-all.csv' into a new file whose header line uses the renamed columns.
# Both files are opened with newline="" as the csv module documentation requires;
# on the writer side this prevents "\r\r\n" line endings (blank rows) on Windows.
with open('ppr-all.csv', mode="r", encoding="windows-1252", newline="") as reader_csv_file:
    reader = csv.DictReader(reader_csv_file)
    # The new file is called "ppr-all.csv-new-headers.csv"
    # and is created in the current working directory
    with open(
        "ppr-all.csv-new-headers.csv",
        mode="w",
        encoding="windows-1252",
        newline="",
    ) as writer_csv_file:
        # fieldnames are the ORIGINAL headers (the mapping's keys), so the rows
        # produced by the reader can be written without re-keying them
        writer = csv.DictWriter(writer_csv_file, fieldnames=new_column_names)
        # Header line: writing the mapping itself emits its values, i.e. the
        # new column names, in fieldnames order
        writer.writerow(new_column_names)
        # Write all remaining rows of the source file
        writer.writerows(reader)