## Introduction

This IPython notebook illustrates how to crawl the website https://portal.edirepository.org/nis/home.jsp to obtain the limnology data files. First, we need to import bs4 and the other required libraries.

In [11]:
# import required libraries
import csv
import os
import re
import shutil
import urllib.request

import requests
from bs4 import BeautifulSoup

Next, initialize a requests session and use it to call the parent URL with the search term. Then loop through the result pages with the same session and collect all the data package links pertaining to limnology.

In [None]:
# Paging parameters for the search results. 307 is the hard-coded result count this
# notebook was written against; update it if the portal returns a different total.
TOTAL_RESULTS = 307
PAGE_SIZE = 10
# Seconds to wait for the server; without a timeout a stalled connection hangs the cell.
REQUEST_TIMEOUT = 60

# Relative package links as they appear in the result pages. Written as a raw string so
# that the regex escapes (\?, \w, ...) are not parsed as invalid Python string escapes.
PACKAGE_LINK_RE = re.compile(r'mapbrowse\?packageid=[\w\-\.]*')

# Initializing session
s = requests.Session()
# Making call to parent url with session s. The page requests below carry no search
# term of their own, so presumably the portal remembers the search in the session.
search_response = s.post(
    'https://portal.edirepository.org/nis/browseServlet?searchValue=limnology',
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly on an HTTP error instead of silently collecting zero links.
search_response.raise_for_status()

links = []
for i in range(0, TOTAL_RESULTS, PAGE_SIZE):
    url = ('https://portal.edirepository.org/nis/simpleSearch?start=' + str(i)
           + '&rows=' + str(PAGE_SIZE) + '&sort=score,desc')
    html_child = s.post(url, timeout=REQUEST_TIMEOUT)
    html_child.raise_for_status()
    html_childContent = html_child.content.decode('utf8')
    new_links = PACKAGE_LINK_RE.findall(html_childContent)
    # extend() appends in place rather than rebuilding the whole list on every page.
    links.extend(new_links)

In [4]:
# Total number of package-link matches collected by the crawl, before de-duplication.
len(links)

614

Filter out the duplicate links and prefix each remaining link with the portal's base URL so that it can be requested directly.

In [7]:
# obtain the unique links
# The collected matches are relative URLs, so each one is prefixed with the portal base
# URL. Building a set drops the repeated matches. (A csv.reader is not needed here: it
# only happened to work because the links contain no commas or quotes.)
unique_links = {'https://portal.edirepository.org/nis/' + link for link in links}

# Display the de-duplicated links together with their count.
unique_links, len(unique_links)

({'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.101.2',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.104.1',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.118.3',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.119.2',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.12.1',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.120.2',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.121.1',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.122.1',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.126.2',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.13.1',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.133.2',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.14.1',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.15.5',
  'https://portal.edirepository.org/nis/mapbrowse?packageid=edi.18.1

In [8]:
# Freeze the set into a list for iteration. Sorting makes the order reproducible:
# the iteration order of a set of strings can change from one kernel run to the next.
unique_links_list = sorted(unique_links)
len(unique_links_list)

307

In [6]:
# thefile=open("final_links.txt",'w')
# for item in unique_links_list:
#     thefile.write("%s\n" % item)

Iterate through the package links and collect the download links of the data files listed on each page.

In [12]:
#Iterate through list of links and get the download links

# Relative download links as they appear in a package page. Written as a raw string so
# that the regex escapes are not parsed as invalid Python string escapes.
DOWNLOAD_LINK_RE = re.compile(r'dataviewer\?packageid=[\w\-\.]*\&entityid=\w+')

download_links = []
for link in unique_links_list:
    # The context manager closes the HTTP connection even if reading or decoding fails.
    with urllib.request.urlopen(link) as website:
        html = website.read().decode('utf8')
    each_download_link = DOWNLOAD_LINK_RE.findall(html)
    # extend() appends in place rather than rebuilding the whole list for every page.
    download_links.extend(each_download_link)

In [13]:
# Total number of download-link matches collected, before de-duplication.
len(download_links)

702

Filter out the duplicates among the download links and prefix each remaining link with the portal's base URL so that it can be requested directly.

In [10]:
#unique downloadable links of the csv/non-csv files
# The matches are relative URLs, so each one is prefixed with the portal base URL.
# The set drops repeated matches; sorting turns it into a list whose order is the
# same on every run (plain set iteration order can differ between kernel runs).
downloadable_links = sorted(
    {'https://portal.edirepository.org/nis/' + link for link in download_links}
)

In [11]:
# Number of distinct download URLs that the download loop will iterate over.
len(downloadable_links)

449

In [49]:
#Initialize the counts of csv and non-csv files

count_csv = 0
count_noncsv = 0
count_failed = 0
count_zip = 0

# Create the destination folders up front so that a missing folder is not reported
# as a failed download for every single file.
for folder in ('Limnology/csv-files', 'Limnology/zip-files', 'Limnology/noncsv-files'):
    os.makedirs(folder, exist_ok=True)

for each in downloadable_links:
    try:
        # One request serves both the headers and the body (the file is no longer
        # fetched twice), and the context manager always closes the connection.
        with urllib.request.urlopen(each) as response:
            # Get filename of the url that is to be downloaded. It is read from the
            # Content-Disposition header by name instead of relying on that header
            # being the 4th one and on a fixed-width "attachment; filename=" prefix.
            filename = response.headers.get_filename()
            # Keep only the last path component so that a server-supplied name
            # cannot place the file outside the target folder.
            filename = os.path.basename((filename or '').replace('\\', '/'))
            if not filename:
                raise ValueError('response does not name a file')

            # Check from the filename extension (case-insensitively) what it is.
            # Testing the suffix rather than a substring keeps e.g. "x.csv.zip"
            # out of the csv folder.
            lowered = filename.lower()
            if lowered.endswith('.csv'):
                file_kind = 'csv'
                new_file_name = 'Limnology/csv-files/'+filename
            elif lowered.endswith('.zip'):
                file_kind = 'zip'
                new_file_name = 'Limnology/zip-files/'+filename
            else:
                file_kind = 'noncsv'
                new_file_name = 'Limnology/noncsv-files/'+filename

            #Download the files in appropriate folder
            with open(new_file_name, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
    except Exception:
        # Best effort: record the failure and move on to the next link.
        # "except Exception" (not a bare except) lets Ctrl-C / kernel interrupt through.
        print("Response Error")
        count_failed+=1
    else:
        # Count only after the file was written, so that a failed download is
        # counted once (as failed) and the four counters add up to the link count.
        if file_kind == 'csv':
            count_csv+=1
        elif file_kind == 'zip':
            count_zip+=1
        else:
            count_noncsv+=1
  
   

Response Error
Response Error
Response Error
Response Error
Response Error
Response Error
Response Error
Response Error
Response Error


In [50]:
# Report every tally on its own line, finishing with the grand total.
grand_total = count_noncsv + count_csv + count_failed + count_zip
for tally in (count_noncsv, count_csv, count_zip, count_failed, grand_total):
    print(tally)

226
213
1
9
449


Iterate through the folder containing the zipped files, extract them, and retrieve the CSV files present in them.

In [51]:
import zipfile
import os  

#Unzipping all the .zip files
zip_dir = 'Limnology/zip-files'
extract_dir = 'Limnology/unzipped-files'
# Make sure the extraction folder exists even when there is nothing to extract.
os.makedirs(extract_dir, exist_ok=True)

for file in os.listdir(zip_dir):
    # os.listdir returns bare file names, so the folder has to be joined back on;
    # otherwise the name is resolved against the current working directory and
    # no archive is ever found.
    zip_path = os.path.join(zip_dir, file)
    if os.path.isfile(zip_path):
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)


In [52]:
#Separating out the csv files from them
# Every entry of the unzipped folder is moved to the csv or non-csv folder and the
# matching running counter is incremented.
for file in os.listdir('Limnology/unzipped-files'):
    source_path = os.path.join('Limnology/unzipped-files', file)
    # Decide by the extension, case-insensitively. A substring test would treat
    # e.g. "x.csv.xml" as csv and would miss mixed-case extensions such as ".Csv".
    if file.lower().endswith('.csv'):
        #move the file to csv folder
        os.rename(source_path, os.path.join('Limnology/csv-files', file))
        count_csv+=1
    else:
        #move the file to non-csv folder
        os.rename(source_path, os.path.join('Limnology/noncsv-files', file))
        count_noncsv+=1
        


In [53]:
# Report every tally on its own line, finishing with the grand total.
grand_total = count_noncsv + count_csv + count_failed + count_zip
for tally in (count_noncsv, count_csv, count_zip, count_failed, grand_total):
    print(tally)


2343
215
1
9
2568
