In [10]:
# Import necessary packages
import numpy as np #numpy
import pandas as pd #pandas
import requests #requests
import bs4
from bs4 import BeautifulSoup #BeautifulSoup
import datetime #datetime
import re #regularexpressions

In [2]:
# Load the locally stored usage export (one row per NSIDC dataset id)
RAW_USAGE_CSV = '2015nsidcAll.csv'
nsidc_df_rawscrape = pd.read_csv(RAW_USAGE_CSV)

In [3]:
# Widen the column display limit so long scraped strings are not truncated
pd.options.display.max_colwidth = 150
# Preview the first five rows of the dataframe
nsidc_df_rawscrape.head(n=5)

Unnamed: 0,dataset_id,unique_users_ip
0,g02186,24447
1,g02135,22119
2,g00472,5862
3,nsidc-0081,5629
4,nsidc-0051,4526


In [4]:
# Build the series of NSIDC dataset ids that the scraping cells insert into each url.
# The ids are cast to text first and the cast is written back to the dataframe;
# surrounding whitespace is stripped only in the working copy `datasetid`.
dataset_id_text = nsidc_df_rawscrape['dataset_id'].astype('str')
nsidc_df_rawscrape['dataset_id'] = dataset_id_text
datasetid = dataset_id_text.str.strip()

In [5]:
# Date time code adapted from: http://www.saltycrane.com/blog/2008/06/how-to-get-current-date-and-time-in/

# BeautifulSoup code and requests code adapted from https://www.youtube.com/watch?v=f2h41uEi0xU 
# and https://www.youtube.com/watch?v=3xQTJi2tqgk

# Seconds to wait on nsidc.org before giving up; without a timeout requests
# can block forever on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 60

# One list per scraped field. Every list receives exactly ONE entry per
# dataset id so that all of them stay aligned with the dataframe rows.
spatial_coverage = []
data_format = []
scrape_date = []
scrape_time = []
spatial_resolution = []
temporal_coverage = []
temporal_resolution = []
parameters = []
platforms = []
sensors = []
version = []
contributors = []

# Each output list paired with the class attribute of the <td> that holds its value
SUMMARY_FIELDS = [
    (spatial_coverage, "views-field views-field-field-dataset-geo-coordinates views-column-odd views-column-first views-column-last"),
    (data_format, "views-field views-field-field-dataset-format views-column-odd views-column-first views-column-last"),
    (spatial_resolution, "views-field views-field-field-spatial-resolution-lat-lon views-column-odd views-column-first views-column-last"),
    (temporal_coverage, "views-field views-field-views-conditional-1 views-column-odd views-column-first views-column-last"),
    (temporal_resolution, "views-field views-field-field-dataset-temporal-resolutio views-column-odd views-column-first views-column-last"),
    (parameters, "views-field views-field-field-dataset-parameter-gcmd views-column-odd views-column-first views-column-last"),
    (platforms, "views-field views-field-field-dataset-platform views-column-odd views-column-first views-column-last"),
    (sensors, "views-field views-field-field-dataset-sensor data-sensors views-column-odd views-column-first views-column-last"),
    (version, "views-field views-field-field-dataset-version views-column-odd views-column-first views-column-last"),
    (contributors, "views-field views-field-views-conditional data-contributors views-column-odd views-column-first views-column-last"),
]


def _joined_td_text(soup, td_class):
    """Return the text of the <td> cells found for the class string td_class.

    The texts of all cells found are concatenated in document order: a single
    match gives that cell's text unchanged, no match gives the empty string.
    Returning one string (instead of appending once per match) keeps the
    output lists at one entry per dataset.
    """
    return "".join(cell.text for cell in soup.find_all("td", {"class": td_class}))


# Loop through the datasets using the dataset ids
for dataset in datasetid:
    # The url is dependent upon the specific dataset id
    url = "https://nsidc.org/data/" + dataset
    # Pull the webpage using requests
    r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Create a BeautifulSoup object to hold the content of the url
    soup = BeautifulSoup(r.content, 'lxml')
    # Append one value per field ("" when the page has no matching cell)
    for values, td_class in SUMMARY_FIELDS:
        values.append(_joined_td_text(soup, td_class))
    # Record when this dataset was scraped; a single now() call keeps the
    # date and time consistent with each other
    scraped_at = datetime.datetime.now()
    scrape_date.append(scraped_at.strftime("%Y-%m-%d"))
    scrape_time.append(scraped_at.strftime("%H:%M:%S%p"))

## Scrape Citation Information

In [6]:
# Date time code adapted from: http://www.saltycrane.com/blog/2008/06/how-to-get-current-date-and-time-in/

# BeautifulSoup code and requests code adapted from https://www.youtube.com/watch?v=f2h41uEi0xU 
# and https://www.youtube.com/watch?v=3xQTJi2tqgk

# Seconds to wait on nsidc.org before giving up; without a timeout requests
# can block forever on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 60

# Output lists; each receives exactly ONE entry per dataset id so that both
# stay aligned with the dataframe rows.
DOI = []
citation_date = []

# Links to the DOI resolver: http or https, with or without the legacy "dx." host prefix
DOI_HREF = re.compile(r'^https?://(?:dx\.)?doi\.org')


def _citation_date_texts(soup):
    """Collect the citation date text nodes from a parsed citation page.

    Looks at every div with class "views-field views-field-nothing", searches
    the second child of each for "date-display-single" spans, and gathers the
    text nodes of every such span.

    Returns a list holding one list of strings per span, or "" when no span
    was found anywhere on the page.
    """
    dates = []
    for block in soup.find_all("div", {"class": "views-field views-field-nothing"}):
        children = block.contents
        # The date is looked up in the second child only; skip divs that have
        # no second child or whose second child is a bare text node
        if len(children) < 2 or not isinstance(children[1], bs4.element.Tag):
            continue
        for span in children[1].find_all("span", {"class": "date-display-single"}):
            dates.append(span.find_all(text=True))
    return dates if dates else ""


# Loop through the datasets using the dataset ids
for dataset in datasetid:
    # The url is dependent upon the specific dataset id
    url = "https://nsidc.org/data/" + dataset + "?qt-data_set_tabs=1#qt-data_set_tabs"
    # Pull the webpage using requests
    r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Create a BeautifulSoup object to hold the content of the url
    soup = BeautifulSoup(r.content, 'lxml')

    # Text nodes of every DOI link on the page (one inner list per link), or "" if none
    doi_texts = [a.find_all(text=True) for a in soup.find_all('a', href=DOI_HREF)]
    DOI.append(doi_texts if doi_texts else "")

    # Citation date spans, or "" if none
    citation_date.append(_citation_date_texts(soup))

In [7]:
# Attach the values gathered by the two scraping cells above as new columns.
# The pairs are listed in the order the columns should appear in the dataframe.
scraped_columns = [
    ('scrape_date', scrape_date),
    ('scrape_time', scrape_time),
    ('data_format_original', data_format),
    ('contributors_original', contributors),
    ('spatial_coverage_original', spatial_coverage),
    ('spatial_resolution_original', spatial_resolution),
    ('temporal_coverage_original', temporal_coverage),
    ('temporal_resolution_original', temporal_resolution),
    ('parameters_original', parameters),
    ('platforms_original', platforms),
    ('sensors_original', sensors),
    ('version_original', version),
    ('doi_address', DOI),
    ('citation_date', citation_date),
]
for column_name, column_values in scraped_columns:
    nsidc_df_rawscrape[column_name] = column_values

In [8]:
# Captures everything that follows the first "V" (upper or lower case);
# compiled once instead of on every row
VERSION_PATTERN = re.compile("V(.*)$", re.I)


def _clean_version(raw):
    """Extract the version text from one scraped 'version_original' value.

    Newlines are removed from raw, then everything after the first "V"
    (case-insensitive) is returned, e.g. "\n V1" -> "1".

    Returns "" when raw is not text (for example NaN), when it contains no
    "V", or when the captured text is exactly "None".
    """
    try:
        flattened = re.sub("\n", "", raw)
    except TypeError:
        # raw is not a string, so there is nothing to parse
        return ""
    match = VERSION_PATTERN.search(flattened)
    if match is None:
        return ""
    found = match.group(1)
    return "" if found == "None" else found


# Clean every scraped version value, one result per dataframe row
version = [_clean_version(raw) for raw in nsidc_df_rawscrape['version_original']]

# Add column to dataframe with new data
nsidc_df_rawscrape['version_clean'] = version
# Check results
nsidc_df_rawscrape.head(2)

Unnamed: 0,dataset_id,unique_users_ip,scrape_date,scrape_time,data_format_original,contributors_original,spatial_coverage_original,spatial_resolution_original,temporal_coverage_original,temporal_resolution_original,parameters_original,platforms_original,sensors_original,version_original,doi_address,citation_date,version_clean
0,g02186,24447,2016-05-18,10:45:22AM,\nPNG\nESRI Shapefile\nNetCDF\nMicrosoft Excel\nKeyhole Markup Language (.kml)\nASCII Text (.txt)\nGeoTIFF\n,"\n Florence Fetterer, Pablo Clemente-Colón, Matthew Savoie, Sean Helfrich","\nN: 90, S: 0, E: 180, W: -180\n\n",\n4 km x 4 km\n1 km x 1 km\n,\n1 October 2006\n (updated daily),\n 1 day,\nSea Ice > Ice Edges\nSea Ice > Ice Extent\nSea Ice > Ice Growth/Melt\n,"\n ALOS, AQUA, DMSP, ENVISAT, ERS-2, GOES, MSG, NOAA POES, RADARSAT-2, SATELLITES","\nAMSR-E, AMSU-A, AMSU-B, ASAR, AVHRR, GOES I-M IMAGER, MODIS, PALSAR, SAR, SEVIRI, SSM/I",\n V1,[[http://dx.doi.org/10.7265/N5GT5K3K]],[[2010]],1
1,g02135,22119,2016-05-18,10:45:22AM,\nPNG\nASCII Text (.txt)\nESRI Shapefile\n,"\n F. Fetterer, Kenneth Knowles, Walt Meier, Matthew Savoie","\nN: -39.23, S: -90, E: 180, W: -180\n\nN: 90, S: 30.98, E: 180, W: -180\n\n",\n25 km x 25 km\n,\n26 October 1978\n (updated daily),\n 1 day,\nSea Ice > Ice Extent\nSea Ice > Ice Growth/Melt\nSea Ice > Sea Ice Concentration\n,"\n DMSP, DMSP 5D-3/F17, NIMBUS-7, SATELLITES","\nSMMR, SSM/I, SSMIS",\n V1,[[http://dx.doi.org/10.7265/N5QJ7F7W]],[[2002]],1


In [9]:
# Build the series of cleaned NSIDC version numbers that the metadata scrape
# inserts into each url (alongside the dataset id)
nsidc_df_rawscrape['version_clean'] = nsidc_df_rawscrape['version_clean'].astype('str')
version = nsidc_df_rawscrape['version_clean'].str.strip()
# Spot-check a few ids and versions. print() with a single argument gives the
# same output on Python 2 and Python 3, whereas the bare print statement is a
# SyntaxError on Python 3.
print(datasetid[1:5])
print(version[1:5])

1        g02135
2        g00472
3    nsidc-0081
4    nsidc-0051
Name: dataset_id, dtype: object
1    1
2    1
3    1
4    1
Name: version_clean, dtype: object


In [10]:
# BeautifulSoup code and requests code adapted from https://www.youtube.com/watch?v=f2h41uEi0xU 
# and https://www.youtube.com/watch?v=3xQTJi2tqgk

# Seconds to wait on nsidc.org before giving up; without a timeout requests
# can block forever on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 60

# Output lists; each receives exactly ONE entry per dataset id so that all of
# them stay aligned with the dataframe rows.
location_original = []
keyword_original = []
date_creation_original = []
last_updated_original = []
title_original = []


def _date_span_texts(soup, div_class):
    """Collect date text nodes from the divs found for the class string div_class.

    For every such div, the second child is searched for
    "date-display-single" spans and the text nodes of each span are gathered.

    Returns a list holding one list of strings per span, or "" when no span
    was found.
    """
    dates = []
    for block in soup.find_all("div", {"class": div_class}):
        children = block.contents
        # The date is looked up in the second child only; skip divs that have
        # no second child or whose second child is a bare text node
        if len(children) < 2 or not isinstance(children[1], bs4.element.Tag):
            continue
        for span in children[1].find_all("span", {"class": "date-display-single"}):
            dates.append(span.find_all(text=True))
    return dates if dates else ""


# Loop through the datasets, pairing each dataset id with its cleaned version number
for dataset, dataset_version in zip(datasetid, version):
    # The url is dependent upon the specific dataset id and version number
    url = "https://nsidc.org/data/" + dataset + "/versions/" + dataset_version + "/metadata"
    # Pull the webpage using requests
    r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Create a BeautifulSoup object to hold the content of the url
    soup = BeautifulSoup(r.content, 'lxml')

    # The first <h2> on the page is taken as the dataset title ("" if there is none)
    headings = soup.find_all('h2')
    title_original.append(headings[0].text if headings else "")

    # Location: text of the location field div(s) joined into one string ("" if none)
    location_divs = soup.find_all("div", {"class": "field field-name-field-dataset-location field-type-text field-label-above"})
    location_original.append("".join(div.text for div in location_divs))

    # Keywords: text of every "field-item" div inside the first keyword field div
    keyword_divs = soup.find_all("div", {"class": "field field-name-field-dataset-keyword field-type-text field-label-above"})
    if keyword_divs:
        keyword_items = keyword_divs[0].find_all("div", {"class": "field-item"})
        keyword_original.append([keyword.text for keyword in keyword_items])
    else:
        keyword_original.append("")

    # Creation and last-updated dates ("" when the page shows no date)
    date_creation_original.append(_date_span_texts(soup, "field field-name-field-dataset-dc-date field-type-datetime field-label-inline clearfix"))
    last_updated_original.append(_date_span_texts(soup, "field field-name-field-dataset-last-updated field-type-datetime field-label-inline clearfix"))

In [11]:
# Attach the lists filled by the metadata scraping cell above as new columns.
# The pairs are listed in the order the columns should appear in the dataframe.
metadata_columns = [
    ('location_original', location_original),
    ('keyword_original', keyword_original),
    ('date_creation_original', date_creation_original),
    ('last_updated_original', last_updated_original),
    ('title_original', title_original),
]
for column_name, column_values in metadata_columns:
    nsidc_df_rawscrape[column_name] = column_values

In [12]:
# Timestamp (date plus hour and minute) used to make the output file name unique
right_now = datetime.datetime.now()
nowtime = right_now.strftime("%Y-%m-%d_%H%M")
# Write the scraped dataframe to a csv named after that timestamp, without the index column
output_csv = ''.join(['nsidc_df_rawscrape_', nowtime, '.csv'])
nsidc_df_rawscrape.to_csv(output_csv, index=False, encoding='utf-8')