In [1]:
!pip install bs4


import requests
from bs4 import BeautifulSoup

import pandas as pd
import re

import time
from IPython.display import clear_output


clear_output()

In [2]:
# Base URL of the catalog listing; get_all_pages appends '?page=<n>' to it.
url_prefix = 'https://catalog.data.gov/dataset'
# Number of dataset titles to collect.
total_data = 22000
# Titles returned per listing page (1100 pages yielded 22000 titles in the recorded run).
results_per_page = 20
# Derive the page count from total_data so the two can never drift apart.
# Integer division keeps the original truncating behaviour: a trailing partial
# page is not requested when total_data is not a multiple of results_per_page.
pages = total_data // results_per_page

print(f'pages = {pages}')


In [3]:
def get_docs_by_page(url, delay=3, timeout=30):
    """
    Download one listing page and return the dataset headings found on it.

    url     - full URL of the page to fetch
    delay   - seconds to sleep after the request, to throttle successive calls
    timeout - seconds to wait for the server before requests gives up

    Returns a list of strings, one per element carrying the CSS class
    "dataset-heading", each with its first and last character removed.

    Raises requests.exceptions.RequestException on connection failures,
    timeouts, or an HTTP error status (4xx/5xx).
    """
    # Without a timeout a single stalled connection would block the whole scrape.
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page and silently returning no titles.
    r.raise_for_status()
    time.sleep(delay)
    r.encoding = "UTF-8"

    soup = BeautifulSoup(r.text, 'html.parser')
    # [1:-1] drops the first and last character of each heading's text
    # (presumably the newlines that wrap the title link -- TODO confirm against the page markup).
    return [
        dataset_block.get_text()[1:-1]
        for dataset_block in soup.find_all(class_="dataset-heading")
    ]


def get_all_pages(first_page, last_page, url):
    """
    Collect the dataset headings from a range of listing pages.

    first_page - number of the first page to fetch (inclusive)
    last_page  - number of the last page to fetch (inclusive)
    url        - base listing URL; '?page=<n>' is appended for each page

    Returns a flat list with the headings of all fetched pages, in page order.
    An empty range (first_page > last_page) returns an empty list.
    Progress is printed while the pages are being fetched.
    """
    data = []
    print(f'page number from {first_page} to {last_page}')

    page_number = None  # stays None when the range is empty
    for page_number in range(first_page, last_page + 1):
        # '\r' rewrites the same output line so progress does not flood the cell.
        print(f'page number: {page_number}', end='\r')
        # Build the address from the `url` argument (not a global) and keep it in
        # a separate name so the argument itself is never overwritten.
        page_url = url + '?page=' + str(page_number)
        data.extend(get_docs_by_page(page_url))

    # Re-print the final page with a newline so it survives after the loop;
    # skipped when no page was requested.
    if page_number is not None:
        print(f'page number: {page_number}')
    print(f'len(data) = {len(data)}')

    return data

In [4]:
%%time

additional_gov_datasets_popular = get_all_pages(1, pages, url_prefix)

additional_gov_datasets_popular = pd.DataFrame(additional_gov_datasets_popular, columns=['title'])
additional_gov_datasets_popular.head()

page number from 1 to 1100
page number: 1100
len(data) = 22000
CPU times: user 2min 57s, sys: 2.58 s, total: 2min 59s
Wall time: 1h 22min 52s


Unnamed: 0,title
0,Department for the Aging (DFTA) Geriatric Ment...
1,Low-altitude aerial imagery obtained with unma...
2,Forestry Planting Spaces
3,2006 - 2011 NYS Math Test Results by Grade - C...
4,High Operational Temperature MWIR detectors wi...


In [5]:
def text_cleaning(text):
    '''
    Normalise a piece of text.

    Converts the text to lower case, replaces every run of characters other than
    ASCII letters and digits (punctuation, whitespace, emojis, accented letters, ...)
    with a single space, and strips leading/trailing spaces.

    text - Sentence that needs to be cleaned; non-string values go through str() first
    Returns the cleaned string.
    '''
    # A single substitution is sufficient: because the pattern consumes whole runs,
    # the result can contain neither consecutive spaces nor any non-ASCII character,
    # so no separate space-collapsing or emoji-removal pass is needed afterwards.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [6]:
# Normalise every scraped title with text_cleaning; the 'title' column is overwritten in place.
additional_gov_datasets_popular['title'] = additional_gov_datasets_popular['title'].apply(text_cleaning)

# Persist the cleaned titles (no index column); total_data is embedded in the file name.
additional_gov_datasets_popular.to_csv(f'additional_gov_datasets_{total_data}popular.csv', index=False)
additional_gov_datasets_popular.head()

Unnamed: 0,title
0,department for the aging dfta geriatric mental...
1,low altitude aerial imagery obtained with unma...
2,forestry planting spaces
3,2006 2011 nys math test results by grade cityw...
4,high operational temperature mwir detectors wi...


In [7]:
# Load the existing label list from the relative ../input directory
# (presumably a single 'title' column, as the preview suggests -- confirm against the file).
adnl_govt_labels = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
adnl_govt_labels.head()

Unnamed: 0,title
0,adni
1,cccsl
2,ibtracs
3,noaa c cap
4,noaa c-cap


In [8]:
# Write one combined CSV per cumulative slice of the scraped titles:
# the first 2000 rows, the first 4000 rows, ... in steps of 2000 up to total_data.
for final_row in range(2000, total_data + 1, 2000):
    # Existing labels first, then the leading `final_row` scraped titles,
    # renumbered 0..n-1 by the concat itself.
    data_set_800_with_popular = pd.concat(
        [adnl_govt_labels, additional_gov_datasets_popular.iloc[:final_row]],
        ignore_index=True,
    )
    data_set_800_with_popular.to_csv(f'data_set_800_with{final_row}popular.csv', index=False)
    print(f'data_set_800_with{final_row}popular.csv is saved with len = {len(data_set_800_with_popular)}')

# Display the frame produced by the last iteration (the largest combination).
data_set_800_with_popular

data_set_800_with2000popular.csv is saved with len = 4339
data_set_800_with4000popular.csv is saved with len = 6339
data_set_800_with6000popular.csv is saved with len = 8339
data_set_800_with8000popular.csv is saved with len = 10339
data_set_800_with10000popular.csv is saved with len = 12339
data_set_800_with12000popular.csv is saved with len = 14339
data_set_800_with14000popular.csv is saved with len = 16339
data_set_800_with16000popular.csv is saved with len = 18339
data_set_800_with18000popular.csv is saved with len = 20339
data_set_800_with20000popular.csv is saved with len = 22339
data_set_800_with22000popular.csv is saved with len = 24339


Unnamed: 0,title
0,adni
1,cccsl
2,ibtracs
3,noaa c cap
4,noaa c-cap
...,...
24334,a robust real time collaboration technology fo...
24335,free form mirrors for ultra compact high speed...
24336,calipso wide field camera wfc l1b science 1 km...
24337,radiation mitigation methods for reprogrammabl...
