In [1]:
from bs4 import BeautifulSoup
from os.path import expanduser
import os
from tqdm import tqdm
import pandas as pd
import requests
import urllib3
import time
import lxml

In [2]:
# Scheme and host of the CIA website; every scraped path is appended to it.
BASEURL = "https://www.cia.gov"

# Reading-room collection slugs. The trailing numbers are the author's
# original annotations (presumably listing-page counts -- TODO confirm).
SCIAB = "scientific-abstracts"  # 1653
GREC = "general-cia-records"    # 38439
JPRS = "jprs"                   # 127
LOC = "library-congress"        # 394
NIS = "nis"                     # 22

# Column order of the metadata table built by the scraper.
FIELDS = [
    "id",
    "title",
    "classification",
    "publication_date",
    "page_count",
    "content_type",
]

# Download directory under the user's home. The trailing slash matters:
# file names are appended to this string by plain concatenation.
PDF_DIR = "{}/cia_pdfs/".format(expanduser("~"))
# Names of documents whose download could not be completed.
SKIPPED_FILES = list()

In [3]:
def retrieve_file(url, name, retries=1):
    """Download ``url`` and store it as ``PDF_DIR + name + ".pdf"``.

    The download is attempted ``retries + 1`` times. If every attempt fails
    with a network-level error, ``name`` is appended to ``SKIPPED_FILES`` and
    nothing is written to disk.

    Parameters
    ----------
    url : str
        Absolute URL of the file to fetch.
    name : str
        File name without extension; ".pdf" is appended.
    retries : int, optional
        Number of additional attempts after the first failure (default 1,
        i.e. two attempts in total).

    Returns
    -------
    None
    """
    # Imported locally: the notebook's import cell provides no URL opener
    # (``urllib2`` is Python 2 only and was never imported).
    from urllib.request import urlopen

    filepath = PDF_DIR + name + ".pdf"
    os.makedirs(PDF_DIR, exist_ok=True)

    for attempt in range(retries + 1):
        try:
            # The context manager closes the HTTP response on every path.
            with urlopen(url) as response:
                payload = response.read()
        except OSError as err:  # URLError / HTTPError / socket errors
            if attempt < retries:
                print('WiFi connection perhaps lost !! Trying one more time...')
                continue
            print('WiFi connection really lost !! Bailing out..')
            print(err)
            SKIPPED_FILES.append(name)
            return

        # Write only after a complete download so a failure never leaves an
        # empty or truncated file behind. Binary mode: ``payload`` is bytes.
        with open(filepath, 'wb') as f:
            f.write(payload)
        return

In [4]:
def _field_text(soup, field_selector, child_selector=".field-item"):
    """Return the text of one metadata field, or "" when it is absent."""
    container = soup.select_one(field_selector)
    if container is None:
        return ""
    node = container.select_one(child_selector)
    if node is None or node.string is None:
        return ""
    # Convert to a plain str: a NavigableString keeps a reference to the
    # whole parse tree, which would otherwise stay alive inside the frame.
    return str(node.string)


def _pub_date(soup):
    """Return the ``content`` attribute of the publication-date span, or ""."""
    container = soup.select_one(".field-name-field-pub-date")
    span = container.select_one("span") if container is not None else None
    value = span.get('content') if span is not None else None
    return "" if value is None else str(value)


def _pdf_link(soup):
    """Return the href of the first link inside the ``.file`` element, or None."""
    holder = soup.select_one(".file")
    anchor = holder.select_one("a") if holder is not None else None
    return anchor.get('href') if anchor is not None else None


def gather_meta(collection, pages, download):
    """Scrape document metadata from a reading-room collection.

    For each listing page ``0 .. pages - 1`` of
    ``BASEURL/library/readingroom/collection/<collection>``, every
    ``<h4 class="field-content">`` entry is followed to its document page and
    the metadata fields are read from it. Each field is looked up
    independently, so a missing field yields "" for that column only.

    Parameters
    ----------
    collection : str
        Collection slug (e.g. one of the module-level collection constants).
    pages : int
        Number of listing pages to walk, starting at page 0.
    download : bool
        When true, the document's PDF is also fetched via ``retrieve_file``.
        Documents lacking an ID or a PDF link are not downloaded and are
        recorded in ``SKIPPED_FILES`` instead.

    Returns
    -------
    pandas.DataFrame
        One row per document, with the columns listed in ``FIELDS``.
    """
    records = []

    for i in tqdm(range(pages)):

        pagination_link = BASEURL + '/library/readingroom/collection/' + collection + '?page=' + str(i)
        pagination_page = requests.get(pagination_link)
        p_soup = BeautifulSoup(pagination_page.content, 'lxml') # pagination soup

        for doc in p_soup.find_all("h4", class_="field-content"):

            a = doc.select_one("a")
            if a is None:
                # Heading without a link: there is no document page to visit.
                continue
            link = str(a.get('href'))
            TITLE = str(a.string or "")

            doc_page = requests.get(BASEURL + link)
            m_soup = BeautifulSoup(doc_page.content, 'lxml') # metadata soup
            time.sleep(.05)  # brief pause between document requests

            PUB_DATE = _pub_date(m_soup)
            ID = _field_text(m_soup, ".field-name-field-document-number")
            CLASSIFICATION = _field_text(m_soup, ".field-name-field-original-classification")
            PAGE_COUNT = _field_text(m_soup, ".field-name-field-page-count")
            CONTENT_TYPE = _field_text(m_soup, ".field-name-field-content-type")

            if download:
                PDF = _pdf_link(m_soup)
                if PDF and ID:
                    retrieve_file(PDF, ID)
                else:
                    # Without an ID the file cannot be named, and without a
                    # link there is nothing to fetch.
                    SKIPPED_FILES.append(ID or TITLE)

            records.append([ID, TITLE, CLASSIFICATION, PUB_DATE, PAGE_COUNT, CONTENT_TYPE])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=FIELDS)

In [5]:
def totalpages(df):
    """Return the sum of the ``page_count`` column as an integer.

    Entries that cannot be parsed as numbers (for example the empty strings
    stored when a document's page count could not be scraped) count as 0
    instead of raising ``ValueError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``page_count`` column.

    Returns
    -------
    int
        Total number of pages over all rows with a parseable page count.
    """
    counts = pd.to_numeric(df.page_count, errors="coerce")
    return counts.fillna(0).astype(int).sum()

In [6]:
def values(df, col):
    """Return the string form of the distinct values found in ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col : hashable
        Label of the column whose distinct values are wanted.

    Returns
    -------
    str
        ``str()`` of the result of ``Series.unique()`` for that column.
    """
    column = df[col]
    distinct = column.unique()
    return str(distinct)

In [7]:
# Scrape listing page 0 of the JPRS collection (metadata only, no PDF download).
jprs = gather_meta(collection=JPRS, pages=1, download=False)
# To persist the result, uncomment: jprs.to_csv("jprs.csv")
jprs.head()

100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Unnamed: 0,id,title,classification,publication_date,page_count,content_type
0,,JPRS ID: 10001 USSR REPORT ECONOMIC AFFAIRS D...,,,,
1,,JPRS ID: 10003 SOUTH AND EAST ASIA REPORT,,,,
2,,"JPRS ID: 10005 CHINA REPORT POLITICAL, SOCIOL...",,,,
3,,JPRS ID: 10007 WEST EUROPE REPORT,,,,
4,,JPRS ID: 10008 SUB-SAHARAN AFRICA REPORT,,,,


In [8]:
# Scrape listing page 0 of the NIS collection (metadata only, no PDF download).
nis = gather_meta(collection=NIS, pages=1, download=False)
# To persist the result, uncomment: nis.to_csv("nis.csv")
nis.head()

100%|██████████| 1/1 [00:04<00:00,  4.42s/it]


Unnamed: 0,id,title,classification,publication_date,page_count,content_type
0,,NATIONAL INTELLIGENCE SURVEY 11; SWEDEN; ARMED...,,,,
1,,NATIONAL INTELLIGENCE SURVEY 11; SWEDEN; COUNT...,,,,
2,,NATIONAL INTELLIGENCE SURVEY 11; SWEDEN; GOVER...,,,,
3,,NATIONAL INTELLIGENCE SURVEY 11; SWEDEN; MILIT...,,,,
4,,NATIONAL INTELLIGENCE SURVEY 11; SWEDEN; SCIENCE,,,,


In [9]:
# Scrape listing page 0 of the scientific-abstracts collection (metadata only, no PDF download).
sciab = gather_meta(collection=SCIAB, pages=1, download=False)
# To persist the result, uncomment: sciab.to_csv("sciab.csv")
sciab.head()

100%|██████████| 1/1 [00:14<00:00, 14.98s/it]


Unnamed: 0,id,title,classification,publication_date,page_count,content_type
0,CIA-RDP86-00513R001031010018-6,"""SCIENTIFIC ABSTRACT LVOVSKIY, P.G. - LYADOVA...",S,1967-12-31T01:01:01-05:00,100,SCIENTIFIC ABSTRACT
1,CIA-RDP86-00513R000204310003-8,"""SCIENTIFIC ABSTRACT BELENKIY, M.N. - BELENKI...",S,1967-12-31T01:01:01-05:00,100,SCIENCEAB
2,CIA-RDP86-00513R000204310001-0,"""SCIENTIFIC ABSTRACT BELENKIY, G. A. - BELENKI...",S,1967-12-31T01:01:01-05:00,100,SCIENCEAB
3,CIA-RDP86-00513R000204310002-9,"""SCIENTIFIC ABSTRACT BELENKIY, L.I. - BELENKIY...",S,1967-12-31T01:01:01-05:00,100,SCIENCEAB
4,CIA-RDP86-00513R000204310004-7,"""SCIENTIFIC ABSTRACT BELENKIY, N. - BELENKIY, ...",S,1967-12-31T01:01:01-05:00,100,SCIENCEAB


In [10]:
# Scrape listing page 0 of the general-CIA-records collection (metadata only, no PDF download).
grec = gather_meta(collection=GREC, pages=1, download=False)
# To persist the result, uncomment: grec.to_csv("grec.csv")
grec.head()

100%|██████████| 1/1 [00:41<00:00, 41.33s/it]


Unnamed: 0,id,title,classification,publication_date,page_count,content_type
0,,! DE RODILLAS ANTE SUS AMOS!,,,,
1,,""" NATIONAL FRONT "" AS AN AGENT OF SOVIET POLICY",,,,
2,,"""'SKLAD 28' STORAGE DEPOT IN TASHKENT, UZBEK ...",,,,
3,,"""CHALLENGES TO SECURITY IN EAST ASIA""",,,,
4,,"""CHART II - ORGANIZATION OF THE KOREAN LABOR P...",,,,
