In [75]:
import pandas as pd
import requests
import json
import logging
from bs4 import BeautifulSoup
import time

# Route every record (DEBUG and above) to webscraping.log, appending to any
# existing file. Each line carries time, milliseconds, logger name, level and
# message.
logging.basicConfig(
    level=logging.DEBUG,
    filename='webscraping.log',
    filemode='a',
    datefmt='%H:%M:%S',
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
)

# Named logger used for the progress messages emitted by the rest of this cell.
logger = logging.getLogger('webscraper')
# GET COUNTRIES
# The 'name' column of country_names.csv, in file order, as a plain Python list.
# SOURCES: http://worldpopulationreview.com/countries/most-visited-countries/
# & https://www.travel-advisory.info/api
# NOTE(review): this is an absolute, machine-specific path; the cell cannot run
# elsewhere without editing it.
country_names = list(
    pd.read_csv('/Users/meggan/dsi-10/portfolio/capstone/data/country_names.csv')['name']
)

# GET ALL URLS
def load_json_cache(path):
    """Return the decoded JSON stored at ``path``, or a new empty list.

    The checkpoint files read here are written by earlier runs of this cell so
    that an interrupted scrape can resume where it stopped.

    Parameters
    ----------
    path : str
        Location of the JSON checkpoint file.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file. A fresh ``[]`` is
        returned when the file does not exist, cannot be read, or does not
        contain valid JSON.
    """
    try:
        with open(path) as f:
            return json.load(f)
    except FileNotFoundError:
        # Expected on a first run: nothing has been checkpointed yet.
        return []
    except (OSError, ValueError) as err:
        # json.JSONDecodeError is a ValueError subclass. Starting over is still
        # the best-effort behaviour, but a damaged cache should not go unnoticed.
        logging.getLogger('webscraper').warning(
            'Ignoring unreadable cache %s: %s', path, err
        )
        return []


# Article links collected so far from the per-country pages and from the search pages.
file_articles = load_json_cache('temp_file_articles.json')
search_articles = load_json_cache('temp_file_search.json')

# Countries already processed; the scraping loop resumes after them.
countries = load_json_cache('countries.json')

# Page counts per country, filled in while scraping.
file_pages = {}
search_pages = {}

# Seconds to wait on fathomaway.com before giving up on a request; without a
# timeout, requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Slice stop applied to country_names: only entries before this index are ever
# scraped. Raise it (or set it to None) to cover more countries.
COUNTRY_LIMIT = 2


def count_pages(soup):
    """Return the page count read from a listing page's pagination links.

    The text of the second-to-last element with class ``pagination__link`` is
    returned unchanged (a string) -- presumably the highest page number; verify
    against the site's markup. When fewer than two such elements exist, the
    integer 1 is returned.
    """
    pagination_links = soup.find_all(attrs={'class': 'pagination__link'})
    try:
        return pagination_links[-2].text
    except IndexError:
        return 1


def extract_articles(soup, country):
    """Return one record per element with class ``content__url`` in ``soup``.

    Each record is a dict with the element's ``href`` attribute, its stripped
    text as ``title``, and the ``country`` being scraped. A ``KeyError`` is
    raised if a matching element has no ``href`` attribute.
    """
    return [
        {
            'href': link.attrs['href'],
            'title': link.text.strip(),
            'country': country,
        }
        for link in soup.find_all(attrs={'class': 'content__url'})
    ]


print("STARTING")
# Resume after the countries already recorded in `countries`.
for country in country_names[len(countries):COUNTRY_LIMIT]:
    countries.append(country)
    logger.info(f'Searching {country}...')

    # FIND THE MAXIMUM PAGES OF BLOG POSTS BY COUNTRY
    url_file = f'https://fathomaway.com/{country}'
    url_search = f'https://fathomaway.com/search/?q={country}'
    res_file = requests.get(url_file, timeout=REQUEST_TIMEOUT)
    res_search = requests.get(url_search, timeout=REQUEST_TIMEOUT)
    soup_file = BeautifulSoup(res_file.content, 'lxml')
    soup_search = BeautifulSoup(res_search.content, 'lxml')
    time.sleep(10)  # pause between bursts of requests
    file_pages[country] = count_pages(soup_file)
    search_pages[country] = count_pages(soup_search)

    # GET URLS FOR EACH COUNTRY FILE
    logger.info(f'... {file_pages[country]} pages of filed articles ...')

    # NOTE(review): range() requests page=0 .. N-1; if the site numbers pages
    # from 1, the last page is never fetched -- TODO confirm.
    for num in range(int(file_pages[country])):
        url_base = f'https://fathomaway.com/{country}/?page={num}'
        res_ = requests.get(url_base, timeout=REQUEST_TIMEOUT)
        soup_ = BeautifulSoup(res_.content, 'lxml')
        time.sleep(10)
        file_articles.extend(extract_articles(soup_, country))

    # GET URLS FOR COUNTRY BY SEARCH
    logger.info(f'... and {search_pages[country]} pages of searched articles ...')

    for num in range(int(search_pages[country])):
        url_base = f'https://fathomaway.com/search/?q={country}&page={num}'
        res_ = requests.get(url_base, timeout=REQUEST_TIMEOUT)
        soup_ = BeautifulSoup(res_.content, 'lxml')
        time.sleep(10)
        search_articles.extend(extract_articles(soup_, country))

    # Checkpoint after every country so an interrupted run can resume.
    logger.info("SAVING FILES FOR " + country)
    with open('countries.json', 'w') as outfile:
        json.dump(countries, outfile)
    with open('temp_file_articles.json', 'w') as outfile:
        json.dump(file_articles, outfile)
    with open('temp_file_search.json', 'w') as outfile:
        json.dump(search_articles, outfile)


# One row per distinct href. Sorting by title in descending order first means
# drop_duplicates (which keeps the first occurrence) retains the row whose
# title sorts highest, so a non-empty title wins over an empty one.
df_file = (
    pd.DataFrame(file_articles)
    .sort_values(by='title', ascending=False)
    .drop_duplicates('href')
    .copy()
)

# Same de-duplication for the links gathered through the site search.
df_search = (
    pd.DataFrame(search_articles)
    .sort_values(by='title', ascending=False)
    .drop_duplicates('href')
    .copy()
)

# PARSE CONTENT OF EVERY BLOG POST - FILED
logger.info('Pulling text from filed articles ...')

# Resume from the checkpoints of an earlier run when they exist. A missing,
# unreadable or invalid file means "start fresh" (json.JSONDecodeError is a
# ValueError subclass); anything else, e.g. KeyboardInterrupt, now propagates.
try:
    with open('content_filed.json') as f:
        content_filed = json.load(f)
except (OSError, ValueError):
    content_filed = []

try:
    with open('href_cached.json') as f:
        hrefs = json.load(f)
except (OSError, ValueError):
    hrefs = []

# The two checkpoints are written separately, so an interrupted run can leave
# them with different lengths. Keep only the prefix both agree on; otherwise
# the loop below would skip URLs whose text was never stored and the texts
# would no longer line up with the rows of df_file.
n_done = min(len(hrefs), len(content_filed))
hrefs = hrefs[:n_done]
content_filed = content_filed[:n_done]

print("PULLING TEXT")
# Positional slice: skip the first len(hrefs) articles, which are already cached.
for url in df_file['href'][len(hrefs):]:
    hrefs.append(url)
    url_post = f'https://fathomaway.com{url}'
    # Bounded wait so a stalled connection raises instead of hanging the cell.
    res_post = requests.get(url_post, timeout=30)
    soup_post = BeautifulSoup(res_post.content, 'lxml')

    paragraphs = soup_post.find_all(attrs={'class': 'article-detail__main'})

    # Concatenate the text of every matching element, newlines removed.
    text = ''.join([paragraph.text.replace('\n', '') for paragraph in paragraphs])
    content_filed.append(text)

    # Checkpoint after every article.
    with open('href_cached.json', 'w') as outfile:
        json.dump(hrefs, outfile)
    with open('content_filed.json', 'w') as outfile:
        json.dump(content_filed, outfile)
    time.sleep(10)

# PARSE CONTENT OF EVERY BLOG POST - SEARCHED
print("PARSING CONTENT")
logger.info('Pulling text from searched articles ...')

# Resume from the checkpoints of an earlier run when they exist. A missing,
# unreadable or invalid file means "start fresh" (json.JSONDecodeError is a
# ValueError subclass); anything else, e.g. KeyboardInterrupt, now propagates.
try:
    with open('content_searched.json') as f:
        content_searched = json.load(f)
except (OSError, ValueError):
    content_searched = []

try:
    with open('href_cached_content_search.json') as f:
        hrefs = json.load(f)
except (OSError, ValueError):
    hrefs = []

# The two checkpoints are written separately, so an interrupted run can leave
# them with different lengths. Keep only the prefix both agree on; otherwise
# the loop below would skip URLs whose text was never stored and the texts
# would no longer line up with the rows of df_search.
n_done = min(len(hrefs), len(content_searched))
hrefs = hrefs[:n_done]
content_searched = content_searched[:n_done]

# Positional slice: skip the first len(hrefs) articles, which are already cached.
for url in df_search['href'][len(hrefs):]:
    hrefs.append(url)
    url_post = f'https://fathomaway.com{url}'
    # Bounded wait so a stalled connection raises instead of hanging the cell.
    res_post = requests.get(url_post, timeout=30)
    soup_post = BeautifulSoup(res_post.content, 'lxml')

    paragraphs = soup_post.find_all(attrs={'class': 'article-detail__main'})

    # Join with the empty string. The previous `text.join(...)` used whatever
    # `text` held from the last article as the separator (or raised NameError
    # when `text` was undefined), so one post's text leaked into the next.
    # NOTE(review): texts already stored in content_searched.json by earlier
    # runs may contain that leaked text; delete content_searched.json and
    # href_cached_content_search.json to re-fetch them.
    text = ''.join([paragraph.text.replace('\n', '') for paragraph in paragraphs])
    content_searched.append(text)

    # Checkpoint after every article.
    with open('href_cached_content_search.json', 'w') as outfile:
        json.dump(hrefs, outfile)
    with open('content_searched.json', 'w') as outfile:
        json.dump(content_searched, outfile)
    time.sleep(10)

# Attach the scraped article bodies as a 'text' column (filed frame first, then
# the search frame). pandas raises if a list's length differs from its frame's.
df_file['text'], df_search['text'] = content_filed, content_searched

# Write both frames to CSV: search results first, then the filed articles.
for frame, csv_path in (
    (df_search, './data/search_all_articles.csv'),
    (df_file, './data/files_all_articles.csv'),
):
    frame.to_csv(csv_path)


STARTING
PULLING TEXT
PARSING CONTENT


In [77]:
# Display the de-duplicated search-result frame, including its scraped 'text' column.
df_search

Unnamed: 0,href,title,country,text
31,/love-pan-am/,There Is Love After Pan Am,United Arab Emirates,"I owe my career in travel, which has been larg..."
19,/24-best-travelers-instagram/,The 24 Best Travelers on Instagram 2017,United Arab Emirates,"Whether you love it, hate it, or fall somewher..."
41,/15-hottest-hotel-openings-summer-and-fall-2018/,The 15 Hottest Hotel Openings of Fall 2018,United Arab Emirates,Keeping tabs on new hotels for our quarterly r...
29,/united-arab-emirates-tourists-oman/,On the Road: Oman,United Arab Emirates,OMAN – The road from Dubai to Oman passes thro...
15,/fathom-questionnaire-kym-canter/,Meet the Traveler: Kym Canter,United Arab Emirates,"Hometown: Born and raised in New Haven, Connec..."
37,/holiday-barr-al-jissah-resort-muscat-oman/,Just Back From: Muscat,United Arab Emirates,"Just back from: Muscat, Oman.Was it your first..."
25,/what-to-do-layover-abu-dhabi/,How to Spend a Layover in Abu Dhabi,United Arab Emirates,Passing through the UAE on a layover with a fe...
21,/best-hotel-openings-2018/,Here They Come: The 20 Best Hotel Openings of ...,United Arab Emirates,We’re calling it now: 2018 is going to be an e...
35,/cool-dubai-photo-story/,Dubai Is Way Cooler Than You Think,United Arab Emirates,There's more to Dubai than all the glitz and g...
23,/24-best-hotel-openings-2017/,Coming Soon: The 24 Best Hotel Openings of 2017,United Arab Emirates,If you need a reason to get excited about 2017...


In [72]:
# Inspect the cached filed-article content.
# NOTE(review): the output recorded below is a list of href/title/country dicts,
# not the text strings the scraping cell appends to content_filed. This cell
# (In [72]) presumably ran against an earlier binding of the name; re-run it
# after the scraping cell to refresh the output.
content_filed

[{'href': '/what-to-do-layover-abu-dhabi/',
  'title': '',
  'country': 'United Arab Emirates'},
 {'href': '/what-to-do-layover-abu-dhabi/',
  'title': 'How to Spend a Layover in Abu Dhabi',
  'country': 'United Arab Emirates'},
 {'href': '/empty-quarter-desert-uae/',
  'title': '',
  'country': 'United Arab Emirates'},
 {'href': '/empty-quarter-desert-uae/',
  'title': 'A Grain of Sand in the Empty Quarter',
  'country': 'United Arab Emirates'},
 {'href': '/united-arab-emirates-tourists-oman/',
  'title': '',
  'country': 'United Arab Emirates'},
 {'href': '/united-arab-emirates-tourists-oman/',
  'title': 'On the Road: Oman',
  'country': 'United Arab Emirates'},
 {'href': '/love-pan-am/', 'title': '', 'country': 'United Arab Emirates'},
 {'href': '/love-pan-am/',
  'title': 'There Is Love After Pan Am',
  'country': 'United Arab Emirates'},
 {'href': '/ask-fathom-can-i-go-middle-east/',
  'title': '',
  'country': 'United Arab Emirates'},
 {'href': '/ask-fathom-can-i-go-middle-east/

In [58]:
# Progress check: number of article texts cached so far (searched, then filed).
print(len(content_searched),len(content_filed))

12 4


In [76]:
# Progress check repeated after the scraping cell (In [75]): number of article
# texts cached so far (searched, then filed).
print(len(content_searched),len(content_filed))

12 4


In [38]:
# Reload the filed-article href checkpoint from disk to inspect resume progress.
# Note that this rebinds the `hrefs` name that the scraping cell also uses.
with open('href_cached.json') as cache_file:
    hrefs = json.load(cache_file)

In [39]:
# Show the hrefs recorded in href_cached.json (loaded by the preceding cell).
hrefs

['/united-arab-emirates-tourists-oman/',
 '/what-to-do-layover-abu-dhabi/',
 '/cool-dubai-photo-story/',
 '/empty-quarter-desert-uae/']