# firebasePopulate
Description: Crawls and analyses articles from the listed news URLs (and from Mothership, which is handled separately because it is special/troublesome), generates parameters via analyseArticle, and pushes them to Firebase.

The parameters pushed are:
```{"title", "url", "authors", "date", "summary", "polarity", "subjectivity", "keywords", "images", "videos", "text"}```

### Initialise

In [7]:
# Announce start-up so progress is visible in the cell output.
print("\nINITIALISING MODULES\n.")

# IPython %run magics: execute the helper notebooks before anything else.
# NOTE(review): presumably these provide analyseArticle(), firebasePush(),
# firebaseRefresh() and methylHalf(), which the later cells call but this
# notebook never defines -- confirm against the helper notebooks.
%run 'analyseArticle.ipynb'
%run 'firebasePush.ipynb'

import traceback
import newspaper
import requests
import time
from bs4 import BeautifulSoup
from timeit import default_timer as timer

# Reference point for every "Elapsed time" figure reported by later cells.
start = timer()

print("OPENING LOGS\n.")
# The crawl cells dump scraped article data (arbitrary Unicode text) into this
# log from inside their error handlers. Pin the encoding so those writes cannot
# raise UnicodeEncodeError when the process runs under a non-UTF-8 locale.
# The handle stays open across cells; the final cell closes it.
log = open("CRAWL_LOG.txt", "w", encoding="utf-8")

print("LOADING URL LISTS\n.\n")

# Bookkeeping lists; nothing in the visible cells reads or fills them.
COMPLETED = []

QUEUE = []

# Domains crawled via newspaper.build() (scheme is prepended at crawl time).
newsURLs = ["www.straitstimes.com","www.allsingaporestuff.com"]

# Mothership category listing pages, scraped directly with requests + BeautifulSoup.
mothershipURLs = ["mothership.sg/category/news","mothership.sg/category/perspectives",
                  "mothership.sg/category/community","mothership.sg/category/almost-famous",
                  "mothership.sg/category/mps-in-the-house","mothership.sg/category/humour"]

print("\nINITIALISED FIREBASEPOPULATE")


INITIALISING MODULES
.
OPENING LOGS
.
LOADING URL LISTS
.


INITIALISED FIREBASEPOPULATE


### Crawl and analyse the latest Mothership Articles this month, outputting parameters, and pushing to Firebase

In [None]:
# Crawl every Mothership category listing page, analyse each linked article
# with analyseArticle(), and push the successful results to Firebase.
#
# Per-outcome counters; their sum is the number of article links attempted.
mcount = 0        # analysed and pushed to Firebase
mnoteng = 0       # skipped: detected language is not "en"
mfailed = 0       # an unexpected exception was raised while processing
mtooshort = 0     # skipped: analyseArticle returned "ZERO_SENTIMENT_ERROR"
mfetcherror = 0   # skipped: analyseArticle returned "FETCH_ERROR"

print("RUN MOTHERSHIP MODULE\n")

for URL in mothershipURLs:
    print("Retrieving URL...\n")
    try:
        # A timeout keeps one unresponsive listing page from stalling the whole
        # run; the resulting exception is reported by the handler below.
        sourceCode = requests.get("http://" + str(URL), timeout=30)
        soup = BeautifulSoup(sourceCode.content, "lxml")
        print("Target URL: " + str(URL))

        for div in soup.find_all("div", class_="ind-article"):
            for a in div.find_all("a"):
                href = a.get("href")
                # Anchors without an href return None; testing `in None` would
                # raise and abort the rest of this category page.
                if not href or "mothership.sg" not in href:
                    continue

                # Reset per article so the error dump below never reports the
                # previous article's data (or hits an unbound name).
                parameters = None
                try:
                    print(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror + 1)+": ", end="")
                    parameters = analyseArticle(href)

                    if parameters == "ZERO_SENTIMENT_ERROR": #Check for zero sentiment, means article is too short or redirected
                        mtooshort += 1
                        print("SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED!", end=" #")
                        print(str(mtooshort))
                        continue

                    if parameters == "FETCH_ERROR": #Check for fetch failure, means the article could not be downloaded
                        mfetcherror += 1
                        print("SKIPPING: FETCH_ERROR, COULD NOT DOWNLOAD ARTICLE!", end=" #")
                        print(str(mfetcherror))
                        continue

                    if str(parameters["language"]) != "en": #Check if article is in English, if it isn't skip
                        mnoteng += 1
                        print("SKIPPING: LANG_ERROR, ARTICLE NOT IN ENGLISH!", end=" #")
                        print(str(mnoteng) + " (" + str(parameters["language"]) + ")")
                        continue

                    title = str(parameters["title"])
                    url = str(parameters["url"])
                    authors = parameters["authors"]
                    date = str(parameters["date"])
                    summary = str(parameters["summary"])
                    polarity = str(parameters["polarity"])
                    subjectivity = str(parameters["subjectivity"])
                    keywords = parameters["keywords"]
                    images = str(parameters["images"])
                    videos = str(parameters["videos"])
                    text = str(parameters["text"])

                    firebasePush(title, url, authors, date, summary, polarity, subjectivity, keywords, images, videos, text)
                    mcount += 1
                    print("Processed article #", end="")
                    print(mcount)

                except Exception as ex:
                    # Best-effort crawl: record the failure and move on to the
                    # next link. Message layout matches the URL module.
                    mfailed += 1
                    print("FAILED article: #", end="")
                    print(mfailed, end=" | ")
                    print(ex, end=" | Moving on...\n")

                    log.write("\n\n ------------------------ ")
                    log.write("\n\nMOTHERSHIP MODULE UNKNOWN ERROR DUMP | Fetch #")
                    log.write(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror))
                    log.write(": \n\n")
                    log.write("ERROR:")
                    log.write(str(traceback.format_exc()))  #FOR DEBUGGING
                    log.write("\n\n")
                    log.write("Data:")
                    log.write(str(parameters))              #FOR DEBUGGING

    except Exception as ex:
        # Listing page could not be fetched or parsed; skip to the next category.
        print("Failed URL", end=" | ")
        print(ex)

    print("\n ------------------------ ")
    string = "FINISHED: " + str(URL)
    print(string.center(63))
    log.write("PROCESSED: ")
    log.write(str(URL))
    log.write("\n")
    log.flush()

    print(" ------------------------ \n")

# NOTE(review): methylHalf() is not defined in this notebook; presumably it
# comes from one of the %run helper notebooks -- confirm.
methylHalf()

print("\n  ------------------------ ")
print("                FINISHED PROCESSING MOTHERSHIP")
log.write("FINISHED PROCESSING: ")
log.write("MOTHERSHIP")
log.write("\n\n")
print(" ------------------------ \n")

print("SUMMARY:")
print("Elapsed time: ",end="")
checkpoint = timer()
print(checkpoint - start,end="")
print(" seconds\n")
log.write("Elapsed Time: " + str(checkpoint - start))
log.write("\n\n")
# Must be called: a bare `log.flush` only references the method and flushes nothing.
log.flush()

print(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror) + " Total Articles Accessed")
print(str(mcount) + " Processed Articles\n")

print(str(mnoteng) + " LANG_ERRORs (Article not in English)")
print(str(mtooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
print(str(mfetcherror) + " FETCH_ERRORs (Failed to fetch article)")
print(str(mfailed) + " Failed Articles\n")

firebaseRefresh()
time.sleep(1)

print(" ------------------------ ")

RUN MOTHERSHIP MODULE

Retrieving URL...

Target URL: mothership.sg/category/news
1: Processed article #1
2: Processed article #2
3: Processed article #3
4: Processed article #4
5: Processed article #5
6: Processed article #6
7: Processed article #7
8: Processed article #8
9: Processed article #9
10: Processed article #10

 ------------------------ 
             FINISHED: mothership.sg/category/news             
 ------------------------ 

Retrieving URL...

Target URL: mothership.sg/category/perspectives
11: Processed article #11
12: Processed article #12
13: Processed article #13
14: Processed article #14
15: Processed article #15
16: Processed article #16
17: Processed article #17
18: Processed article #18
19: Processed article #19
20: Processed article #20

 ------------------------ 
         FINISHED: mothership.sg/category/perspectives         
 ------------------------ 

Retrieving URL...

Target URL: mothership.sg/category/community
21: Processed article #21
22: Processed artic

### Crawl and analyse the other URLs, outputting parameters, and pushing to Firebase

In [None]:
# Crawl every domain in newsURLs with newspaper, analyse each discovered
# article with analyseArticle(), push the successful results to Firebase, and
# print/log a running summary after each domain. Closes the log when done.
#
# Per-outcome counters; their sum is the number of articles attempted.
count = 0        # analysed and pushed to Firebase
noteng = 0       # skipped: detected language is not "en"
failed = 0       # an unexpected exception was raised while processing
tooshort = 0     # skipped: analyseArticle returned "ZERO_SENTIMENT_ERROR"
fetcherror = 0   # skipped: analyseArticle returned "FETCH_ERROR"

print("RUN URL MODULE\n")

for URL in newsURLs:
    print("Building domain...\n")

    try:
        paper = newspaper.build("http://" + str(URL), memoize_articles=False)
        print("Domain building complete for: " + str(URL))
        articles = paper.articles
    except Exception as ex:
        print("Failed DOMAIN", end=" | ")
        print(ex, end =" | moving on...\n")
        # Nothing to crawl for this domain. Without this, the loop below would
        # hit an unbound `paper` (first domain) or silently re-crawl the
        # previous domain's articles (later domains).
        articles = []

    for article in articles:
        # Reset per article so the error dump below never reports the previous
        # article's data (or hits an unbound name).
        parameters = None
        try:
            print(str(count + noteng + failed + tooshort + fetcherror + 1)+": ",end="")
            parameters = analyseArticle(article.url)

            if parameters == "ZERO_SENTIMENT_ERROR": #Check for zero sentiment, means article is too short or redirected
                tooshort += 1
                print("SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED!", end=" #")
                print(str(tooshort))
                print(article.url)
                continue

            if parameters == "FETCH_ERROR": #Check for fetch failure, means the article could not be downloaded
                fetcherror +=1
                print("SKIPPING: FETCH_ERROR, COULD NOT DOWNLOAD ARTICLE!", end=" #")
                print(str(fetcherror))
                continue

            if str(parameters["language"]) != "en": #Check if article is in English, if it isn't skip
                noteng += 1
                print("SKIPPING: LANG_ERROR, ARTICLE NOT IN ENGLISH!", end=" #")
                print(str(noteng) + " (" + str(parameters["language"]) + ")")
                print(article.url)
                continue

            title = parameters["title"]
            url = str(article.url)
            authors = parameters["authors"]
            date = str(parameters["date"])
            summary = str(parameters["summary"])
            polarity = str(parameters["polarity"])
            subjectivity = str(parameters["subjectivity"])
            keywords = parameters["keywords"]
            images = str(parameters["images"])
            videos = str(parameters["videos"])
            text = str(parameters["text"])

            firebasePush(title, url, authors, date, summary, polarity, subjectivity, keywords, images, videos, text)
            count += 1
            print("Processed article #", end="")
            print(count)

        except Exception as ex:
            # Best-effort crawl: record the failure and move on to the next article.
            failed += 1
            print("FAILED article: #",end="")
            print(failed, end=" | ")
            print(ex,end=" | Moving on...\n")

            log.write("\n\n ------------------------ ")
            log.write("\n\nURL MODULE UNKNOWN ERROR DUMP | Fetch #")
            log.write(str(count + noteng + failed + tooshort + fetcherror))
            log.write(": \n\n")
            log.write("ERROR:")
            log.write(str(traceback.format_exc()))  #FOR DEBUGGING
            log.write("\n\n")
            log.write("DATA:\n")
            log.write(str(parameters))              #FOR DEBUGGING


    print("\n  ------------------------ ")
    string = "FINISHED: " + str(URL)
    print(string.center(63))
    log.write("PROCESSED: ")
    log.write(str(URL))
    log.write("\n")
    log.flush()
    print("  ------------------------ ")

    print("RUNNING SUMMARY:")
    print("Elapsed time: ",end="")
    checkpoint = timer()
    print(checkpoint - start,end="")
    print(" seconds\n")
    log.write("Elapsed Time: " + str(checkpoint - start))
    log.write("\n\n")
    # Must be called: a bare `log.flush` only references the method and flushes nothing.
    log.flush()

    print(str(count + noteng + failed + tooshort + fetcherror) + " Total Articles Fetched")
    print(str(count) + " Processed Articles\n")


    print(str(noteng) + " LANG_ERRORs (Article not in English)")
    print(str(tooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
    print(str(fetcherror) + " FETCH_ERRORs (Failed to fetch article)")
    print(str(failed) + " Failed Articles\n")

    firebaseRefresh()
    time.sleep(1)

    print(" ------------------------ ")

# NOTE(review): methylHalf() is not defined in this notebook; presumably it
# comes from one of the %run helper notebooks -- confirm.
methylHalf()
print("\n ------------------------ ")
print("                   FINISHED PROCESSING URLS!")
log.write("FINISHED PROCESSING: ")
log.write("URLS")
log.write("\n\n")
print("  ------------------------ \n")

print("SUMMARY:")
print(str(count + noteng + failed + tooshort + fetcherror) + " Total Articles Accessed")
print(str(count) + " Processed Articles\n")

print(str(noteng) + " LANG_ERRORs (Article not in English)")
print(str(tooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
print(str(fetcherror) + " FETCH_ERRORs (Failed to fetch article)")
print(str(failed) + " Failed Articles\n")

print(" ------------------------ \n")

print("Elapsed time: ",end="")
checkpoint = timer()
print(checkpoint - start,end="")
print(" seconds\n")
print("SHUTTING DOWN")
log.write("Elapsed Time: " + str(checkpoint - start))
log.write("\n\n")
log.write("SHUTTING DOWN")
log.flush()

log.close()

RUN URL MODULE

Building domain...



unable to cache TLDs in file /usr/local/lib/python3.5/dist-packages/tldextract/.tld_set: [Errno 13] Permission denied: '/usr/local/lib/python3.5/dist-packages/tldextract/.tld_set'


Domain building complete for: www.straitstimes.com
1: Processed article #1
2: SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED! #1
http://www.straitstimes.com/files/donald-trump-scraps-key-obamacare-subsidies-urges-democrats-to-fix-broken-mess
3: Processed article #2
4: Processed article #3
5: Processed article #4
6: Processed article #5
7: Processed article #6
8: Processed article #7
9: Processed article #8
10: Processed article #9
11: Processed article #10
12: Processed article #11
13: Processed article #12
14: SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED! #2
http://www.straitstimes.com/multimedia/photos/in-pictures-the-stars-are-out-for-the-san-sebastian-international-film-festival-in
15: Processed article #13
16: Processed article #14
17: Processed article #15
18: Processed article #16
19: SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED! #3
http://www.straitstimes.com/files/the-lives-they-live-born-chinese-but-raised-by-indian-parents
20: Processed article #17
2