In [None]:
import requests
from bs4 import BeautifulSoup
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import re

def claim_reader(raw):
    #extract claim, people IDs and place IDs from quote chunk
    metadata = {
                'speaker': '',
                'named_people': [],
                'geographies': []}
    claim = ''
    extra = ''
    if raw.count('"') == 2:
        parts = raw.split('"')
        claim = parts[1]
        extra = " ".join((parts[0], parts[2]))
    else:
        claim = raw
    return claim   

def clean_tweet_stub(raw):
    #Remove trailing retweet/like data from Google stub for tweet listing
    idx_string = " Retweets(.*); Likes"
    compiled = re.compile(idx_string)
    search_str = compiled.search(raw)
    search_str = search_str.group(1).strip()
    index = raw.index(" Retweets %s; Likes" % search_str)
    tweet_content = raw[:index]
    return tweet_content

def tag_pos(sentence):
    words = tokenize(sentence)
    words = [nltk.pos_tag(words) for word in words]
    return words
    
def ID_people(text):
    pass

def ID_geog(text):
    pass

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text, stem=False):
    stemmer = PorterStemmer()
    tokens = nltk.word_tokenize(text)
    if stem:
        tokens = stem_tokens(tokens, stemmer)
    return tokens

def tf_idf_cosine_comparison (raw_text):
    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(raw_text)
    return (tfidf * tfidf.T).A[0,1]

def google_claim (claim):
    claim_google = claim.replace('"', '%22')
    url = 'https://www.google.com/search?q='+claim_google+'&aqs=chrome.2.69i57j69i65j0l4.5769j0j4&sourceid=chrome&ie=UTF-8'
    headers = {'user-agent': 'Mozilla/5.0'}

    claim = claim_reader(claim)

    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, "html.parser")

    results = soup.find_all("div", {"class":"g"})
    google_links = []
    google_sources = []
    google_headlines = []
    google_stubs = []
    twitter_users = []

    for result in results:
        link = result.find_all("a")[0].get("href")
        #Get rid of the junk characters that prefix links
        link = link[7:]
        google_links.append(link)

        headline = result.find_all("a")[0].text
        google_headlines.append(headline)

        stub = result.find_all("span", {"class":"st"})[0].text
        google_stubs.append(stub)

        index = link.index('//')+2
        source_link = link[index:]
        end_point = source_link.index('/')
        domain = source_link[:end_point]
        if "twitter.com" in domain:
            un_start = source_link.index('.com/')+5
            un = source_link[un_start:]
            un_end = un.index('/')
            un = un[:un_end]
            twitter_users.append(un)
            stub = clean_tweet_stub(stub)
            #Call comparison function - ADD IF STATEMENT TO VERIFY PERSON CLAIMING IS CORRECT
            similarity_score = tf_idf_cosine_comparison([claim, stub])
            print("Claim: ",claim)
            print("Stub: ",stub)
            print("Similarity score:", similarity_score)
            
        google_sources.append(domain)

def google_claim (claim):
    claim_twitter = claim.replace('"', '%22')
    url='https://twitter.com/search?q='+claim_twitter
    headers = {'user-agent': 'Mozilla/5.0'}

    claim = claim_reader(claim)

    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, "html.parser")

    results = soup.find_all("div", {"class":"stream"})
    google_links = []
    google_sources = []
    google_headlines = []
    google_stubs = []
    twitter_users = []

    for result in results:
        link = result.find_all("a")[0].get("href")
        #Get rid of the junk characters that prefix links
        link = link[7:]
        google_links.append(link)

        headline = result.find_all("a")[0].text
        google_headlines.append(headline)

        stub = result.find_all("span", {"class":"st"})[0].text
        google_stubs.append(stub)

        index = link.index('//')+2
        source_link = link[index:]
        end_point = source_link.index('/')
        domain = source_link[:end_point]
        if "twitter.com" in domain:
            un_start = source_link.index('.com/')+5
            un = source_link[un_start:]
            un_end = un.index('/')
            un = un[:un_end]
            twitter_users.append(un)
            stub = clean_tweet_stub(stub)
            #Call comparison function - ADD IF STATEMENT TO VERIFY PERSON CLAIMING IS CORRECT
            similarity_score = tf_idf_cosine_comparison([claim, stub])
            print("Claim: ",claim)
            print("Stub: ",stub)
            print("Similarity score:", similarity_score)

        google_sources.append(domain)

claim = '‚Äúglobal warming was created by the Chinese to make US manufacturing non-competitive.‚Äù Donald Trump'.replace('‚Äù', '"').replace('‚Äú', '"')

google_claim(claim)

In [None]:
from eventregistry import *
import requests
from bs4 import BeautifulSoup
import csv

def source_quote_eventregistry(quote):
    # Setup the event registry access
    er = EventRegistry()
       
    print('\n\nClaim is: '+quote)
    # need to shorten the claim to 15 words or less for free account
    quote=quote.split()[:15]
    print('Truncated claim is: '+quote)
    q = QueryArticles(keywords = quote)
    q.addRequestedResult(RequestArticlesInfo())

    known_sources=['bbc.co.uk','guardian.com','reuters.com','theherald.com']
    # JSON return data from event registry query - just grabbing the first instance here, but potentially we could
    # filter by news source - say we know how to parse BBC/guardian/etc pages and we have confidence in their accuracy
    jsonResults=er.execQuery(q)

    print json.dumps(er.execQuery(q), sort_keys=True, indent=4, separators=(',', ': '))
    #url=json.dumps(jsonResults["articles"]["results"][0]["url"])[1:-1]
    sources=[]
    # if there were some results
    if not jsonResults["error"]:
        for article in jsonResults["articles"]["results"]:
            # we've now got the article url from result set, check if it's in the known sources list
            sources.append(json.dumps(article["url"])[1:-1])

#             for known_source in known_sources:
#                 if known_source in json.dumps(article["url"])[1:-1]:
#                     sources.append(json.dumps(article["url"])[1:-1])
#                     break

    if sources:
        print("Found "+str(len(sources))+" sources")
        for url in sources:
            print("Checking: "+url)
            # use this url to grab the original article
            headers = {'user-agent': 'Mozilla/5.0'}
            r = requests.get(url, headers=headers)
            soup = BeautifulSoup(r.content, "html.parser")
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]

            # the original article text
            articleText=soup.getText().encode('utf-8').strip()
            #print(articleText)

            # the most primitive form of checking if the claim is in the article:
            if claim in str(articleText):
                print("Claim is present in article")
                # break out of searching the sources when we find a valid one
                return url
                break
            else:
                print("Claim doesn't appear")
    else:
        # if there are no accepted sources
        print("Couldn't find a source for this quote")

with open('sampledata.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        source_quote_eventregistry(row[1])
        #source_quote_eventregistry('The concept of global warming was created by and for the Chinese')

In [26]:
import json
import urllib

# takes a text name and searches the google knowledge tree for the top hit
# then returns the url of their wikipedia page and their schema.org id

def find_person(name):
    api_key = open('.api_key').read()
    #api_key='AIzaSyBIE1JFd1qUFpSIkD4fPUdlq5xPR3jQjP4'
    query = 'Donald Trump'
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': query,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    url = service_url + '?' + urllib.urlencode(params)
    response = json.loads(urllib.urlopen(url).read())
    #if 'Person' in response['itemListElement']['@type']:
    #    print 'person query!'
    wiki_url=''
    schema_id=''
    for element in response['itemListElement']:
        if 'Person' in element['result']['@type']:
            print 'got person'
            wiki_url=element['result']['detailedDescription']['url']
            schema_id=element['result']['@id']
            break # after the top hit
    #     print element['result']['name'] + ' (' + str(element['resultScore']) + ')'
    # print json.dumps(response, indent=4, sort_keys=True)
    return wiki_url, schema_id

find_person('Donald Trump')


got person


(u'https://en.wikipedia.org/wiki/Donald_Trump', u'kg:/m/0cqt90')

In [3]:
import tweepy

con_key = open('.twitter_con_key').read()
con_sec = open('.twitter_con_sec').read()
acc_key = open('.twitter_acc_key').read()
acc_sec = open('.twitter_acc_sec').read()

auth = tweepy.OAuthHandler(con_key, con_sec)
auth.set_access_token(acc_key, acc_sec)

api = tweepy.API(auth)

public_tweets = api.home_timeline()
for tweet in public_tweets:
    print tweet.text

BREAKING: AP POLL ALERT: Clemson slips to No. 4; Michigan, Nebraska, Baylor rise; Miami, Virginia Tech out; North Carolina, LSU in.
The owner of this $12 million house was once a drug addict and a bank robber https://t.co/ZO2nWHt6i1 https://t.co/kEF1LtnuK0
Philippines leader Rodrigo Duterte says being sexually abused as a child had major influence on his politics https://t.co/45qxjcPJ9o
Norma Percy's Inside Obama's White House part 1 has just started on BBC2
RT @FT: EU leaders to discuss Russian political meddling https://t.co/h6hEff7j6M
There may have been something significant in the bizarre 8 A.M. juxtaposition of the news about Trump and Dylan.‚Ä¶ https://t.co/6EiBkBZeND
Fears mount on @realDonaldTrump's "rigged election" rhetoric https://t.co/t7D8iXwJkU | AP Photo https://t.co/xLlPYPYkhy
Randomness in correlations. An exact derivation for n small.
Started in a Moscow caf√© and completed on plane back. https://t.co/cQdWUHrkxB
RT @SkillageSteve: Hate people saying TV Burp was a fluk

In [9]:
import json
import requests
api_key=open('.webhose').read()
print api_key
search_term='Donald%20Trump'

url='https://webhose.io/search?token='+api_key+'&format=json&q='+search_term
print url
headers={
    "Accept": "text/plain"
    }
r = requests.get(url, headers=headers)
print(r.content)    


ce586f3a-9784-4353-b3a3-6348b0f60d0d
https://webhose.io/search?token=ce586f3a-9784-4353-b3a3-6348b0f60d0d&format=json&q=Donald%20Trump
{
  "posts": [
    {
      "thread": {
        "uuid": "e25a3f139ee20cb1a6753b8d7e19b9bf3b839d85",
        "url": "http://omgili.com/r/.0rSU5LtMgz6tqUIJgu8PsECBa0LOHO09I_TrMo6HhEQHAvHi0386KH0J9PvgQISBLLPaxSPDp12dkiOvkbtuA--",
        "site_full": "fortune.com",
        "site": "fortune.com",
        "site_section": "http://fortune.com",
        "site_categories": [
          "business"
        ],
        "section_title": "Donald Trump Blasts the Press Over Sexual Assault Stories: ‚ÄòThey Will Lie, Lie, Lie‚Äô ‚Äì Fortune",
        "title": "Donald Trump Blasts the Press Over Sexual Assault Stories: ‚ÄòThey Will Lie, Lie, Lie‚Äô",
        "title_full": "Donald Trump Blasts the Press Over Sexual Assault Stories: ‚ÄòThey Will Lie, Lie, Lie‚Äô",
        "published": "2016-10-13T21:48:42.340+03:00",
        "replies_count": 0,
        "participants_count": 1

In [37]:
from bs4 import BeautifulSoup
import requests

# A function to search a wikipedia page for a phrase
def search_for_term(url,term):
    respond = requests.get(url)
    soup = BeautifulSoup(respond.text)
    content_text = soup.find(id="content").text
    # if phrase is found
    if term in content_text:
        print 'found'
    
url='https://en.wikipedia.org/wiki/Donald_Trump'
search_for_term(url,'global warming')


225791
found
