In [None]:
import requests
from bs4 import BeautifulSoup
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import re

def claim_reader(raw):
    """Pull the quoted claim text out of a raw quote chunk.

    When ``raw`` contains exactly two double-quote characters, the text
    between them is returned; in every other case ``raw`` is returned
    unchanged. Only the claim is extracted: speaker, people and place
    identification are not implemented here.
    """
    if raw.count('"') != 2:
        return raw
    # Exactly two quote marks split the chunk into before / claim / after.
    _before, quoted, _after = raw.split('"')
    return quoted

def clean_tweet_stub(raw):
    """Remove trailing retweet/like data from a Google stub for a tweet listing.

    Everything from the first " Retweets...; Likes" marker onwards is cut
    off. A stub that does not contain the marker is returned unchanged
    instead of raising.

    Args:
        raw: snippet text of a search result.

    Returns:
        The text that precedes the retweet/like marker, or ``raw`` itself
        when no marker is present.
    """
    match = re.search(" Retweets(.*); Likes", raw)
    if match is None:
        return raw
    # Cut at the position the regex matched. Rebuilding the marker from the
    # captured group and searching for it again fails whenever the group has
    # surrounding whitespace or is empty.
    return raw[:match.start()]

def tag_pos(sentence):
    """Tokenise ``sentence`` and tag each token with its part of speech.

    Args:
        sentence: text to analyse.

    Returns:
        The result of ``nltk.pos_tag`` for the sentence's tokens: one
        ``(token, tag)`` pair per token.
    """
    # Tag the sentence once. Tagging the full token list again for every
    # token only produced n identical copies of the same result.
    return nltk.pos_tag(tokenize(sentence))
    
def ID_people(text):
    """Placeholder: not implemented yet; ignores ``text`` and returns None."""
    return None

def ID_geog(text):
    """Placeholder: not implemented yet; ignores ``text`` and returns None."""
    return None

def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem`` applied to every token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text, stem=False):
    """Split ``text`` into word tokens with ``nltk.word_tokenize``.

    When ``stem`` is true, each token is additionally reduced with a
    ``PorterStemmer`` via ``stem_tokens``.
    """
    words = nltk.word_tokenize(text)
    if not stem:
        return words
    return stem_tokens(words, PorterStemmer())

def tf_idf_cosine_comparison(raw_text):
    """Score how similar the first two documents in ``raw_text`` are.

    Args:
        raw_text: sequence of documents (strings); only the similarity of
            element 0 and element 1 is returned, so at least two documents
            are required.

    Returns:
        Entry [0, 1] of the pairwise TF-IDF product matrix.
    """
    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(raw_text)
    # TfidfVectorizer L2-normalises each row by default, so the product of
    # the matrix with its transpose holds pairwise cosine similarities.
    # ``.dot`` is a matrix product for every sparse container type, and
    # ``.toarray()`` replaces the ``.A`` shortcut that is no longer
    # available in recent SciPy releases.
    pairwise = tfidf.dot(tfidf.T).toarray()
    return pairwise[0, 1]

# Request headers shared by both search scrapers.
_SEARCH_HEADERS = {'user-agent': 'Mozilla/5.0'}


def _split_result_link(link):
    """Return ``(domain, twitter_user)`` for a search-result link.

    ``domain`` is ``None`` when the link has no ``//`` separator.
    ``twitter_user`` is ``None`` unless the domain contains "twitter.com",
    in which case it is the first path segment after the domain.
    """
    scheme_end = link.find('//')
    if scheme_end == -1:
        return None, None
    domain, _, path = link[scheme_end + 2:].partition('/')
    if "twitter.com" not in domain:
        return domain, None
    return domain, path.split('/', 1)[0]


def _score_tweet_results(claim_text, html, result_class):
    """Compare ``claim_text`` with every tweet found in a search-result page.

    Each ``div`` with CSS class ``result_class`` is treated as one result.
    For results that link to twitter.com, the snippet is cleaned with
    ``clean_tweet_stub`` and scored against the claim with
    ``tf_idf_cosine_comparison``; claim, snippet and score are printed.

    Returns:
        A list with one dict per scored tweet, with the keys ``'link'``,
        ``'user'``, ``'stub'`` and ``'similarity'``.
    """
    soup = BeautifulSoup(html, "html.parser")
    matches = []
    for result in soup.find_all("div", {"class": result_class}):
        anchors = result.find_all("a")
        stubs = result.find_all("span", {"class": "st"})
        if not anchors or not stubs:
            # No link or no snippet: nothing to compare against.
            continue

        link = anchors[0].get("href") or ''
        if not link.startswith(('http://', 'https://')):
            # Get rid of the 7 junk characters that prefix wrapped links
            # (presumably Google's "/url?q=" redirect prefix - TODO confirm).
            link = link[7:]

        domain, twitter_user = _split_result_link(link)
        if twitter_user is None:
            continue

        stub = clean_tweet_stub(stubs[0].text)
        # TODO: verify that twitter_user is the person the claim is attributed to.
        similarity_score = tf_idf_cosine_comparison([claim_text, stub])
        print("Claim: ",claim_text)
        print("Stub: ",stub)
        print("Similarity score:", similarity_score)
        matches.append({
            'link': link,
            'user': twitter_user,
            'stub': stub,
            'similarity': similarity_score,
        })
    return matches


def google_claim (claim):
    """Search Google for ``claim`` and score tweets found in the results.

    The raw claim (quote marks and speaker included) is the search query;
    the text extracted by ``claim_reader`` is what each tweet snippet is
    compared with.

    Args:
        claim: raw claim string, e.g. '"some quote" Speaker Name'.

    Returns:
        The list of scored tweet matches from ``_score_tweet_results``.
    """
    # Let requests build the query string so that spaces, quotes and
    # characters such as '&' or '#' in the claim are encoded correctly.
    params = {
        'q': claim,
        'aqs': 'chrome.2.69i57j69i65j0l4.5769j0j4',
        'sourceid': 'chrome',
        'ie': 'UTF-8',
    }
    r = requests.get('https://www.google.com/search', params=params,
                     headers=_SEARCH_HEADERS)
    return _score_tweet_results(claim_reader(claim), r.content, "g")


def twitter_claim (claim):
    """Search Twitter for ``claim`` and score tweets found in the results.

    This was previously a second definition named ``google_claim`` that
    silently replaced the Google search above. It reuses the same result
    parsing, only with the "stream" container class.
    NOTE(review): that parsing was written for Google's markup and has not
    been checked against Twitter's search page.

    Args:
        claim: raw claim string, e.g. '"some quote" Speaker Name'.

    Returns:
        The list of scored tweet matches from ``_score_tweet_results``.
    """
    r = requests.get('https://twitter.com/search', params={'q': claim},
                     headers=_SEARCH_HEADERS)
    return _score_tweet_results(claim_reader(claim), r.content, "stream")

# Demo claim: the typographic quotes are swapped for plain double quotes,
# which is the character claim_reader counts to find the quoted span.
claim = '“global warming was created by the Chinese to make US manufacturing non-competitive.” Donald Trump'
for curly_quote in ('”', '“'):
    claim = claim.replace(curly_quote, '"')

google_claim(claim)

In [None]:
from eventregistry import *
import requests
from bs4 import BeautifulSoup
import csv

def source_quote_eventregistry(quote, known_sources=None):
    """Look for a news article that contains ``quote`` verbatim.

    The first 15 words of the quote are sent to Event Registry as the
    keyword query. Every article URL that comes back is downloaded, its
    text is extracted, and the first URL whose text contains the full
    quote is returned.

    Args:
        quote: claim text to look for.
        known_sources: optional iterable of domain substrings (for example
            ``['bbc.co.uk', 'reuters.com']``). When given, only article
            URLs containing one of them are checked. The default ``None``
            checks every returned URL.

    Returns:
        The URL of the first article whose text contains ``quote``, or
        ``None`` when no article matches or the query returns nothing.
    """
    # Setup the event registry access
    er = EventRegistry()

    print('\n\nClaim is: '+quote)
    # need to shorten the claim to 15 words or less for free account
    truncated_quote = ' '.join(quote.split()[:15])
    print('Truncated claim is: '+truncated_quote)
    q = QueryArticles(keywords = truncated_quote)
    q.addRequestedResult(RequestArticlesInfo())

    # Run the query once and reuse the result for both logging and parsing.
    jsonResults = er.execQuery(q)
    print(json.dumps(jsonResults, sort_keys=True, indent=4, separators=(',', ': ')))

    sources = []
    # A response without an "error" entry is treated as successful.
    if not jsonResults.get("error"):
        for article in jsonResults.get("articles", {}).get("results", []):
            article_url = article.get("url")
            if not article_url:
                continue
            if known_sources is None or any(known in article_url for known in known_sources):
                sources.append(article_url)

    if not sources:
        # if there are no accepted sources
        print("Couldn't find a source for this quote")
        return None

    # The page text is unicode, so decode a byte-string quote (Python 2
    # csv rows) before comparing.
    needle = quote
    if isinstance(needle, bytes):
        needle = needle.decode('utf-8', 'replace')

    print("Found "+str(len(sources))+" sources")
    headers = {'user-agent': 'Mozilla/5.0'}
    for url in sources:
        print("Checking: "+url)
        # use this url to grab the original article
        try:
            r = requests.get(url, headers=headers)
        except requests.RequestException as err:
            # One unreachable article should not abort the whole search.
            print("Could not fetch: "+url+" ("+str(err)+")")
            continue
        soup = BeautifulSoup(r.content, "html.parser")
        for tag in soup(['style', 'script', '[document]', 'head', 'title']):
            tag.extract()

        # the original article text
        article_text = soup.getText().strip()

        # the most primitive form of checking if the claim is in the article:
        if needle in article_text:
            print("Claim is present in article")
            # stop searching the sources when we find a valid one
            return url
        print("Claim doesn't appear")
    return None

# Try to source the claim in the second column of every data row.
with open('sampledata.csv', 'rb') as csvfile:
    rows = csv.reader(csvfile, delimiter=',')
    next(rows)  # skip the first row (presumably a header)
    for record in rows:
        source_quote_eventregistry(record[1])
        #source_quote_eventregistry('The concept of global warming was created by and for the Chinese')

In [None]:
import json
import urllib

# Takes a name as text and searches the Google Knowledge Graph for the top
# matching person, then returns the URL from that result's detailed
# description (their Wikipedia page) and the result's `@id`.

def find_person(name):
    """Look ``name`` up in the Google Knowledge Graph Search API.

    The API key is read from the file ``.api_key`` in the working
    directory. The first result whose ``@type`` includes 'Person' wins.

    Args:
        name: free-text name to search for.

    Returns:
        A ``(wiki_url, schema_id)`` tuple: the ``detailedDescription`` URL
        and the ``@id`` of the first person result. Either value is an
        empty string when it is missing or no person was found.
    """
    import requests  # local import so this cell runs on its own

    # Strip surrounding whitespace (e.g. a trailing newline) from the key.
    with open('.api_key') as key_file:
        api_key = key_file.read().strip()

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
        'query': name,
        'limit': 10,
        'indent': True,
        'key': api_key,
    }
    response = requests.get(service_url, params=params).json()

    wiki_url=''
    schema_id=''
    for element in response.get('itemListElement', []):
        result = element.get('result', {})
        if 'Person' in result.get('@type', []):
            print('got person')
            # Not every entity carries a detailed description.
            wiki_url = result.get('detailedDescription', {}).get('url', '')
            schema_id = result.get('@id', '')
            break # after the top hit
    return wiki_url, schema_id

# Example lookup; evaluates to the (wiki_url, schema_id) tuple returned by find_person.
find_person('Donald Trump')


In [None]:
from bs4 import BeautifulSoup
import requests

# A function to search a wikipedia page for a phrase
def search_for_term(url,term):
    """Check whether ``term`` occurs in the text of the page at ``url``.

    The text of the element with ``id="content"`` is searched; when the
    page has no such element the whole document text is used instead.
    Prints 'found' on a hit.

    Args:
        url: address of the page to download.
        term: phrase to look for (case-sensitive substring match).

    Returns:
        ``True`` when the phrase is present, ``False`` otherwise.
    """
    respond = requests.get(url)
    # Name the parser explicitly, as the other cells do.
    soup = BeautifulSoup(respond.text, "html.parser")
    content = soup.find(id="content")
    content_text = (content if content is not None else soup).text
    found = term in content_text
    # if phrase is found
    if found:
        print('found')
    return found
    
# Example: look for the phrase 'global warming' on the Donald Trump Wikipedia page.
url='https://en.wikipedia.org/wiki/Donald_Trump'
search_for_term(url,'global warming')


In [None]:
import json

import requests

# Query the FAROO API for news about "Donald Trump" and pretty-print the reply.
# The '&' before 'start' keeps the paging options out of the q parameter.
url = 'http://www.faroo.com/api?q=' + 'Donald%20Trump' + '&start=1&length=10&l=en&src=news&f=json'
response = requests.get(url).json()
print(json.dumps(response, indent=4, sort_keys=True))

In [1]:
import twitter_trawling as twitter
# Twitter_Access comes from the local twitter_trawling module (not shown here).
# presumably search_twitter returns the matching tweet texts, as in the list
# displayed below this cell - TODO confirm against twitter_trawling.
twitter_search=twitter.Twitter_Access()
twitter_search.search_twitter('Donald Trump')

[u"NBC pulls Donald Trump-inspired 'Law &amp; Order: SVU' episode until after election https://t.co/Irk0o0yDd3",
 u'#SM Howard Stern Explains Why He Won\u2019t \u2018Betray\u2019 Donald Trump By Replaying Old Interviews\u2026 https://t.co/OANuG6qWoL',
 u'#SM Taylor Swift Won&amp;rsquo;t Save Us From Donald Trump: As Donald J. Trump creeps (literally\u2026 https://t.co/R1bVNAiILh',
 u'Melania Trump Defends Husband\u2019s Attacks on Bill Clinton\u2019s Past: \u2018They\u2019re Asking for It\u2019: In\u2026 https://t.co/vt13llUBcs',
 u'#SM Amber Tamblyn Channels Donald Trump To Perform Color Me Badd In This \u2018Lip Sync Battle\u2026 https://t.co/w4mkXXe0TJ',
 u'Amy Schumer -- Booed by Donald Trump Fans at Tampa Show (VIDEO): Amy Schumer made the\u2026 https://t.co/qMmPacsHAH',
 u'#SM Alec Baldwin Is Basically a Teeny, Tiny Donald Trump in Dreamworks\u2019 The Boss Baby\u2026 https://t.co/xQaNXkXCnp',
 u'Amber Tamblyn Mocks Donald Trump on \u2018Lip Sync Battle\u2019: Amber Tamblyn dons 

In [1]:
%load_ext autoreload
import text_analysis as ta
import csv
# Print the fourth column of every data row followed by the entities that
# Text_Analyser.extract_entities returns for it.
analyser = ta.Text_Analyser()
with open('sampledata.csv', 'rb') as csvfile:
    rows = csv.reader(csvfile, delimiter=',')
    next(rows)  # skip the first row (presumably a header)
    for record in rows:
        sentence = record[3]
        print(sentence)
        print(analyser.extract_entities(str(sentence)))
        

The concept of global warming was created by and for the Chinese in order to make US manufacturing non-competitive.
['Chinese']
This TPP sets the gold standard in trade agreements to open free, transparent, fair trade, the kind of environment that has the rule of law and a level playing field.
[]
The Obama administration has doubled the US national debt in eight years.
[]
The USA won the most medals at the 2016 Rio Olympics.
[]
The US employment rate is rising.
[]
The US population is 320 million.
[]
Donald Trump did not support the Iraq War.
['Iraq']
President Obama signed an executive order banning the Pledge of Allegiance in public schools.
['Allegiance']
Corey Lewandowski is Donald Trump’s campaign manager.
[]
Hillary Clinton is the Democratic candidate for President.
[]
Donald Trump "was one of the people who rooted for the housing crisis. He said back in 2006, ‘Gee, I hope it does collapse because then I can go in and buy some and make some money.’
[]
USA has the lowest self empl

In [12]:
import nltk
