In [1]:
import os
import tweepy

# Twitter credentials, read from the environment so that no secret is stored
# in the notebook. os.getenv returns None when a variable is not set.
# NOTE(review): the environment variable names say "CUSTOMER" while the Python
# names say "consumer" — confirm the spelling matches the deployment setup.
consumer_key = os.getenv('TWITTER_CUSTOMER_KEY')
consumer_secret = os.getenv('TWITTER_CUSTOMER_SECRET')

# Shared tweepy client built from an OAuthHandler holding the two values above;
# twitter_get_text uses it to request user timelines.
api_twitter = tweepy.API(tweepy.OAuthHandler(consumer_key, consumer_secret))


def twitter_get_text(uid:str) -> list:
    """Return the text of the timeline entries for a Twitter screen name.

    Args:
        uid: Twitter screen name. The placeholder ``'-'`` yields an empty
            list without touching the API client.

    Returns:
        A list with the ``'text'`` value of each status returned by
        ``api_twitter.user_timeline`` (requested with ``count=200``).
    """
    if uid == '-':
        return []
    timeline = api_twitter.user_timeline(screen_name=uid, count=200)
    return [status._json['text'] for status in timeline]

In [2]:
import os
from urllib.request import urlopen
from json import loads as json_load

# URL template for a page's posts on the Graph API. The key and secret from the
# environment are filled in once here; the `{company}` placeholder is formatted
# back into itself so that it survives for the per-company .format() call made
# in facebook_get_text.
fb_access_url = (
    "https://graph.facebook.com/v2.6/{company}/posts"
    "?access_token={fb_key}|{fb_secret}"
).format(
    company='{company}',
    fb_key=os.getenv('FB_KEY'),
    fb_secret=os.getenv('FB_SECRET'),
)

    
def facebook_get_text(uid:str) -> list:
    """Return the message text of the posts fetched for a Facebook page.

    Args:
        uid: Page identifier substituted into ``fb_access_url``. The
            placeholder ``'-'`` yields an empty list without any request.

    Returns:
        One string per entry of the response's ``'data'`` list: the entry's
        ``'message'`` value, or ``''`` when the entry has no ``'message'`` key.

    Raises:
        KeyError: if the decoded response has no ``'data'`` key.
        Whatever ``urlopen`` raises on network or HTTP errors.
    """
    if uid == '-':
        return []
    # Use the response as a context manager so the connection is closed even
    # when reading or decoding fails (the original never closed it).
    with urlopen(fb_access_url.format(company=uid)) as response:
        payload = json_load(response.read().decode())
    return [post.get('message', '') for post in payload['data']]

In [3]:
from urllib.request import urlopen
from json import loads as json_load

# URL template for the English Wikipedia query API: JSON output, `extracts`
# property with the `exintro` and `explaintext` flags set. `{company}` is
# replaced with the page title by wiki_get_text.
wiki_access_url = (
    "https://en.wikipedia.org/w/api.php"
    "?format=json&action=query&prop=extracts"
    "&exintro=&explaintext=&titles={company}"
)


def wiki_get_text(uid:str) -> list:
    """Return the extract text fetched from Wikipedia for a page title.

    Args:
        uid: Page title substituted into ``wiki_access_url``. The placeholder
            ``'-'`` yields an empty list without any request.

    Returns:
        A one-element list holding the ``'extract'`` value of the first page
        in the response's ``query.pages`` mapping.

    Raises:
        KeyError: if the response lacks ``query``/``pages`` or the page entry
            has no ``'extract'`` key.
        Whatever ``urlopen`` raises on network or HTTP errors.
    """
    if uid == '-':
        return []
    # TODO: wikipedia get all text for page
    # Use the response as a context manager so the connection is closed even
    # when reading or decoding fails (the original never closed it).
    with urlopen(wiki_access_url.format(company=uid)) as response:
        payload = json_load(response.read().decode())
    pages = payload['query']['pages']
    # `pages` is keyed by page id; only the first entry is used.
    first_page = pages[next(iter(pages))]
    return [first_page['extract']]

In [4]:
# Text fetchers applied to every links.csv row. Order matters: the main loop
# pairs these positionally with the row's columns after the first one
# (map(map_to_foo, source_set, company[1:])).
source_set = [twitter_get_text, facebook_get_text, wiki_get_text]

In [5]:
# Lower-case English stop words that get_words removes from the token stream.
# Entries that had stray whitespace inside the literal ("been", "con",
# "nevertheless", "she", "someone", "too", "whenever") are restored, since a
# token can never contain those spaces and the words were therefore never
# filtered. The truncated reflexive pronouns ("herself", "himself", "itself",
# "myself") and the misspelt "fifty" are restored as well.
STOPWORDS = {
    "a", "about", "above", "across", "after", "afterwards",
    "again", "against", "all", "almost", "alone", "along",
    "already", "also", "although", "always", "am", "among",
    "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere",
    "are", "around", "as", "at", "back", "be", "became", "because",
    "become", "becomes", "becoming", "been", "before", "beforehand",
    "behind", "being", "below", "beside", "besides", "between", "beyond",
    "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant",
    "co", "computer", "con", "could", "couldnt", "cry", "de", "describe",
    "detail", "do", "done", "down", "due", "during", "each", "eg", "eight",
    "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even",
    "ever", "every", "everyone", "everything", "everywhere", "except", "few",
    "fifteen", "fifty", "fill", "find", "fire", "first", "five", "for",
    "former", "formerly", "forty", "found", "four", "from", "front", "full",
    "further", "get", "give", "go", "had", "has", "hasnt", "have", "he",
    "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon",
    "hers", "herself", "him", "himself", "his", "how", "however", "hundred",
    "i", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
    "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd",
    "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more",
    "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
    "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody",
    "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often",
    "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise",
    "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming",
    "seems", "serious", "several", "she", "should", "show", "side", "since",
    "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something",
    "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten",
    "than", "that", "the", "their", "them", "themselves", "then", "thence", "there",
    "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they",
    "thick", "thin", "third", "this", "those", "though", "three", "through",
    "throughout", "thru", "thus", "to", "together", "too", "top", "toward",
    "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon",
    "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein",
    "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever",
    "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
    "you", "your", "yours", "yourself", "yourselves", "w/",
}

In [6]:
def map_to_foo(f, data:str) -> list:
    """Apply ``f`` to ``data`` and return whatever ``f`` returns.

    This adapter lets ``map`` walk a sequence of functions and a sequence of
    arguments in parallel, calling each function on its own argument.
    """
    result = f(data)
    return result

In [7]:
import re


def get_words(string:str) -> list:
    """Split text into lower-case tokens with stop words removed.

    Anything starting with ``http``, ``@`` or ``#`` is removed up to the next
    whitespace. The rest is split on runs of whitespace and the punctuation
    characters ``. , ! ? : … " ' - +``. Empty pieces are dropped, tokens are
    lower-cased, and tokens found in ``STOPWORDS`` are discarded.

    Args:
        string: Raw text of one post or extract.

    Returns:
        The remaining tokens in their original order.
    """
    string = re.sub(r"(http|@|#)\S+", "", string)
    # Raw string: the original non-raw literal contained the invalid escapes
    # "\-" and "\+". "\s" replaces the bare space so that newlines and tabs
    # also separate words instead of gluing them into a single token.
    pieces = re.split(r"""[\s.,!?:…"'\-+]+""", string)
    lowered = (piece.lower() for piece in pieces if piece)
    return [token for token in lowered if token not in STOPWORDS]

In [8]:
import operator


def get_top_ten_sequences(words:list, numw:int) -> list:
    """Count windows of ``numw`` consecutive tokens and return the ten most frequent.

    Args:
        words: Iterable of token sequences (one sequence per text). Each
            sequence is scanned on its own, so windows never span two texts.
        numw: Window length in tokens.

    Returns:
        Up to ten ``(frozenset_of_tokens, count)`` pairs ordered by
        descending count.

    Note:
        Windows are keyed by ``frozenset``, so token order and repeated
        tokens inside a window are not distinguished.
    """
    counts = dict()
    for tokens in words:
        tokens = list(tokens)
        # "+ 1" includes the final window. The original stopped one short, so
        # the last window of every text was dropped and a text with exactly
        # numw tokens contributed nothing. Texts shorter than numw give an
        # empty range and are skipped.
        for start in range(len(tokens) - numw + 1):
            key = frozenset(tokens[start:start + numw])
            counts[key] = counts.get(key, 0) + 1
    # Ascending sort followed by a reversed slice of the last ten items.
    return sorted(counts.items(), key=operator.itemgetter(1))[:-11:-1]

In [9]:
import csv
from functools import reduce
from itertools import repeat


# Module-level dictionary, created empty. None of the visible cells read or
# write it.
RD = {}


def merge_lists(a, b):
    """Return a new list holding the items of ``a`` followed by those of ``b``.

    Neither argument is modified; both may be any iterable.
    """
    return [*a, *b]


def fstophrase(fs:frozenset)->str:
    """Render a set of tokens as one space-separated phrase.

    The tokens are joined in sorted order. A frozenset has no defined
    iteration order (for strings it varies between interpreter runs), so
    joining it directly produced a different phrase for the same set from one
    run to the next.

    Args:
        fs: Collection of string tokens.

    Returns:
        The tokens joined by single spaces, or ``''`` for an empty collection.
    """
    return ' '.join(sorted(fs))


# Main pipeline: for every row of links.csv (first column = company name, the
# remaining columns = one identifier per entry of source_set) fetch the texts,
# save them to "<company>.txt", and collect the most frequent 1-, 2- and
# 3-token sequences. Word counts over all companies are accumulated in `awd`.
# newline='' is how the csv module expects its input file to be opened.
with open('links.csv', 'r', newline='') as csvfile:
    companiesd = dict()
    awd = dict()
    for company in csv.reader(csvfile):
        if not company:
            # csv.reader yields [] for blank lines; company[0] would fail.
            continue
        # Call each fetcher on its own column and concatenate the results.
        # The [] initial value keeps reduce from raising on a row that has a
        # name but no identifier columns.
        ctexts = reduce(merge_lists, map(map_to_foo, source_set, company[1:]), [])
        # Explicit UTF-8 so the dump does not depend on the platform default
        # encoding.
        with open('{company}.txt'.format(company=company[0]), 'w', encoding='utf-8') as cout:
            for text in ctexts:
                print(text, file=cout)
        cseq = list(map(get_words, ctexts))
        for words in cseq:
            for word in words:
                awd[word] = awd.get(word, 0) + 1
        # One {phrase: count} dict per window length 1, 2 and 3.
        crezd = [
            {fstophrase(seq): count for seq, count in get_top_ten_sequences(cseq, numw)}
            for numw in range(1, 4)
        ]
        companiesd[company[0]] = crezd
    # Ten most frequent words across all companies, highest count first.
    companiesd['all words'] = sorted(awd.items(), key=operator.itemgetter(1))[:-11:-1]

In [10]:
import pprint

# Write the collected results to result.txt in pretty-printed form.
# The original built `pp` with indent=4 but then called the module-level
# pprint.pprint, so the configured printer was never used. Bind the printer to
# the output stream and print through it. UTF-8 is set explicitly so the
# output does not depend on the platform default encoding.
with open('result.txt', 'w', encoding='utf-8') as pretty_output:
    pp = pprint.PrettyPrinter(indent=4, stream=pretty_output)
    pp.pprint(companiesd)