### IMPORT LIBRARIES AND SET UP

In [1]:
import tweepy
from tweepy import Stream, OAuthHandler, StreamListener
import json
import time
from collections import Counter
import re
import pandas as pd
from langdetect import detect
import pyprind
import deepdish as dd

In [2]:
#import secret codes
from twitter_pwd import access_token, access_token_secret, consumer_key, consumer_secret

In [194]:
# Build the OAuth handler from the consumer credentials imported from twitter_pwd
# and attach the access token pair. `auth` is the module-level handler that the
# functions below hand to tweepy.API.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

### KEY FUNCTIONS : USERS, FOLLOWERS, TIMELINES, LANGS

In [256]:
import tweepy
import pyprind
import time
import re

In [349]:
def get_account_network(account_name, rel_type='followers', max_num =100, key_words=None, 
                        min_num_tweets=0, min_num_followers=0):
    """ Given an account by account_name, 
        find all users that are linked to it via a specified relation type 'rel_type'.
        Args:
            * account_name: string. Twitter account name
            * rel_type: string. Name of the tweepy API method used to walk the
                relation (default is 'followers')
            * max_num: integer. Maximum number of 'related' users considered
            * key_words: list of strings. If specified, they are joined with '|' into
                a regular expression and only users whose location matches are kept
            * min_num_tweets: a user is kept only if it has MORE tweets than this value
            * min_num_followers: a user is kept only if it has AT LEAST this many
                followers (the default 0 therefore filters nothing)
        Returns:
            * list_people: list of tweepy user objects that passed all filters
        Raises:
            * tweepy.TweepError: when a non-timeout API error shows up again
                right after the pause-and-retry
    """
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    # very important to set count=200 MAX VALUE
    users = tweepy.Cursor(getattr(api, rel_type, 0), screen_name=account_name, count=200).items(max_num)
    # The location filter does not change between users: compile it once
    patt = re.compile('|'.join(key_words)) if key_words else None
    pbar = pyprind.ProgBar(max_num)
    list_people = []
    already_retried = False
    while True:
        # Only the fetch can raise API errors, so keep it alone inside the try
        try:
            user = next(users)
        except StopIteration:
            break
        except tweepy.TweepError as e:
            if 'Read timed out' in str(e):
                print('fallen here')
                print(e)
                time.sleep(5)
            elif already_retried:
                # second failure in a row: give up instead of sleeping forever
                raise
            else:
                already_retried = True
                time.sleep(60*16)
            # Retry through the normal loop path so that no user is silently
            # consumed and the end of the cursor is still handled above
            continue
        already_retried = False
        active_enough = (user.statuses_count > min_num_tweets
                         and user.followers_count >= min_num_followers)
        # location may be missing or None; treat it as an empty string
        location = user._json.get('location') or ''
        if active_enough and (patt is None or patt.search(location)):
            list_people.append(user)
        # progress counts fetched users only, never failed attempts
        pbar.update()
    return list_people
        

In [382]:
def get_account_tweets(account_name, max_num_twts=10):
    """ Given an account name,
        it retrieves a maximum number of tweets written or retweeted by account owner.
        It returns them in a list.
        Args:
            * account name: string. Screen_name that identifies the twitter account
            * max_num_twts: integer. Maximum number of tweets to be retrieved for each account
        Returns:
            * list_tweets: list of tweepy status objects, one per retrieved tweet.
                It may be shorter than max_num_twts (or empty) if the timeline
                ends early or a 401 error is returned
        Raises:
            * tweepy.TweepError: when a non-401 API error shows up again
                right after the pause-and-retry
    """
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    timeline = tweepy.Cursor(api.user_timeline, screen_name=account_name, count=200).items(max_num_twts)
    list_tweets = []
    already_retried = False
    while True:
        # Only the fetch can raise API errors, so keep it alone inside the try
        try:
            tw = next(timeline)
        except StopIteration:
            break
        except tweepy.TweepError as e:
            if '401' in str(e):
                # not authorized (presumably a protected timeline): keep what we have
                print(e)
                time.sleep(10)
                break
            if already_retried:
                # second failure in a row: give up instead of sleeping forever
                raise
            already_retried = True
            time.sleep(60*15)
            # Retry through the normal loop path so that no tweet is dropped and
            # the end of the timeline is still handled above
            continue
        already_retried = False
        list_tweets.append(tw)
    return list_tweets

In [383]:
def get_tweets_from_accounts(list_accounts, max_num_accounts=None, max_num_twts=10):
    """ Given a list of accounts, get tweets texts, langs and authors
        All URLs and tweet account names are removed from tweet
        texts since they are not relevant for language identification
        Args:
            * list_accounts: sequence of screen names (list or array)
            * max_num_accounts: integer. If specified, only the first
                max_num_accounts accounts are processed
            * max_num_twts: integer. Maximum number of tweets retrieved per account
        Returns:
            * texts_tweets, langs_tweets, authors_tweets: three lists of equal
                length holding, for each tweet, its cleaned text, its 'lang'
                attribute and the account it was retrieved from
    """
    texts_tweets = []
    langs_tweets = []
    authors_tweets = []
    if max_num_accounts:
        list_accounts = list_accounts[:max_num_accounts]
    # len() instead of truthiness: list_accounts may be a numpy array
    if len(list_accounts) == 0:
        return texts_tweets, langs_tweets, authors_tweets
    # mentions and URLs carry no language information
    noise_patt = re.compile(r"(@\s?[^\s]+|https?://?[^\s]+)")
    # size the bar AFTER truncation so that it can actually reach 100%
    pbar = pyprind.ProgBar(len(list_accounts))
    for acc in list_accounts:
        twts = get_account_tweets(acc, max_num_twts=max_num_twts)
        texts_tweets.extend(noise_patt.sub("", tw.text) for tw in twts)
        langs_tweets.extend(tw.lang for tw in twts)
        authors_tweets.extend([acc] * len(twts))
        pbar.update()
    return texts_tweets, langs_tweets, authors_tweets

In [260]:
def save_tweets_from_followers(screen_name, country, node_name, city=None, 
                               min_num_twts_per_acc=5, max_num_followers=None):
    """ Creates pandas dataframe with all tweets texts 
        from followers of a given account and corresponding language. 
        A dataframe with all followers info must have been previously computed 
        and saved in hdf5 format
        Args:
            * screen_name: string. Account whose followers were previously saved
            * country: string. First-level key of the location regex table
                ('ukr' or 'cat')
            * node_name: string. Node category used in the hdf5 path
            * city: string. If specified, followers are filtered with the regex of
                that city; otherwise the 'all' regex of the country is used
            * min_num_twts_per_acc: integer. Followers with fewer tweets are discarded
            * max_num_followers: integer. If specified, maximum number of followers
                whose tweets are downloaded
        Returns:
            * df_txts_langs: dataframe with columns 'texts', 'lang' and 'screen_name'.
                It is also written to 'lang_data.h5' under
                /country/node_name/screen_name/tls_followers
        Raises:
            * KeyError: if country or city is not in the location regex table
    """
    base_path = '/'.join(['',country, node_name, screen_name])
    path_load = base_path + '/followers'
    path_save = base_path + '/tls_followers'
    key_words = {'ukr':{'all':r"(–£–∫—Ä–∞—ó–Ω–∞|Ukraine|–£–∫—Ä–∞–∏–Ω–∞|–ö–∏—ó–≤|–ö–∏–µ–≤|Kiev|Kyiv|–õ—å–≤—ñ–≤|–õ—å–≤–æ–≤|–û–¥–µ—Å)", 
                        'Kiev':r"(Kiev|Kyiv|–ö–∏—ó–≤|–ö–∏–µ–≤)"}, 
                 'cat':{'all':r"(Barcel|Catal|Tarr|Llei|Ger|Gir|Badal)",
                        'Terrassa':r"(Terras|Vall)",
                        'Girona':r"(Giro|Gero)",
                        'Vic':r"(Vic|Oson)", 'Barcelona':r"(Barcel|barcel|Bcn|bcn)",
                        'Tarragona':r"(Tarrag|tarrag)", 'Lleida':r"(Lleida|Lerid|L√©rid)",
                        'Badalona':r"Badal"}}
    loc_regex = key_words[country][city if city else 'all']
    df = pd.read_hdf('lang_data.h5', path_load)
    # Build both conditions on the full frame and combine them, so the mask is
    # always aligned with the data it selects from
    has_enough_tweets = df['statuses_count'] >= min_num_twts_per_acc
    # na=False: followers without a location simply do not match
    is_resident = df['location'].str.contains(loc_regex, na=False)
    relevant_followers = df.loc[has_enough_tweets & is_resident, 'screen_name'].values
        
    texts, langs, auth = get_tweets_from_accounts(relevant_followers, 
                                                  max_num_accounts=max_num_followers)
    df_txts_langs= pd.DataFrame({'texts':texts, 'lang':langs, 'screen_name':auth})
    df_txts_langs.to_hdf('lang_data.h5', path_save)
    return df_txts_langs
    
    

In [None]:
import tweepy

class CityTweets:
    """ Groups the download helpers around one Twitter account:
        the users related to it, their timelines and the resulting
        texts/langs dataframe.
        Class attributes:
            * key_words: location regexes indexed by country and then by city
                ('all' stands for the whole country)
    """
    key_words = {'ukr':{'all':r"(–£–∫—Ä–∞—ó–Ω–∞|Ukraine|–£–∫—Ä–∞–∏–Ω–∞|–ö–∏—ó–≤|–ö–∏–µ–≤|Kiev|Kyiv|–õ—å–≤—ñ–≤|–õ—å–≤–æ–≤|–û–¥–µ—Å)", 
                    'Kiev':r"(Kiev|Kyiv|–ö–∏—ó–≤|–ö–∏–µ–≤)"}, 
                 'cat':{'all':r"(Barcel|Catal|Tarr|Llei|Ger|Gir|Badal)",
                        'Terrassa':r"(Terras|Vall)",
                        'Girona':r"(Giro|Gero)",
                        'Vic':r"(Vic|Oson)", 'Barcelona':r"(Barcel|barcel|Bcn|bcn)",
                        'Tarragona':r"(Tarrag|tarrag)", 'Lleida':r"(Lleida|Lerid|L√©rid)",
                        'Badalona':r"Badal"}}
    # mentions and URLs carry no language information
    _NOISE_REGEX = r"(@\s?[^\s]+|https?://?[^\s]+)"

    def __init__(self, account_name, min_num_tweets=0, 
                 min_num_followers=0, key_words=None):
        """ Args:
                * account_name: string. Twitter account name
                * min_num_tweets: default tweet threshold for get_account_network
                * min_num_followers: default follower threshold for get_account_network
                * key_words: optional replacement for the class-level regex table
                    (same country -> city -> regex layout)
        """
        self.account_name = account_name
        self.min_num_tweets = min_num_tweets
        self.min_num_followers = min_num_followers
        # Assigning None here would hide the class-level table that
        # save_tweets_from_followers indexes by country and city
        if key_words is not None:
            self.key_words = key_words
        # filled by get_account_network; defined here so the other methods
        # can be called before it
        self.list_people = []
        
        self.api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    @staticmethod
    def _clean_text(text):
        """ Remove @mentions and URLs from a tweet text """
        return re.sub(CityTweets._NOISE_REGEX, "", text)
        
    def get_account_network(self, rel_type='followers', max_num =100, key_words=None, 
                            min_num_tweets=None, min_num_followers=None):
        """ Find all users that are linked to self.account_name 
            via a specified relation type 'rel_type'.
            Args:
                * rel_type: string. Name of the tweepy API method used to walk the
                    relation (default is 'followers')
                * max_num: integer. Maximum number of 'related' users considered
                * key_words: list of strings. If specified, they are joined with '|'
                    into a case-insensitive regular expression and only users
                    whose location matches are kept
                * min_num_tweets: a user is kept only if it has more tweets than
                    this value. None (default) means self.min_num_tweets
                * min_num_followers: a user is kept only if it has more followers
                    than this value. None (default) means self.min_num_followers
            Returns:
                * list_people: list of tweepy user objects that passed all filters
                    (also stored in self.list_people)
            Raises:
                * tweepy.TweepError: when a non-timeout API error shows up again
                    right after the pause-and-retry
        """
        if min_num_tweets is None:
            min_num_tweets = self.min_num_tweets
        if min_num_followers is None:
            min_num_followers = self.min_num_followers
        # flags must be given at compile time: passing them to re.findall
        # together with a compiled pattern raises ValueError
        patt = re.compile('|'.join(key_words), flags=re.I) if key_words else None
        pbar = pyprind.ProgBar(max_num)
        self.list_people = []
        
        # very important to set count=200 MAX VALUE
        users = tweepy.Cursor(getattr(self.api, rel_type, 0), screen_name=self.account_name, 
                              count=200).items(max_num)
        already_retried = False
        while True:
            # Only the fetch can raise API errors, so keep it alone inside the try
            try:
                user = next(users)
            except StopIteration:
                break
            except tweepy.TweepError as e:
                if 'Read timed out' in str(e):
                    print('fallen here')
                    print(e)
                    time.sleep(5)
                elif already_retried:
                    # second failure in a row: give up instead of sleeping forever
                    raise
                else:
                    already_retried = True
                    time.sleep(60*16)
                # Retry through the normal loop path so that no user is silently
                # consumed and the end of the cursor is still handled above
                continue
            already_retried = False
            active_enough = (user.statuses_count > min_num_tweets
                             and user.followers_count > min_num_followers)
            # location may be missing or None; treat it as an empty string
            location = user._json.get('location') or ''
            if active_enough and (patt is None or patt.search(location)):
                self.list_people.append(user)
            # progress counts fetched users only, never failed attempts
            pbar.update()
        return self.list_people
    
    def get_account_tweets(self, max_num_twts=20, account_name=None):
        """ It retrieves a maximum number of tweets written or retweeted by an account owner.
            It returns them in a list.
            Args:
                * max_num_twts: integer. Maximum number of tweets to be retrieved
                * account_name: string. Screen_name that identifies the twitter account.
                    None (default) means self.account_name
            Returns:
                * list_tweets: list of tweepy status objects, one per retrieved tweet.
                    It may be shorter than max_num_twts (or empty) if the timeline
                    ends early or a 401 error is returned
            Raises:
                * tweepy.TweepError: when a non-401 API error shows up again
                    right after the pause-and-retry
        """
        if account_name is None:
            account_name = self.account_name
        list_tweets=[]
        timeline = tweepy.Cursor(self.api.user_timeline, screen_name=account_name, 
                                 count=200, include_rts = True).items(max_num_twts)
        already_retried = False
        while True:
            try:
                tw = next(timeline)
            except StopIteration:
                break
            except tweepy.TweepError as e:
                if '401' in str(e):
                    # not authorized (presumably a protected timeline): keep what we have
                    print(e)
                    time.sleep(10)
                    break
                if already_retried:
                    raise
                already_retried = True
                time.sleep(60*15)
                # retry through the normal loop path so that no tweet is dropped
                continue
            already_retried = False
            list_tweets.append(tw)
        return list_tweets
    
    def get_tweets_from_accounts(self, max_num_accounts=None, max_num_twts=20, 
                                 list_accounts=None):
        """ Given a list of accounts, get tweets texts, langs and authors
            All URLs and tweet account names are removed from tweet
            texts since they are not relevant for language identification
            Args:
                * max_num_accounts: integer. If specified, only the first
                    max_num_accounts accounts are processed
                * max_num_twts: integer. Maximum number of tweets retrieved per account
                * list_accounts: sequence of screen names or of user objects.
                    None (default) means self.list_people
            Returns:
                * texts_tweets, langs_tweets, authors_tweets: three lists of equal
                    length holding, for each tweet, its cleaned text, its 'lang'
                    attribute and the screen name it was retrieved from
        """
        if list_accounts is None:
            list_accounts = self.list_people
        if max_num_accounts:
            list_accounts = list_accounts[:max_num_accounts]
        texts_tweets = []
        langs_tweets = []
        authors_tweets = []
        # len() instead of truthiness: list_accounts may be a numpy array
        if len(list_accounts) == 0:
            return texts_tweets, langs_tweets, authors_tweets
        pbar = pyprind.ProgBar(len(list_accounts))
        for acc in list_accounts:
            # self.list_people holds user objects, dataframes hold plain screen names
            name = getattr(acc, 'screen_name', acc)
            twts = self.get_account_tweets(max_num_twts=max_num_twts, account_name=name)
            texts_tweets.extend(self._clean_text(tw.text) for tw in twts)
            langs_tweets.extend(tw.lang for tw in twts)
            authors_tweets.extend([name] * len(twts))
            pbar.update()
        return texts_tweets, langs_tweets, authors_tweets

    
    def save_tweets_from_followers(self, screen_name, country, node_name, city=None, 
                                   min_num_twts_per_acc=10, max_num_followers=None):
        """ Creates pandas dataframe with all tweets texts 
            from followers of a given account and corresponding language. 
            A dataframe with all followers info must have been previously computed 
            and saved in hdf5 format
            Args:
                * screen_name: string. Account whose followers were previously saved
                * country: string. First-level key of self.key_words
                * node_name: string. Node category used in the hdf5 path
                * city: string. If specified, followers are filtered with the regex of
                    that city; otherwise the 'all' regex of the country is used
                * min_num_twts_per_acc: integer. Followers with fewer tweets are discarded
                * max_num_followers: integer. If specified, maximum number of followers
                    whose tweets are downloaded
            Returns:
                * df_txts_langs: dataframe with columns 'texts', 'lang' and 'screen_name'.
                    It is also written to 'lang_data.h5' under
                    /country/node_name/screen_name/tls_followers
            Raises:
                * KeyError: if country or city is not in self.key_words
        """
        base_path = '/'.join(['',country, node_name, screen_name])
        path_load = base_path + '/followers'
        path_save = base_path + '/tls_followers'
        loc_regex = self.key_words[country][city if city else 'all']

        df = pd.read_hdf('lang_data.h5', path_load)
        # Build both conditions on the full frame and combine them, so the mask is
        # always aligned with the data it selects from
        has_enough_tweets = df['statuses_count'] >= min_num_twts_per_acc
        # na=False: followers without a location simply do not match
        is_resident = df['location'].str.contains(loc_regex, na=False)
        relevant_followers = df.loc[has_enough_tweets & is_resident, 'screen_name'].values

        # download the timelines of the followers selected above
        texts, langs, auth = self.get_tweets_from_accounts(max_num_accounts=max_num_followers,
                                                           list_accounts=relevant_followers)
        df_txts_langs= pd.DataFrame({'texts':texts, 'lang':langs, 'screen_name':auth})
        df_txts_langs.to_hdf('lang_data.h5', path_save)
        return df_txts_langs




In [389]:
mytxt = "–∫–∏–µ–≤"
re.findall(r"–ö–∏—ó–≤|–ö–∏–µ–≤", mytxt, )

['–∫–∏–µ–≤']

In [None]:
–ù–µ–π—Ä–æ–ø—Å–∏—Ö–æ–ª–æ–≥ –ö—Ä–∏—Å –§—Ä–∏—Ç –æ –∑–µ—Ä–∫–∞–ª—å–Ω—ã—Ö –Ω–µ–π—Ä–æ–Ω–∞—Ö, —ç–º–æ—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–º –∑–∞—Ä–∞–∂–µ–Ω–∏–∏ –∏ —Ä–∞—Å–æ–≤—ã—Ö –ø—Ä–µ–¥—Ä–∞—Å—Å—É–¥–∫–∞—Ö

### UKRAINE: data structure and relevant twitter accounts 

In [12]:
# Seed accounts for Ukraine, grouped by node category
Ukraine_nodes = {
    'cities': ['kiev', 'odessa', 'lviv', 'kharkov', 'dnipropetrovsk'],
    'city_sites': {
        'Mariupol': ['0629ComUa'],
        'kiev': ['kievtypical', 'kliniki_kiev', 'LISOD_clinic', 'avto_kiev', 'editbeauty'],
    },
    'news': [
        'HromadskeUA', 'tsnua', 'ukrpravda_news', 'lb_ua', 'Korrespondent',
        'Delo_ua', 'BBC_ua', 'LIGAnet', 'segodnya_life',
    ],
    'TV': ['5channel', 'EspresoTV', '24tvua', 'footballua_tv'],
    'starsystem': ['VeraBrezhneva', 's_vakarchuk', 'KAMEHCKUX'],
    'politics': [
        'poroshenko', 'Vitaliy_Klychko', 'Leshchenkos',
        'AvakovArsen', 'andriy_sadovyi', 'GennadyKernes',
    ],
}

In [14]:
# Location keywords: get_account_network joins them with '|' into one regex
# and keeps only the followers whose location matches it.
# NOTE(review): the non-ASCII literals look mis-encoded in this export - confirm
# against the original notebook before re-running.
key_words=['–£–∫—Ä–∞—ó–Ω–∞', 'Ukraine', '–£–∫—Ä–∞–∏–Ω–∞', '–ö–∏—ó–≤', '–ö–∏–µ–≤']
# Walk up to 5000 followers of HromadskeUA, filtered by the keywords above
HromadskeUA_followers = get_account_network('HromadskeUA', rel_type='followers', 
                                            max_num =5000, key_words=key_words)

In [7]:
# Target of this run: followers of one account inside one country/node
country, node_name, acc_name, rel_type = 'ukr', 'politics', 'poroshenko', 'followers'

# hdf5 key: /country/node_name/acc_name/rel_type
path_save = '/' + '/'.join((country, node_name, acc_name, rel_type))

# No location filter here (key_words=None): every retrieved follower is stored
followers = get_account_network(acc_name, rel_type=rel_type, max_num=5000, key_words=None)

# Keep the raw JSON of every user object, one dataframe row per follower
json_format = [follower._json for follower in followers]
df = pd.DataFrame(json_format)
df.to_hdf('lang_data.h5', path_save)

0%                          100%
[#                             ] | ETA: 00:01:50

Rate limit reached. Sleeping for: 895
fallen here

[###                           ] | ETA: 02:20:33


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 891
fallen here

[#####                         ] | ETA: 02:35:13


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 894
fallen here

[#######                       ] | ETA: 02:34:01


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[#########                     ] | ETA: 02:25:49


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[##########                    ] | ETA: 02:35:50


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[############                  ] | ETA: 02:21:12


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[##############                ] | ETA: 02:05:27


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 892
fallen here

[################              ] | ETA: 01:49:42


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[##################            ] | ETA: 01:34:25


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 891
fallen here

[###################           ] | ETA: 01:30:53


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[#####################         ] | ETA: 01:14:15


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[#######################       ] | ETA: 00:57:25


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 891
fallen here

[#########################     ] | ETA: 00:40:58


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 892
fallen here

[###########################   ] | ETA: 00:24:30


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 893
fallen here

[############################  ] | ETA: 00:16:50


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 892
fallen here

[##############################] | ETA: 00:00:00
Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04

Total time elapsed: 04:12:04
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['created_at', 'description', 'entities', 'id_str', 'lang', 'location', 'name', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_banner_url', 'profile_image_url', 'profile_image_url_https', 'profile_link_color', 'profile_sidebar_b


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)


In [8]:
df[['location','screen_name','lang']][df['location'].str.contains(r"(–õ—å–≤—ñ–≤|–õ—å–≤–æ–≤)")]

  if __name__ == '__main__':


Unnamed: 0,location,screen_name,lang
522,–£–∫—Ä–∞—ó–Ω–∞ –õ—å–≤—ñ–≤,juliaskab13,uk
1006,–õ—å–≤–æ–≤,novinska_cat,ru
1661,–õ—å–≤–æ–≤,HumenuykRoma,ru
2269,–õ—å–≤—ñ–≤,kaprikorn87,uk
2997,–£–∫—Ä–∞—ó–Ω–∞ –õ—å–≤—ñ–≤,Hollywell83,uk
3803,–º .–õ—å–≤—ñ–≤,M8tgs4YGZ391fNw,uk
4731,–õ—å–≤—ñ–≤,cherniak_gi,uk


In [9]:
df['lang'][df['location'].str.contains(r"(–õ—å–≤—ñ–≤|–õ—å–≤–æ–≤|Lviv|–Ü–≤–∞–Ω–æ-–§—Ä–∞–Ω–∫—ñ–≤—Å—å–∫)")].value_counts()

  if __name__ == '__main__':


uk    8
ru    3
en    1
Name: lang, dtype: int64

In [10]:
df['lang'][df['statuses_count'] >= 10][df['location'].str.contains(r"(Kiev|Kyiv|–ö–∏—ó–≤|–ö–∏–µ–≤)")].value_counts()

  if __name__ == '__main__':


ru    14
en     2
uk     1
Name: lang, dtype: int64

In [278]:
df[['lang','screen_name', 'followers_count']][df['statuses_count'] >= 200][df['location'].str.contains(r"(Kiev|Kyiv|–ö–∏—ó–≤|–ö–∏–µ–≤)")]

In [78]:
#HRMUA_flwrs = [f for idx, f in df_HRMUA.iterrows()]
#HRMUA_texts, HRMUA_langs = get_tweets_from_accounts(HRMUA_flwrs, max_num_followers=300)

In [66]:
df_txts_langs_HRMUA = pd.DataFrame({'texts':HRMUA_texts, 'lang':HRMUA_langs})

df_txts_langs_HRMUA.to_hdf('lang_data.h5', '/ukr_nodes/news/HromadskeUA/tls_followers')

In [None]:
df_txts = pd.read_hdf('lang_data.h5', '/ukr_nodes/news/HromadskeUA/tls_followers')

### CATALONIA NODES

In [392]:
# Seed accounts for Catalonia, grouped by node category
Catalonia_nodes = {
    'news': [
        'LaVanguardia', 'VilaWeb', 'diariARA',
        'elperiodico', 'elperiodico_cat', 'elpuntavui',
    ],
    'cities': [
        'bcn_ajuntament', 'paerialleida', 'girona_cat', 'TGNAjuntament',
        'AjBadalona', 'aj_vic', 'ajterrassa',
    ],
    'politics': [
        'KRLS', 'junqueras', 'AdaColau', 'miqueliceta',
        'InesArrimadas', 'Albiol_XG', 'raulromeva', 'ForcadellCarme',
    ],
}

In [87]:
def get_account_followers(country, node, acc_name):
    """ Download up to 5000 followers of an account and store them.
        Args:
            * country: string. First level of the hdf5 key
            * node: string. Second level of the hdf5 key
            * acc_name: string. Twitter account name
        Returns:
            * dataframe with one row per follower, built from the raw JSON of
              each user object. It is also written to 'lang_data.h5' under
              /country/node/acc_name/followers
    """
    hdf_key = '/' + '/'.join((country, node, acc_name, 'followers'))
    network = get_account_network(acc_name, rel_type='followers', max_num=5000)
    followers_df = pd.DataFrame([member._json for member in network])
    followers_df.to_hdf('lang_data.h5', hdf_key)
    return followers_df
    

In [88]:
df = get_account_followers(country ='cat', node='cities', acc_name='AjBadalona')

0%                          100%
[#                             ] | ETA: 00:03:21

Rate limit reached. Sleeping for: 889
fallen here

[###                           ] | ETA: 02:20:09


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 889
fallen here

[#####                         ] | ETA: 02:38:29


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 888
fallen here

[#######                       ] | ETA: 02:37:27


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 886
fallen here

[#########                     ] | ETA: 05:34:47


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 887
fallen here

[##########                    ] | ETA: 05:18:59


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 887
fallen here

[############                  ] | ETA: 04:23:35


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 886
fallen here

[##############                ] | ETA: 03:38:30


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 884
fallen here

[################              ] | ETA: 03:01:26


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 885
fallen here

[##################            ] | ETA: 02:29:02


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 888
fallen here

[###################           ] | ETA: 02:18:18


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 888
fallen here

[#####################         ] | ETA: 01:49:20


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 889
fallen here

[#######################       ] | ETA: 01:38:41


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 890
fallen here

[#########################     ] | ETA: 01:08:07


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 884
fallen here

[###########################   ] | ETA: 00:39:35


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 887
fallen here

[############################  ] | ETA: 00:26:34


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 887
fallen here

[##############################] | ETA: 00:00:00
Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31

Total time elapsed: 06:28:31
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['created_at', 'description', 'entities', 'id_str', 'lang', 'location', 'name', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_banner_url', 'profile_image_url', 'profile_image_url_https', 'profile_link_color', 'profile_sidebar_b


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)


In [89]:
df['lang'].value_counts()

es       3586
ca       1102
en        271
fr          9
it          7
en-gb       5
pt          4
de          3
eu          2
ar          2
ru          2
gl          1
pl          1
en-GB       1
zh-cn       1
ja          1
tr          1
ro          1
Name: lang, dtype: int64

In [90]:
regex = r"(Badal)"
# Combine both conditions into ONE mask built on the full frame: chaining
# df[cond1][cond2] applies a full-length mask to an already-filtered frame.
# na=False makes followers without a location count as non-matching.
df_Bad = df[(df['statuses_count'] >= 10) & df['location'].str.contains(regex, na=False)]
df_Bad['lang'].value_counts()

  from ipykernel import kernelapp as app


es    688
ca    281
en     51
fr      1
it      1
Name: lang, dtype: int64

### SAVE TWEETS FROM FOLLOWERS

In [96]:
df_txts_langs = save_tweets_from_followers('AjBadalona' , 'cat', 'cities', city='Badalona')

0%                          100%
[##                            ] | ETA: 00:13:58

Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[####                          ] | ETA: 00:15:44


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[#####                         ] | ETA: 00:17:57


Twitter error response: status code = 401

[######                        ] | ETA: 00:17:15


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[#######                       ] | ETA: 00:17:27


Twitter error response: status code = 401

[########                      ] | ETA: 00:16:15


Twitter error response: status code = 401
Twitter error response: status code = 401

[#########                     ] | ETA: 00:15:38


Twitter error response: status code = 401

[##########                    ] | ETA: 00:14:31


Twitter error response: status code = 401

[############                  ] | ETA: 00:12:31


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[#############                 ] | ETA: 00:12:02


Twitter error response: status code = 401
Twitter error response: status code = 401

[##############                ] | ETA: 00:11:22


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[###############               ] | ETA: 00:10:59


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[################              ] | ETA: 00:10:23


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[#################             ] | ETA: 00:09:49


Twitter error response: status code = 401
Twitter error response: status code = 401

[##################            ] | ETA: 00:09:02


Twitter error response: status code = 401

[###################           ] | ETA: 00:08:09


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[####################          ] | ETA: 00:07:35


Twitter error response: status code = 401

[#####################         ] | ETA: 00:06:44


Twitter error response: status code = 401
Twitter error response: status code = 401
Twitter error response: status code = 401

[######################        ] | ETA: 00:06:01


Twitter error response: status code = 401

[#######################       ] | ETA: 00:05:13


Twitter error response: status code = 401
Twitter error response: status code = 401

[##########################    ] | ETA: 00:02:52


Twitter error response: status code = 401
Twitter error response: status code = 401

[###########################   ] | ETA: 00:02:09


Twitter error response: status code = 401

[############################  ] | ETA: 00:01:25


Twitter error response: status code = 401
Twitter error response: status code = 401

[############################# ] | ETA: 00:00:42


Twitter error response: status code = 401
Twitter error response: status code = 401

[##############################] | ETA: 00:00:00





Total time elapsed: 00:21:29


In [97]:
df_txts_langs['lang'].value_counts()

es     4702
und    4427
en      554
pt       94
fr       40
it       20
in       15
eu       10
nl       10
ht        8
et        6
tl        6
ro        5
hu        5
lt        5
tr        4
cs        4
de        3
cy        3
no        3
da        3
pl        2
sv        2
fi        2
vi        2
hi        1
is        1
ja        1
Name: lang, dtype: int64

### LANG DETECTION

In [4]:
from langdetect import detect_langs

In [5]:
import langdetect

In [171]:
df_txts_langs.keys()

Index(['lang', 'screen_name', 'texts'], dtype='object')

In [44]:
df_txts_langs[['texts', 'lang', 'screen_name']][df_txts_langs['lang'] == 'und'].iloc[2]

NameError: name 'df_txts_langs' is not defined

In [272]:
base_path = '/'.join(['',country, node_name, screen_name])
path_load = base_path + '/followers'
path_save = base_path + '/tls_followers'

In [280]:
df = pd.read_hdf('lang_data.h5', path_save)

In [230]:
# Keep only followers that tweet enough and whose profile location matches
# the pattern stored for the current country.
min_num_twts = 5
# Build both conditions against the full frame and combine them once.
# Chaining two boolean selections would index an already-filtered Series
# with a full-length mask and depend on implicit index alignment.
is_active = df['statuses_count'] >= min_num_twts
# na=False: a missing location counts as "no match" instead of putting NaN
# into the mask.
is_local = df['location'].str.contains(key_words[country], na=False)
relevant_followers = df.loc[is_active & is_local, 'screen_name'].values



In [122]:
#df_txts_langs['lang'].value_counts()
#langs_detected = [detect(txt) for txt in df_txts_langs['texts']]

# Detect the language of every text; None marks texts that could not be
# classified, so the result stays aligned with df_txts_langs['texts'].
from langdetect.lang_detect_exception import LangDetectException

langs_detected = []
for txt in df_txts_langs['texts']:
    try:
        langs_detected.append(detect(txt))
    except (LangDetectException, TypeError):
        # LangDetectException: the detector found nothing usable in the text.
        # TypeError: presumably a non-string entry (e.g. NaN) - TODO confirm.
        # Anything else (including Ctrl-C) is allowed to propagate.
        langs_detected.append(None)

In [55]:
# Scratch experiment: check that hierarchical keys ('city/topic') work in an
# HDF5 store and that append() adds rows to an existing node.
df_try = pd.DataFrame({'a':['aaaa','bbbfdde',1],'b':[23,44,56]})
df_try2 = pd.DataFrame({'a':['xxxx','zzzz'],'b':[3233,43214]})

# Each store is opened in a `with` block so the file handle is released even
# if a write raises.
with pd.HDFStore('try_hyerar.h5') as store:
    store.append('city/topic', df_try)

pd.read_hdf('try_hyerar.h5', 'city/topic')

with pd.HDFStore('try_hyerar.h5','a') as store:
    store.append('city/topic', df_try2)

pd.read_hdf('try_hyerar.h5', 'city/topic')

with pd.HDFStore('try_hyerar.h5','a') as store:
    store.put('city/followers',df)

### EXPLORE AN ACCOUNT

In [200]:
api = tweepy.API(auth)
user_info = api.get_user('onlyforulonely ')  #ArnauAndreu
user_info._json['location']

'Ostroh'

In [20]:
#get timeline
tl = get_account_tweets('onlyforulonely',max_num_twts=200)

In [222]:
my_str = "@dijdoer frefe tt http://ewdowide.ewd.ewde ewdwed @jeiwo @ pgvila http://ewdowide.ewd.ewde oo"
re.sub(r"(@\s?[^\s]+|https?://?[^\s]+)", "", my_str)

' frefe tt  ewdwed    oo'

In [21]:
[(elem._json['lang'],elem._json['text']) for elem in tl]

[('uk', '@anastasiiareshe –ü–æ–¥—è–∫—É–≤–∞–ª–∞üòá'),
 ('uk',
  '@1705Belka —Ç–æ–¥—ñ —á–µ–∫–∞—Ç–∏–º—É —Ç–≤–æ—ó—Ö —Å–º—Å –∑—ñ —Å–ª–æ–≤–∞–º–∏ "–º–µ–Ω–µ –Ω–µ –∑–∞–±—Ä–∞–≤ —Ç–∞—Ç–∫–æ, –π–¥–µ–º–æ–æ"üòÖ'),
 ('uk',
  '@1705Belka —Ç–æ–¥—ñ –∑–º–æ–∂–µ–º–æ —Ä–∞–∑–æ–º –ø—ñ—Ç–∏ –Ω–∞ –≤–∏–ø—É—Å–∫–Ω–∏–π 11-–∏—Ö –∫–ª–∞—Å—ñ–≤, –∞ —Ç–æ —É –º–µ–Ω–µ –Ω–µ–º–∞—î –∫–æ–º–ø–∞–Ω—ñ—ó)\n—ñ –ø–ª–∞—Ç—Ç—è—á–∫–∞ –ø–æ–¥–∏–≤–∏–º–æ—Å—å, —ñ –ø–æ–≥–æ–≤–æ—Ä–∏–º–æ)'),
 ('uk', '@1705Belka —É –°–± –±—É–¥–µ—à —â–µ —Ç—É—Ç?)'),
 ('uk',
  '@1705Belka –≤ —Å–µ–µ–µ–Ω—Å—ñ –æ–¥–Ω–∞?\n\n—Å–∫–∞–∂–∏ –º–µ–Ω—ñ, –∫–æ–ª–∏ —Ç–∏ –≤—ñ–ª—å–Ω–∞, —ñ –º–∏ –∑—É—Å—Ç—Ä—ñ–Ω–µ–º–æ—Å—åüòä'),
 ('uk',
  '@1705Belka –õ—é, —è —Ç–µ–±–µ –±–∞—á–∏–ª–∞ –≤ –º—ñ—Å—Ç—ñ –æ —Å—å–æ–º—ñ–π, –∫—Ä–∏—á–∞–ª–∞ —Ç–æ–±—ñ "–õ—é—é, –∞–ª–æ", –∞ —Ç–∏ –≥–æ–≤–æ—Ä–∏–ª–∞ –ø–æ —Ç–µ–ª–µ—Ñ–æ–Ω—É, –∑—ñ—Ä–∫–æüò¶'),
 ('uk', '@sergeykovaliov —è–∫ –≤–∞—Ä—ñ–∞–Ω—ÇüòÖ'),
 ('uk', '–ü—Ä–∏–∫—Ä–æ, –∫–æ–ª–∏ –ª—é–¥–∏–Ω–∞ –∫–∏–¥–∞—î—Ç—å—Å—è —Å–ª–æ–≤–∞–º–∏ –Ω–∞ –≤—ñ—Ç–µ—Ä.'),
 ('uk', '@nastya_dubyna 

In [24]:
detect("–ù—É –Ω—ñ—á–æ–≥–æ-–Ω—ñ—á–æ–≥–æ, —Å–∫–æ—Ä–æ —ñ –º–æ—è —ñ–Ω—Å—Ç–∞ —Å—Ç–∞–Ω–µ –ø–æ–ø—É–ª—è—Ä–Ω–æ—é —ñ –≤—Å—ñ –±—É–¥—É—Ç—å —Ç–∞–∫—ñ –º–æ–ª –æ–≥–æ–æ –õ–∞–¥–∫–∞ —è–∫–∞ —Ç–∏ –∫–ª–∞—Å–Ω–∞, –∞ —è —Ç–∞–∫–∞ –æ–æ –¥—è–∫—É—é —Ü—å–æ–º-—Ü—å–æ–º.")

'uk'

In [12]:
user_info._json

{'contributors_enabled': False,
 'created_at': 'Sun Feb 15 18:20:38 +0000 2015',
 'default_profile': False,
 'default_profile_image': False,
 'description': '–∑–∞–Ω–∞–¥—Ç–æ —Ö–æ—Ä–æ—à–∞(–∑–∞–≥–∞–¥–∫–æ–≤–∞) –¥–ª—è —Ü—å–æ–≥–æ —Å–≤—ñ—Ç—É. —á–æ–º—É –± –≤–∞–º –Ω–µ –º–∏—Å–ª–∏—Ç–∏ –ø–æ–∑–∏—Ç–∏–≤–Ω–æ? –±—É—Ç–∏ –±–∞–∂–∞–Ω–æ—é, –∞–ª–µ –Ω–µ –±–∞–∂–∞—Ç–∏ –Ω—ñ–∫–æ–≥–æ. –º–∞–ª—é—é, —è–∂—Ö—É–¥–æ–∂–Ω–∏–∫. make art, be art‚òÄÔ∏è #–£–∫—Ä–¢–≤—ñ',
 'entities': {'description': {'urls': []},
  'url': {'urls': [{'display_url': 'instagram.com/lavoytko/',
     'expanded_url': 'http://www.instagram.com/lavoytko/',
     'indices': [0, 23],
     'url': 'https://t.co/aQVTV4rjd4'}]}},
 'favourites_count': 8970,
 'follow_request_sent': False,
 'followers_count': 742,
 'following': False,
 'friends_count': 407,
 'geo_enabled': True,
 'has_extended_profile': True,
 'id': 3039405867,
 'id_str': '3039405867',
 'is_translation_enabled': False,
 'is_translator': False,
 'lang': 'uk',
 'listed_count': 7,
 'location': 'Ostr

In [324]:
# Explore the accounts linked to 'ArnauAndreu' and a few of their timelines.
# get_account_network defaults to rel_type='followers', so despite its name
# `my_friends` holds followers of the account.
my_friends = get_account_network('ArnauAndreu')

# NOTE(review): `my_followers` is not assigned anywhere in this cell;
# presumably `my_friends` was meant - confirm before running.
my_df = pd.DataFrame(my_followers)

# Frequency of the `lang` attribute across the retrieved accounts.
Counter([friend.lang for friend in my_friends])

my_fr_txts, my_friends_lang = get_tweets_from_accounts(my_friends)

#Counter(my_friends_lang)



# my_fr_langs_detected=[]
# for i,txt in enumerate(my_fr_txts):
#     #print(i, txt)
#     try:
#         my_fr_langs_detected.append(detect(txt))
#     except:
#         continue

#Counter(my_fr_langs_detected)

# NOTE(review): other cells call get_account_tweets with `max_num_twts=`;
# check which keyword the current function signature accepts.
usr_tl = get_account_tweets(my_friends[44].screen_name, max_num=10)

# One row per tweet, one column per key of the tweet's raw JSON payload.
df_try=pd.DataFrame([twt._json for twt in usr_tl])
df_try.columns

my_tl = get_account_tweets('ArnauAndreu', max_num=10)

### MERGE DFs

In [218]:
#get hdf database keys
with pd.HDFStore('lang_data.h5','r') as f:
    my_keys = f.keys()

In [221]:
my_keys

['/ukr_nodes/news/BBC_ua/followers',
 '/ukr_nodes/news/BBC_ua/tls_followers',
 '/ukr_nodes/news/HromadskeUA/followers',
 '/ukr_nodes/news/HromadskeUA/tls_followers',
 '/ukr_nodes/news/LIGAnet/followers',
 '/ukr_nodes/news/LIGAnet/tls_followers',
 '/ukr_nodes/news/ukrpravda_news/followers',
 '/ukr/starsystem/s_vakarchuk/followers',
 '/ukr/starsystem/s_vakarchuk/tls_followers',
 '/ukr/politics/Vitaliy_Klychko/followers',
 '/ukr/politics/Vitaliy_Klychko/tls_followers',
 '/cat_nodes/news/LaVanguardia/followers',
 '/cat_nodes/news/LaVanguardia/tls_followers',
 '/cat_nodes/news/diariARA/followers',
 '/cat_nodes/news/diariARA/tls_followers',
 '/cat/news/elperiodico/followers',
 '/cat/news/elperiodico/tls_followers',
 '/cat/cities/TGNAjuntament/followers',
 '/cat/cities/TGNAjuntament/tls_followers',
 '/cat/cities/aj_vic/followers',
 '/cat/cities/aj_vic/tls_followers',
 '/cat/cities/ajterrassa/followers',
 '/cat/cities/ajterrassa/tls_followers',
 '/cat/cities/bcn_ajuntament/followers',
 '/cat/c

In [234]:
aa=pd.read_hdf('lang_data.h5', '/ukr_nodes/news/BBC_ua/followers')

In [235]:
aa['lang'].value_counts()

ru       2626
uk       1770
en        505
pl         30
ro         13
en-gb      11
hu          6
ar          6
tr          6
es          4
fr          3
ja          3
de          3
pt          3
sk          3
ko          2
bg          1
it          1
nl          1
zh-cn       1
el          1
vi          1
Name: lang, dtype: int64

In [45]:
# Load the follower-timeline tables of two news accounts from the HDF file.
file_path = 'lang_data.h5'
country = 'cat'
acc_names = ['diariARA', 'LaVanguardia']

# Node keys follow '/<country>_nodes/news/<account>/tls_followers'.
load_node1, load_node2 = (
    '/{}_nodes/news/{}/tls_followers'.format(country, name)
    for name in acc_names
)
df1, df2 = (pd.read_hdf(file_path, node) for node in (load_node1, load_node2))

In [46]:
df_merged = pd.merge(df1, df2, how='outer')

In [47]:
df_merged.shape, df1.shape, df2.shape

((44937, 2), (34461, 2), (12181, 2))

In [180]:
%matplotlib
from matplotlib import pyplot as plt


Using matplotlib backend: MacOSX


<matplotlib.legend.Legend at 0x12a432550>

### TWITTER RANDOM WALK

#### Work out a city's lingua franca from a random sample of inhabitants who are also Twitter users

1. Start from mayor, city hall account, or any other relevant account based in the city.

2. Get a follower located in the city (or county, oblast) as the first node (check that this follower
   has enough tweets and followers).

3. Get a follower of the node obtained in step 2.

4. Repeat step 3 with each new node until the maximum number of nodes is reached.

In [350]:
#use key words ['Kiev','Kyiv',]
# r"(Kiev|Kyiv|–ö–∏—ó–≤|–ö–∏–µ–≤)"
account_name = 'Leshchenkos'
key_words = ['–£–∫—Ä–∞—ó–Ω–∞', 'Ukraine', '–£–∫—Ä–∞–∏–Ω–∞','Kiev' ,'Kyiv' ,'–ö–∏—ó–≤' ,'–ö–∏–µ–≤']
acc_ntw = get_account_network(account_name, rel_type='followers', 
                              max_num =10000, key_words=key_words, min_num_tweets=10)

0%                          100%
[#########                     ] | ETA: 00:01:04

Rate limit reached. Sleeping for: 874
fallen here

[##################            ] | ETA: 00:11:04


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 868
fallen here

[###########################   ] | ETA: 00:03:34


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)
Rate limit reached. Sleeping for: 868
fallen here

[##############################] | ETA: 00:00:00


Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)



Total time elapsed: 00:47:23

Total time elapsed: 00:47:23

Total time elapsed: 00:47:23

Total time elapsed: 00:47:23


In [374]:
# Select accounts whose location matches the Kyiv pattern, that have more than
# 50 statuses and whose `lang` is 'ru' or 'uk'; keep a tuple of profile fields.
# NOTE(review): the Cyrillic alternatives in the pattern look mis-encoded in
# this copy of the notebook - confirm the literal against the original file.
kyiv_pattern = re.compile(r"Kiev|Kyiv|–ö–∏—ó|–ö–∏–µ–≤")
Lesch_follws = [
    (acc.statuses_count, acc.lang, acc.screen_name,
     acc.followers_count, acc.friends_count, acc.location)
    for acc in acc_ntw
    # Guard against a None location, which would make the regex call raise.
    if kyiv_pattern.search(acc.location or '')
    and acc.statuses_count > 50
    and acc.lang in ('ru', 'uk')
]

In [391]:
#Lesch_follws

In [370]:
# Transpose the follower tuples into one sequence per attribute and key each
# sequence by its column name.
col_keys = ['statuses_count', "lang", 'screen_name', 'followers_count', 'friends_count', 'location']
bb = list(zip(*Lesch_follws))
d_bb = dict(zip(col_keys, bb))

In [390]:
#pd.DataFrame(d_bb).sort_values(by="statuses_count", ascending=False)

In [None]:
class RandomWalkCityTweets:
    """Placeholder for the Twitter random-walk sampler; no behavior yet."""

In [342]:
Kl_follwrs = pd.read_hdf('lang_data.h5', '/ukr/politics/Vitaliy_Klychko/followers')

In [346]:
# Followers with more than 100 statuses, more than 50 followers and a
# location matching the Kyiv pattern, sorted by follower count (largest first).
# NOTE(review): the Cyrillic alternatives in the pattern look mis-encoded in
# this copy of the notebook - confirm the literal against the original file.
is_relevant = (
    (Kl_follwrs['statuses_count'] > 100)
    & (Kl_follwrs['followers_count'] > 50)
    # na=False: rows without a location are treated as non-matching instead
    # of injecting NaN into the boolean mask.
    & Kl_follwrs['location'].str.contains(r"Kiev|Kyiv|–ö–∏—ó–≤|–ö–∏–µ–≤", na=False)
)
Kl_follwrs.loc[is_relevant, ['screen_name', 'followers_count']].sort_values(
    by='followers_count', ascending=False)

Unnamed: 0,screen_name,followers_count
55,AnPrikhodko,209429
756,interfaxua,88046
2223,AmbHagstrom,3129
139,YouScan,2809
255,Starckad,1906
2523,serhiykiral,1009
1841,Tardigrrrada,1001
720,AlexKhrebet,934
2131,bohomol1841,710
1701,PBest,622


In [345]:
ww= get_account_tweets('AnastasijaKaram', max_num_twts=20)

KeyboardInterrupt: 

In [70]:
cit_obl = ['–ë—ñ–ª–∞ –¶–µ—Ä–∫–≤–∞', '–ë—Ä–æ–≤–∞—Ä–∏', '–ë–æ—Ä–∏—Å–ø—ñ–ª—å', '–§–∞—Å—Ç—ñ–≤', '–Ü—Ä–ø—ñ–Ω—å',
        '–í–∞—Å–∏–ª—å–∫—ñ–≤', '–ë–æ—è—Ä–∫–∞', '–í–∏—à–Ω–µ–≤–µ', '–û–±—É—Ö—ñ–≤',
        '–ü–µ—Ä–µ—è—Å–ª–∞–≤-–•–º–µ–ª—å–Ω–∏—Ü—å–∫–∏–π', '–ë—É—á–∞', '–°–ª–∞–≤—É—Ç–∏—á', '–Ø–≥–æ—Ç–∏–Ω', '–í–∏—à–≥–æ—Ä–æ–¥',
        '–°–∫–≤–∏—Ä–∞', '–ë–µ—Ä–µ–∑–∞–Ω—å', '–ë–æ–≥—É—Å–ª–∞–≤', '–¢–µ—Ç—ñ—ó–≤', '–£–∫—Ä–∞—ó–Ω–∫–∞', '–ö–∞–≥–∞—Ä–ª–∏–∫',
        '–¢–∞—Ä–∞—â–∞', '–ú–∏—Ä–æ–Ω—ñ–≤–∫–∞', '–£–∑–∏–Ω', '–†–∂–∏—â—ñ–≤', '–ß–æ—Ä–Ω–æ–±–∏–ª—å', "–ü—Ä–∏–ø'—è—Ç—å"]

In [71]:
acc_ntw = get_account_network('huyova_bc', rel_type='followers', max_num =200, key_words=None)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:06


In [72]:
# Tabulate a handful of profile fields for every retrieved account.
profile_fields = ['followers_count', 'statuses_count', 'screen_name', 'location', 'lang']
list_data = [
    tuple(acc._json[field] for field in profile_fields)
    for acc in acc_ntw
]

df = pd.DataFrame(list_data, columns=profile_fields)

In [79]:
df_sorted = df.sort_values(by='followers_count', ascending=False)
df_sorted['location'].values

array(['Worldwide', '–ú–æ—Å–∫–≤–∞, –†–æ—Å—Å–∏—è', 'Kyrzbekistan (–ö—ã—Ä–∑–±–µ–∫–∏—Å—Ç–∞–Ω)',
       'Ukraine ', '–ö–∏–µ–≤', '–ö—Ä–∞–º–∞—Ç–æ—Ä—Å–∫, –£–∫—Ä–∞–∏–Ω–∞', '–õ—å–≤—ñ–≤', '–£–∫—Ä–∞–∏–Ω–∞', '',
       'Kyiv', '', '–£–∫—Ä–∞—ó–Ω–∞', '', '–£–∫—Ä–∞—ó–Ω–∞', 'Zhytomyr', '–£–∫—Ä–∞–∏–Ω–∞',
       'SPACE', '', '', '–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥, –†–æ—Å—Å–∏—è', '–£–∫—Ä–∞–∏–Ω–∞', '', '–ú–∏–Ω—Å–∫',
       '–¶–∞—Ä—Å—Ç–≤–∏–µ –ù–µ–±–µ—Å–Ω–æ–µ', '–£–∫—Ä–∞–∏–Ω–∞ -–î–æ–Ω–µ—Ü–∫! ', '–ñ–∏—Ç–æ–º–∏—Ä', '',
       '–î–Ω–µ–ø—Ä–æ–ø–µ—Ç—Ä–æ–≤—Å–∫', '–ú–û–°–ö–í–ê –ì–†–û–ë', 'A—Ñ–∏–Ω—ã', '', '', '',
       'Ukraine, Dnepr', '', '–Ü–≤–∞–Ω–æ-–§—Ä–∞–Ω–∫—ñ–≤—Å—å–∫0954573994', '',
       '–õ—å–≤—ñ–≤, –£–∫—Ä–∞—ó–Ω–∞ ', '', '', '', 'Ukraina', '–≥. –ù–∏–∫–æ–ª–∞–µ–≤', '–£–∫—Ä–∞–∏–Ω–∞',
       '', '', '', '', '–£–∫—Ä–∞–∏–Ω–∞', '', 'Miami, FL', 'Detroit, MI',
       'United States', '–£–∫—Ä–∞—ó–Ω–∞', '', '', '–£–∫—Ä–∞–∏–Ω–∞', 'EU, Latvia', '–ö–∏–µ–≤',
       '', 'Oleshky, Ukraine', '–£–∫—Ä–∞–∏–Ω–∞', '–£–∫—Ä–∞–∏–Ω–∞, –

In [47]:
acc_ntw[0]._json

{'blocked_by': False,
 'blocking': False,
 'contributors_enabled': False,
 'created_at': 'Sat Jul 14 14:47:34 +0000 2012',
 'default_profile': False,
 'default_profile_image': False,
 'description': '',
 'entities': {'description': {'urls': []}},
 'favourites_count': 40,
 'follow_request_sent': False,
 'followers_count': 1,
 'following': False,
 'friends_count': 24,
 'geo_enabled': True,
 'has_extended_profile': False,
 'id': 635511528,
 'id_str': '635511528',
 'is_translation_enabled': False,
 'is_translator': False,
 'lang': 'uk',
 'listed_count': 0,
 'live_following': False,
 'location': '–ß–µ—Ä–Ω—ñ–≥—ñ–≤',
 'muting': False,
 'name': '–áZHAKüíôüíõ',
 'notifications': False,
 'profile_background_color': '000000',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme14/bg.gif',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme14/bg.gif',
 'profile_background_tile': False,
 'profile_banner_url': 'https://pbs.twimg.com/profile_banne

### GET NODES IN HDF FILE

In [64]:
# Open read-only: listing the keys must not create or modify the data file
# (the default mode is append).
with pd.HDFStore('lang_data.h5', 'r') as f:
    list_nodes = f.keys()

In [65]:
list_nodes

['/ukr_nodes/news/BBC_ua/followers',
 '/ukr_nodes/news/BBC_ua/tls_followers',
 '/ukr_nodes/news/HromadskeUA/followers',
 '/ukr_nodes/news/HromadskeUA/tls_followers',
 '/ukr_nodes/news/LIGAnet/followers',
 '/ukr_nodes/news/LIGAnet/tls_followers',
 '/ukr_nodes/news/ukrpravda_news/followers',
 '/ukr/starsystem/s_vakarchuk/followers',
 '/ukr/starsystem/s_vakarchuk/tls_followers',
 '/ukr/politics/Vitaliy_Klychko/followers',
 '/ukr/politics/Vitaliy_Klychko/tls_followers',
 '/ukr/politics/andriy_sadovyi/followers',
 '/ukr/politics/poroshenko/followers',
 '/cat_nodes/news/LaVanguardia/followers',
 '/cat_nodes/news/LaVanguardia/tls_followers',
 '/cat_nodes/news/diariARA/followers',
 '/cat_nodes/news/diariARA/tls_followers',
 '/cat/news/elperiodico/followers',
 '/cat/news/elperiodico/tls_followers',
 '/cat/cities/AjBadalona/followers',
 '/cat/cities/AjBadalona/tls_followers',
 '/cat/cities/TGNAjuntament/followers',
 '/cat/cities/TGNAjuntament/tls_followers',
 '/cat/cities/aj_vic/followers',
 '/c

### IMPROVING LANGUAGE DETECTION

How to filter out URLs and account mentions, which are not relevant for language detection

In [271]:
my_str = "@dijdoer Ukraine is a complex country http://ewdowide.ewd.ewde say @jeiwo @ pgvila http://ewdowide.ewd.ewde I agree"
re.sub(r"(@\s?[^\s]+|https?://?[^\s]+)", "", my_str)

' Ukraine is a complex country  say    I agree'

In [100]:
aa = pd.read_hdf('lang_data.h5', '/ukr_nodes/news/BBC_ua/tls_followers')

In [226]:
follws_BBC = pd.read_hdf('lang_data.h5', '/ukr_nodes/news/BBC_ua/followers')

In [236]:
follws_BBC['location'][follws_BBC['location'].str.contains(r"–£–∫—Ä–∞—ó–Ω–∞|Ukraine|–£–∫—Ä–∞–∏–Ω–∞|–ö–∏—ó–≤|–ö–∏–µ–≤")].value_counts()

–£–∫—Ä–∞–∏–Ω–∞                          318
–£–∫—Ä–∞—ó–Ω–∞                          255
Ukraine                           37
–ö–∏–µ–≤                              36
–ö–∏—ó–≤                              10
–£–∫—Ä–∞—ó–Ω–∞                            7
–£–∫—Ä–∞–∏–Ω–∞, –ö–∏–µ–≤                      4
Kyiv, Ukraine                      4
–ö–∏–µ–≤                               4
–£–∫—Ä–∞–∏–Ω–∞                            3
Ukraine, Kiev                      3
–ö–∏—ó–≤, –£–∫—Ä–∞—ó–Ω–∞                      3
–õ—å–≤—ñ–≤ –£–∫—Ä–∞—ó–Ω–∞                      3
–õ—å–≤—ñ–≤,–£–∫—Ä–∞—ó–Ω–∞                      2
–¢–µ—Ä–Ω–æ–ø—ñ–ª—å , –£–∫—Ä–∞—ó–Ω–∞                2
–û–¥–µ—Å—Å–∞,–£–∫—Ä–∞–∏–Ω–∞                     2
Kiev, Ukraine                      2
Ukraine, Lviv                      2
–£–∫—Ä–∞–∏–Ω–∞ –•–µ—Ä—Å–æ–Ω                     2
–£–∫—Ä–∞—ó–Ω–∞ –ö–∏—ó–≤                       2
–£–∫—Ä–∞—ó–Ω–∞,–ö–∞–ª—É—à                      2
Kyiv, Ukraine                      2
–£–∫—Ä–∞–∏–Ω–∞, –°—É–º—ã           

In [237]:
relevant_data = follws_BBC[follws_BBC['location'].str.contains(r"–£–∫—Ä–∞—ó–Ω–∞|Ukraine|–£–∫—Ä–∞–∏–Ω–∞|–ö–∏—ó–≤|–ö–∏–µ–≤")]

In [272]:
# Remove @mentions and URLs from the first 1000 texts before language detection.
noise_pattern = re.compile(r"(@\s?[^\s]+|https?://?[^\s]+)")
clean_txts = [noise_pattern.sub("", txt) for txt in aa['texts'].values[:1000]]

In [273]:
tup_lang_txt = [(txt,detect(txt)) for txt in clean_txts if len(txt) > 20]

In [275]:
tup_lang_txt[:100]

[('RT  –ï–∫—Å-–≥–æ–ª–æ–≤–∞ —Ä–∞–π—Ä–∞–¥–∏ –Ω–∞ –õ—É–≥–∞–Ω—â–∏–Ω—ñ –æ—Ç—Ä–∏–º–∞–≤ 4 —Ä–æ–∫–∏ –∑–∞ "—Ä–µ—Ñ–µ—Ä–µ–Ω–¥—É–º –õ–ù–†"  ',
  'bg'),
 ('RT  –ü—Ä–æ—Å—Ç–æ –≤—Å—Ç–∞–≤–∏–ª –Ω–µ –≤ —Ç—É —Å–∫—Ä–µ–ø—É ', 'ru'),
 ('RT  –ü–ª–∞–º–µ–Ω–Ω—ã–µ –ø–∞—Ç—Ä–∏–æ—Ç—ã. –∫–æ—Ç–æ—Ä—ã–µ –≥–æ–≤–æ—Ä—è—Ç, —á—Ç–æ –µ–∑–¥—è—Ç –Ω–∞ —Ä–æ—Å–¢–í –∑–∞—â–∏—â–∞—Ç—å –£–∫—Ä–∞–∏–Ω—É, –ø–æ—Å–ª–µ —ç—Ñ–∏—Ä–∞ –º–∏–ª–æ –±—É—Ö–∞—é—Ç —Å —Ç–µ–º–∏, –∫—Ç–æ —Ñ–∏–Ω–∞–Ω—Å–∏—Ä—É–µ—Ç‚Ä¶',
  'ru'),
 ('RT  –¢—Ä–µ–π–ª–µ—Ä —Ä—É—Å—Å–∫–æ–≥–æ –º–∏—Ä–∞ ', 'ru'),
 ('RT  –ü–∏—à—É—Ç, —á—Ç–æ –≤ –ê–ª–µ–ø–ø–æ —Å–±–∏–ª–∏ —Å–≤–∏–Ω–æ—Å–æ–±–∞—á–∏–π —Å–∞–º–æ–ª–µ—Ç.\n–î–æ–±—Ä–æ–µ —É—Ç—Ä–æ, –•—É–Ω—Ç–∞! ',
  'ru'),
 ('RT    –ü–æ–∫–∏ –∫—Ä–∞—â—ñ —Å–∏–Ω–∏ –£–∫—Ä–∞—ó–Ω–∏ –≥–∏–Ω—É—Ç—å –∑–∞—Ö–∏—â–∞—é—á–∏ –∫—Ä–∞—ó–Ω—É –ü–ê–¶–Æ–ö–ò —ó–¥—É—Ç—å –Ω–∞ –∑–∞—Ä–æ–±—ñ—Ç–∫–∏ –¥–æ –º–æ—Å–∫–æ–≤—ñ—ó —Ç–∞ –Ω–∞ –≤—ñ–¥–ø–æ—á–∏‚Ä¶',
  'uk'),
 ('RT  –í –•–∞–±–∞—Ä–æ–≤—Å–∫–µ –≤—Å—Ç—Ä–µ—Ç–∏–ª–∏—Å—å –¥–≤–∞ –º–∏—Ç–∏–Ω–≥–∞: –ù–æ–¥–æ–≤—Ü—ã –≤—ã—Å—Ç—Ä–æ–∏–ª–∏—Å—å –≤ –∞–≤—Ç–æ–∫–æ–ª–æ–Ω–Ω—

In [174]:
ru_txts = [x[0] for x in tup_lang_txt if x[1] == 'ru']

In [175]:
ukr_txts = [x[0] for x in tup_lang_txt if x[1] == 'uk']

In [185]:
len(ru_txts), len(ukr_txts)

(473, 205)

In [176]:
stats_ru =[Counter([detect(x) for _ in range(20)]) for x in ru_txts]

In [177]:
stats_uk =[Counter([detect(x) for _ in range(20)]) for x in ukr_txts]

In [178]:
ru_uncertain = [x for x,y in zip(ru_txts, stats_ru) if y['ru'] < 20]

In [179]:
ukr_uncertain = [x for x,y in zip(ukr_txts, stats_uk) if y['uk'] < 20]

In [182]:
len([(x, y['ru']) for x,y in zip(ru_txts, stats_ru) if y['ru'] < 20])

29

In [183]:
len([(x, y['uk']) for x,y in zip(ukr_txts, stats_uk) if y['uk'] < 20])

19

In [84]:
dummy_data = pd.DataFrame({'a':np.random.randint(1,100,10), 'b':np.random.randint(1,100,10)})

In [96]:
store2 = pd.HDFStore('dummy_ex.h5')

In [99]:
store2.get_node('/node_c/subfolder')

/node_c/subfolder (Group) ''
  children := ['block0_values' (Array), 'axis1' (Array), 'block0_items' (Array), 'axis0' (Array)]

In [86]:
store2['node_a'] = dummy_data['a']

In [87]:
store2['node_b'] = dummy_data['b']

In [88]:
store2.close()

In [97]:
with pd.HDFStore('dummy_ex.h5', ) as g:
    #g.get_node('node_c/subfolder')._f_rename('node_c/subfolder1')
    print(g.keys())
    

['/node_a', '/node_b', '/node_c/subfolder']


In [80]:
df_ = pd.DataFrame({'c':np.random.randint(1,100,10)})

In [90]:
df_.to_hdf('dummy_ex.h5', 'node_c/subfolder')

### CATALAN COMARQUES

In [25]:
import pandas as pd

url = "https://en.wikipedia.org/wiki/Municipalities_of_Catalonia"
tabs = pd.read_html(url)[0]

tabs.columns = tabs.iloc[0]

tabs = tabs.iloc[1:]

In [80]:
tabs.keys()

Index(['Municipality', 'Comarca', 'Province', 'Population (2014)[3]',
       'Area (km2)[3]', 'Density', 'No. of EMDs[4]'],
      dtype='object', name=0)

In [35]:
tabs[tabs['Comarca'] == 'Osona']['Municipality'].values

array(['Alpens', 'Baleny√†', 'El Brull', 'Calldetenes', 'Centelles',
       'Espinelves', "L'Esquirol", 'Folgueroles', 'Gurb', 'Llu√ß√†', 'Malla',
       'Manlleu', 'Les Masies de Roda', 'Les Masies de Voltreg√†',
       'Montesquiu', 'Muntanyola', 'Olost', 'Or√≠s', 'Orist√†', 'Perafita',
       'Prats de Llu√ßan√®s', 'Roda de Ter', 'Rupit i Pruit',
       'Sant Agust√≠ de Llu√ßan√®s', 'Sant Bartomeu del Grau',
       'Sant Boi de Llu√ßan√®s', 'Sant Hip√≤lit de Voltreg√†',
       'Sant Juli√† de Vilatorta', "Sant Mart√≠ d'Albars",
       'Sant Mart√≠ de Centelles', 'Sant Pere de Torell√≥',
       'Sant Quirze de Besora', "Sant Sadurn√≠ d'Osormort",
       'Sant Vicen√ß de Torell√≥', 'Santa Cec√≠lia de Voltreg√†',
       'Santa Eug√®nia de Berga', 'Santa Eul√†lia de Riuprimer',
       'Santa Maria de Besora', 'Seva', 'Sobremunt', 'Sora', 'Taradell',
       'Tav√®rnoles', 'Tavertet', 'Tona', 'Torell√≥', 'Vic', 'Vidr√†',
       'Viladrau', 'Vilanova de Sau'], dtype=object)

In [82]:
tabs_sorted_com = tabs.sort_values(by='Comarca').set_index(['Comarca'])

In [85]:
# Municipalities of the 'Osona' comarca. `.loc` replaces the removed `.ix`
# indexer; the lookup is label-based on the 'Comarca' index.
tabs_sorted_com.loc['Osona']['Municipality'].values

array(['Muntanyola', 'Santa Eug√®nia de Berga', "Sant Sadurn√≠ d'Osormort",
       'Vidr√†', 'Tona', 'Torell√≥', 'Rupit i Pruit', 'Roda de Ter',
       'Santa Cec√≠lia de Voltreg√†', 'Sant Vicen√ß de Torell√≥',
       'Santa Eul√†lia de Riuprimer', 'Calldetenes', 'Prats de Llu√ßan√®s',
       'Tavertet', 'Montesquiu', 'Sant Juli√† de Vilatorta', 'Or√≠s', 'Vic',
       'Olost', 'Sant Boi de Llu√ßan√®s', 'Perafita', "L'Esquirol",
       'Viladrau', 'Taradell', 'Les Masies de Roda', 'Sora',
       'Les Masies de Voltreg√†', 'Gurb', 'Seva', 'Sobremunt',
       'Folgueroles', 'Santa Maria de Besora', 'Orist√†', 'Vilanova de Sau',
       'El Brull', 'Espinelves', "Sant Mart√≠ d'Albars",
       'Sant Bartomeu del Grau', 'Tav√®rnoles', 'Manlleu', 'Malla',
       'Sant Pere de Torell√≥', 'Sant Quirze de Besora',
       'Sant Agust√≠ de Llu√ßan√®s', 'Alpens', 'Sant Mart√≠ de Centelles',
       'Sant Hip√≤lit de Voltreg√†', 'Centelles', 'Baleny√†', 'Llu√ß√†'], dtype=object)

### Get random tweets from a given coordinate box

In [415]:
# 
data_list, texts, langs, locs = [], [], [], []

class StdOutListener(StreamListener):
    """Stream listener that keeps received tweets in memory and on disk.

    Every tweet payload delivered by the stream is parsed and stored in
    ``data_list`` (parsed payload), ``texts`` (its 'text' value) and ``langs``
    (its 'lang' value). The raw message is also printed to stdout and appended
    as one line to 'newtweets.csv'.
    """

    def __init__(self, api=None):
        # Run the base-class initializer before adding our own state.
        super().__init__(api)
        self.data_list = []
        self.texts = []
        self.langs = []

    def on_data(self, data):
        """Handle one raw message received from the stream.

        Args:
            * data: string. Raw JSON message
        Returns:
            * True in every case
        """
        jd = json.loads(data)
        if 'text' not in jd:
            # Payload without a 'text' key: skip it so that data_list, texts
            # and langs stay aligned index by index.
            return True
        self.data_list.append(jd)
        self.texts.append(jd['text'])
        self.langs.append(jd.get('lang'))
        print(data)
        try:
            # The context manager closes the file even when a write fails.
            with open('newtweets.csv', 'a', encoding='utf8') as save_file:
                # Exactly one line per message, whatever line ending the raw
                # message carries.
                save_file.write(data.rstrip('\r\n') + '\n')
        except OSError:
            # Best effort: report the failed write, pause, keep the data that
            # was already stored in memory.
            print ('failed ondata')
            time.sleep(5)
        return True

    def on_error(self, status):
        """Print the status value reported by the stream."""
        print(status)

In [416]:
#coordinates
Lviv = [23.882904,49.763526,24.163055,49.921167]
Kiev = [30.449982,50.408518,30.639496,50.495958]
Yerevan = [44.329834,40.078071,44.681396,40.296287]
Brussel = [4.258575,50.788575,4.489288,50.913424]
Barcelona = [1.835403,41.375778,2.241898,41.586688]

In [10]:
# #Barcelona
# l_Barc = StdOutListener()
# #ASK FOR KEYWORD TO COLLECT DATA
# stream_Barc = Stream(auth, l_Barc)
# stream_Barc.filter(locations=Barcelona)

In [9]:
#Counter(l_Barc.langs)
# for data, lang in zip(l_Barc.data_list, l_Barc.langs):
#     print(data['user']['location'], lang)
for text in l_Barc.texts:
    # str entries are printed as-is; anything else is decoded from UTF-8 first.
    print(text if isinstance(text, str) else text.decode('utf-8'))

In [10]:
# for ee in l_Barc.data_list:
#     print(ee['place']['id'], ee['place']['place_type'])

In [11]:
# #Brussel
# l_Bru = StdOutListener()
# #ASK FOR KEYWORD TO COLLECT DATA
# stream_Bru = Stream(auth, l_Bru)
# stream_Bru.filter(locations=Brussel)

In [12]:
# #Counter(l_Bru.langs)
# for data, lang in zip(l_Bru.data_list, l_Bru.langs):
#     print(data['user']['location'], lang)

In [13]:
# #LVIV
# l_Lv = StdOutListener()
# #ASK FOR KEYWORD TO COLLECT DATA
# stream_Lv = Stream(auth, l_Lv)
# stream_Lv.filter(locations=Lviv)

In [14]:
# #YEREVAN
# l_Yer = StdOutListener()
# #ASK FOR KEYWORD TO COLLECT DATA
# stream_Yer = Stream(auth, l_Yer)
# stream_Yer.filter(locations=Yerevan)

In [15]:
# for data, lang in zip(l_Yer.data_list, l_Yer.langs):
#     print(data['user']['location'], lang)

In [16]:
# for text in l_Yer.texts:
#     if type(text) == str:
#         print(text)
#     else:
#         print(text.decode('utf-8'))

In [17]:
# for data, lang in zip(data_list, langs):
#     print(data['user']['location'], lang)

In [18]:
#KIEV
l_Kiev = StdOutListener()
#ASK FOR KEYWORD TO COLLECT DATA
stream_Kiev = Stream(auth, l_Kiev)
stream_Kiev.filter(locations=Kiev)

In [19]:
#Counter(l_Kiev.langs)
# for data, lang in zip(l_Kiev.data_list, l_Kiev.langs):
#     print(data['user']['location'], lang)
for text in l_Kiev.texts:
    # str entries are printed as-is; anything else is decoded from UTF-8 first.
    print(text if isinstance(text, str) else text.decode('utf-8'))

In [20]:
for data in l_Kiev.data_list:
    # NOTE(review): 'place' is presumably nullable in tweet payloads; fall back
    # to an empty dict so a missing place prints None instead of raising.
    place = data.get('place') or {}
    print(place.get('id'), place.get('place_type'),
          place.get('country'), data['user']['location'])

In [125]:
for text in texts:
    # str entries are printed as-is; anything else is decoded from UTF-8 first.
    print(text if isinstance(text, str) else text.decode('utf-8'))


### DYNAMO KIEV PLAYERS

In [27]:
# DYNAMO PLAYERS

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "http://www.worldfootball.net/teams/dinamo-kiev/2017/2/"
html = requests.get(url)
soup = BeautifulSoup(html.text, "lxml")

dynamo_players = pd.read_html(url, encoding='utf8')[1][2].dropna().values

dynamo_players

array(['Maksim Koval', 'Artur Rudko', 'Antunes', 'Mykyta Burda',
       'Tam√°s K√°d√°r', 'Evgen Khacheridi', 'Pavlo Lukyanchuk',
       'Bogdan Mykhaylychenko', 'Zurab Ochihava', 'Aleksandar Pantiƒá',
       'Oleksandr Tymchyk', 'Domagoj Vida', 'Vitaliy Buyalskiy',
       'Valeriy Fedorchuk', 'Denys Garmash', 'Nikita Korzun',
       'Mykyta Kravchenko', 'Mikola Morozyuk', 'Pavel Orekhovskiy',
       'Serhiy Rybalka', 'Volodymyr Shepeliev', 'Sergiy Sydorchuk',
       'Viktor Tsygankov', 'Artem Besedin', 'Derlis Gonz√°lez',
       'Roman Yaremchuk', 'Andriy Yarmolenko', 'Sergiy Rebrov'], dtype=object)

In [56]:
dyn_play_countr = pd.read_html(url, encoding='utf8')[1][[2,4]].dropna().values

dyn_play_countr

### PYMONGO

In [5]:
from pymongo import MongoClient

conn=MongoClient()

#define database
db = conn.citylangs

#define collection inside database
collection = db.bcn.ajuntam.followers

# function to add documents to collection 
def make_followers_collection(account_name, collection, max_num=100):
    """ Store the followers of a Twitter account as documents in a collection.
        Args:
            * account_name: string. Twitter account name whose followers are retrieved
            * collection: object exposing 'insert_one' (e.g. a pymongo collection).
                Receives the raw JSON of each follower as one document
            * max_num: integer. Maximum number of followers retrieved
    """
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    # count=200 is the largest page size, as in the other follower cursor of this file
    users = tweepy.Cursor(api.followers, screen_name=account_name, count=200).items(max_num)
    while True:
        try:
            user = next(users)
        except StopIteration:
            break
        except tweepy.TweepError as e:
            if 'Read timed out' in str(e):
                print('fall here')
                print(e)
                time.sleep(5)
            else:
                time.sleep(60*16)
            # Retry on the next pass of the loop. Fetching a user here would
            # discard it without storing it, and a StopIteration raised inside
            # this handler would escape the loop.
            continue
        collection.insert_one(user._json)

# Fill the Barcelona collection, then inspect what the MongoDB server holds.
# list_database_names() / list_collection_names() replace the deprecated
# database_names() / collection_names(), which PyMongo 4 no longer provides.
make_followers_collection('bcn_ajuntament',collection, max_num=100)

#check what databases are available
conn.list_database_names()

# available collections inside db
db.list_collection_names()

collection2 = db.kiev.ukrpravda

make_followers_collection('ukrpravda_news',collection2, max_num=300)

l = list(conn.citylangs.kiev.ukrpravda.find())

Counter([obj['lang'] for obj in l])


db.list_collection_names()

#db.categories.insert_one({ "_id": "ukr_pravda", "children": [] })
# db.categories.insert({ _id: "avto_kiev", children: [] })
# db.categories.insert({ _id: "kiev", children: ["ukr_pravda", "avto_kiev"] })

db['kiev'].insert_one({'avto_kiev':[],'vitklitschko':[]})

db.list_collection_names()

rr=list(db['bcn.ajuntam.followers'].find())

db['kiev'].find_one()

db['bcn.ajuntam.followers']

conn.list_database_names()

coll2 = db.countries