<a href="https://colab.research.google.com/github/nishantparmar24/Political-Risk/blob/master/DataResource.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis - Twitter 

### Install required packages

In [0]:
!pip install searchtweets unidecode newsapi-python

Collecting searchtweets
  Downloading https://files.pythonhosted.org/packages/51/d7/7dd296ba9469e046bad23583aaa0d36b18c7d6e4df9fd2acfb433d1c7ee2/searchtweets-1.7.4-py3-none-any.whl
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 245kB 12.7MB/s 
[?25hCollecting newsapi-python
  Downloading https://files.pythonhosted.org/packages/4a/49/cbb39846c53076a1cde2c73c5dbc3d39956ea7586c8dfc35d516d706a497/newsapi-python-0.2.5.tar.gz
Collecting tweet-parser
  Downloading https://files.pythonhosted.org/packages/4b/ea/cb82efb43dbcb115ea447313fbd287ff66349b34bdfdb4a78e25d3a42cb0/tweet_parser-1.13.2-py3-none-any.whl
Building wheels for collected packages: newsapi-python
  Building wheel for newsapi-python (setup.py) ... [?25l[?25hdone
  Created wheel f

### Import all required packages

In [0]:
from datetime import datetime as dt
from google.cloud import translate
from google.colab import drive
from html import unescape
from newsapi import NewsApiClient
from numpy import nan
from searchtweets import load_credentials, gen_rule_payload, collect_results
from time import sleep
from unidecode import unidecode
from urllib import parse
import json
import os
import pandas as pd
import re
import requests
import unicodedata

### Enable Google Drive in this notebook:

In [0]:
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
DRIVE_LOCATION = "/gdrive/My Drive/Political Risk Project/Test Data"
SEARCH_RESULTS = "/gdrive/My Drive/Firebolt/Political Risk"
bckp_loc = "/gdrive/My Drive/Political Risk Project/Test Data/Backup"

search_results = os.path.join(SEARCH_RESULTS, 'new_outputs.csv')
tw_output_directory = os.path.join(DRIVE_LOCATION, 'Twitter Results')
nh_output_directory = os.path.join(DRIVE_LOCATION, 'News Results')
Twitter_API_usage_file = os.path.join(tw_output_directory, 'api_usage.json')
News_API_usage_file = os.path.join(nh_output_directory, 'api_usage.json')

### Fetch Google Translate API credentials

In [0]:
translate_api_dir = "/gdrive/My Drive/Firebolt/API Keys"
translate_api_key = os.path.join(translate_api_dir, 'eiu-searchautomation.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = translate_api_key
try:
    translate_client = translate.Client()
    
    def translate_keyword(keyword, target_lang="en"):
        response = translate_client.translate(keyword,
                                              target_language=target_lang)
        if response:
            return response["translatedText"]
        else:
            return ""
    
    def detect_language(text_string):
        result = translate_client.detect_language(text_string)
        return result["language"]
    
except Exception as e:
    print("Exception occurred while instantiating \
    Google Cloud Translate: {}".format(e))


### Get text with the least length from a collection of text values

In [0]:
def phrase_min_length(text_sample):
    sample_df = pd.DataFrame(text_sample, columns=['text'])
    sample_df["length"] = [l for l in map(len, text_sample)]
    min_pos = sample_df["length"] == sample_df["length"].min()
    min_phrase = sample_df[min_pos]["text"] 

### Function to remove emojis from a text string

In [0]:
def remove_emojis(input_string):
    return_string = ""

    for character in input_string.strip():
        try:
            character.encode("ascii")
            return_string += character
        except UnicodeEncodeError:
            replaced = unidecode(str(character))
            if replaced not in ['', '[?]']:
                return_string += replaced
    return return_string.strip()

### Function to extract (and segregate) tweet-components: links, user-handles and hashtags

In [0]:
def extract_components(clean_string):
    token_list = re.split(r"\s+", clean_string.strip())
    url_ptrn = '^https?:\/\/.*[\r\n]*'
    handle_ptrn = '^\@.*'
    trend_ptrn = '^#.*'
    url_collection = list()
    handle_collection = list()
    trend_collection = list()
    tokens = list()
    for token in token_list:
        # print("token = {}".format(token))
        url_match = re.search(url_ptrn, token) or re.match(url_ptrn, token)
        if url_match:
            # print("\t\tURL Pattern matched")
            url_collection.append(
                token) if token not in url_collection else False
            ix = token_list.index(token)
            token_list[ix] = ''
            continue
        handle_match = re.search(
            handle_ptrn, token) or re.match(handle_ptrn, token)
        if handle_match:
            # print("\t\tHandle Pattern matched")
            handle_collection.append(
                token) if token not in handle_collection else False
            ix = token_list.index(token)
            token_list[ix] = ''
            continue
        trend_match = re.search(
            trend_ptrn, token) or re.match(trend_ptrn, token)
        if trend_match:
            # print("\t\tTrend Pattern matched")
            trend_collection.append(
                token) if token not in trend_collection else False
            ix = token_list.index(token)
            token_list[ix] = ''
            continue
        if token.strip():
            tokens.append(token)
    del token_list
    return (tokens, url_collection, handle_collection, trend_collection)


### Function to clean and structure the text data

In [0]:
def structure_text(entire_str):
    separations = dict()
    clean_str = remove_emojis(entire_str)
    tokens, links, handles, tags = extract_components(clean_str)
    clean_str = (" ".join(tokens)).strip()
    print("clean_str = {}".format(clean_str))
    cleaner_str = (re.subn("[:-]+", "", clean_str))[0].strip()
    try:
        cleaner_str = unescape(translate_keyword(cleaner_str))
    except Exception as e:
        print("Exception occurred: {}".format(e))
        sleep(5)
        try:
            cleaner_str = unescape(translate_keyword(cleaner_str))
        except Exception as e:
            cleaner_str = ""
    print("cleaner_str = {}".format(cleaner_str))
    separations['text'] = cleaner_str
    separations['links'] = ",".join(links)
    separations['handles'] = ",".join(handles)
    separations['tags'] = ",".join(tags)
    return separations

### Function to include a zero for date values less than 10

In [0]:
format_date_str = lambda s: "0{}".format(s) if int(s) < 10 else "{}".format(s)

### Function to return a formatting of current datetime (for file names)

In [0]:
today = lambda: "{}-{}-{}_{}-{}".format(dt.now().year,
                                format_date_str(dt.now().month),
                                format_date_str(dt.now().day),
                                format_date_str(dt.now().hour),
                                format_date_str(dt.now().minute))
today_date = today().split("_")[0]
today_date

'2019-11-28'

### Function to format datetime as string (time separated with a ":")

In [0]:
def extract_datetime_string(datetime_obj):
    if datetime_obj:
        return "{}-{}-{} {}:{}:{}".format(datetime_obj.year,
                                          format_date_str(datetime_obj.month),
                                          format_date_str(datetime_obj.day),
                                          format_date_str(datetime_obj.hour),
                                          format_date_str(datetime_obj.minute),
                                          format_date_str(datetime_obj.second))

### Functions to create and validate custom dates

In [0]:
def get_month_days_map(year, month=None):
    month_days_map = {(1, 3, 5, 7, 8, 10, 12): 31, (4, 6, 9, 11): 30}
    if year % 4 == 0:
        month_days_map.update({(2,): 29})
    else:
        month_days_map.update({(2,): 28})
    month = 12 if month == 0 else month
    if month:
        for months, n_days in month_days_map.items():
            if month in months:
                return n_days
    return month_days_map

In [0]:
def validate_custom_date(year, month, day):
    print("Date entered: {}-{}-{}".format(year, 
                                          format_date_str(month), 
                                          format_date_str(day)))
    n_days = get_month_days_map(year, month=month)
    if month not in range(1, 13):
        print("Invalid month entered!")
        return False
    if day not in range(1, n_days+1):
        print("Invalid day entered!")
        return False
    if year > dt.now().year and month > dt.now().month and day > dt.now().day:
        print("Wrong date entered! Please enter correct date values.")
        return False
    elif year > dt.now().year:
        print("Invalid year entered!")
        return False
    elif year == dt.now().year:
        if month > dt.now().month:
            print("Invalid month entered!")
            return False
        if day > dt.now().day:
            print("Invalid day entered!")
            return False
    return True

In [0]:
def get_custom_date(year=dt.now().year, month=dt.now().month, day=dt.now().day):
    date_valid = validate_custom_date(year, month, day)
    custom_date = ""
    if date_valid:
        from_day = diff_day = day - 7
        from_mon = month
        from_year = year
        prev_n = get_month_days_map(year, month - 1)
        if diff_day < 0:
            from_day = diff_day % prev_n
            from_mon = month - 1 if month != 1 else 12 
            from_year = year - 1 if month == 1 else year
        elif diff_day == 0:
            from_day = prev_n
            from_mon = month - 1 if month != 1 else 12
            from_year = year - 1 if month == 1 else year
        print("from_day = {}".format(from_day))
        custom_date = "{}-{}-{}".format(from_year, 
                                        format_date_str(from_mon), 
                                        format_date_str(from_day)) 
    return custom_date

get_custom_date()

Date entered: 2019-11-28
from_day = 21


'2019-11-21'

### Fetch Twitter Premium Search API credentials

In [0]:
creds_file = '/gdrive/My Drive/Political Risk Project/TwitterCredentials/Bckp/twitter_creds.yaml'
search_tweets_api = 'search_tweets_30_day_dev'
premium_search_args = load_credentials(filename=creds_file,
                                       yaml_key=search_tweets_api,
                                       env_overwrite=False)

Grabbing bearer token from OAUTH


### Defining all the filters, queries and user handles

In [0]:
it_query_filter = "-has:media place_country:IT"
it_en_query_filter = "-has:media place_country:IT lang:en"
generic_query_filter = "-has:media"
queries = ["early election", "snap election", "government collapse", 
           "government coalition", "election", "instability", "uncertainty",
           "crisis", "coalition"]
queries_it = [q for q in map(lambda s: translate_keyword(s, "it"), queries)]

from_users = ["lorepregliasco", "FerdiGiugliano", "AlbertoNardelli", 
              "gavinjones10"]

In [0]:
# print(queries)
# print(queries_it)

['early election',
 'snap election',
 'government collapse',
 'government coalition',
 'election',
 'instability',
 'uncertainty',
 'crisis',
 'coalition',
 'elezioni anticipate',
 'elezioni anticipate',
 'crollo del governo',
 'coalizione di governo',
 'elezione',
 'instabilit√†',
 'incertezza',
 'crisi',
 'coalizione']

### Twitter API Usage Monitor

In [0]:
def get_api_usage(date):
    if os.path.lexists(Twitter_API_usage_file):
        api_usage = pd.read_json(Twitter_API_usage_file, orient='records')
        print(api_usage)
        print(list(api_usage.columns))
        if date in api_usage.columns:
            return api_usage, api_usage[date].squeeze()
        else:
            # update_api_count(date, 0)
            return api_usage, 0
    return None, None


def update_api_count(date, count_add):
    api_usage, count = get_api_usage(date)
    if not api_usage.empty:
        updated_count = 0
        print(list(api_usage.columns))
        if date in api_usage.columns:
            print("Month present\n")
            api_usage[date] += count_add
            updated_count = api_usage[date].squeeze()
        else:
            print("Month absent\n")
            api_usage[date] = 1
        print("api_usage = {}".format(api_usage[date]))
        api_usage.to_json(Twitter_API_usage_file, 
                          orient='records', 
                          date_format='iso')
        sleep(3)
        print("Updating new API usage...")
        _, new_count = get_api_usage(date)
        if new_count != updated_count:
            print("...")
            drive.mount("/gdrive", force_remount=True)
        _, new_count = get_api_usage(date)
        # print(new_count)
        # while new_count != updated_count:
        #     print("Updating new API usage...")
        #     _, new_count = get_api_usage(date)
        print("API usage count updated! Current usage: {}".format(new_count))


### NewsAPI Usage Monitor

In [0]:
def get_news_api_usage(date):
    if os.path.lexists(News_API_usage_file):
        api_usage = pd.read_json(News_API_usage_file, orient="records")
        if date in api_usage.columns:
            return api_usage, api_usage[date].squeeze()
        else:
            # update_news_api_count(date)
            return api_usage, 1
    return None, None


def update_news_api_count(date):
    api_usage, count = get_news_api_usage(date)
    if not api_usage.empty:
        updated_count = 0
        if date in api_usage.columns:
            api_usage[date] += 1
            updated_count = api_usage[date].squeeze()
        else:
            api_usage[date] = 0
        api_usage.to_json(News_API_usage_file, orient="records", date_format="iso")
        sleep(3)
        _, new_count = get_news_api_usage(date)
        print("Updating new API usage...")
        # print(new_count)
        if new_count != updated_count:
            print("...")
            drive.mount("/gdrive", force_remount=True)
        _, new_count = get_news_api_usage(date)
        print("API usage count updated! Current usage: {}".format(new_count))


### Fetch NewsAPI Credentials

In [0]:
def get_api_key():
    src_ = "/gdrive/My Drive/Political Risk Project/NewsAPIKey/"
    with open(os.path.join(src_, "news_api_key.json")) as key:
        obj = json.load(key)
        return obj["api_key"]


newsapi = NewsApiClient(api_key=get_api_key())

### Initialize News API variables

In [0]:
news_sources = ",".join(["reuters", "ansa", "google-news-it"])
print(news_sources)
# queries = ["early election", "snap election", "government collapse", 
#            "government coalition", "election", "instability", "uncertainty",
#            "crisis", "coalition"]
country = "it"
page_size = 100
endpoint = "everything" # "top-headlines"

reuters,ansa,google-news-it


### Fetch News Headlines using the News API

In [0]:
def fetch_news(queries, sources, page_size=100, **kwargs):
    top_headlines_map = dict()
    # saved_files = list()
    news_collection = pd.DataFrame()
    for query in queries:
        headlines = newsapi.get_everything(q=query,
                                           sources=sources,
                                           sort_by="relevancy",
                                           page_size=page_size)
        if headlines["totalResults"] > 0:
            # response_text = json.loads(headlines.text)
            all_articles = headlines["articles"]
            collection = list()
            for article in all_articles:
                data_ = dict()
                source = article.pop("source")
                content = article.pop("content")
                source = {"source": source["id"]}
                article.update(source)
                collection.append(article)
            data_df = pd.DataFrame(data=collection)
            if not data_df.empty:
                match_str = "({})".format(query)
                reqd_df = data_df[data_df["description"].str.contains(
                    match_str)]
                if news_collection.empty:
                    news_collection = reqd_df
                else:
                    news_collection = news_collection.append(reqd_df)
                # responses_file = "{}_on_{}.csv".format(endpoint,
                #                                       query.replace(" ", "_"))
                # top_headlines_map[query] = reqd_df
            
                # out_file = os.path.join(output_directory, responses_file)
                # data_df.to_csv(out_file, index=False)
            update_news_api_count(today())
    return news_collection


### Functions to fetch tweets by querying the Twitter API with the given queries and filters

In [0]:
def get_user_queries_filter(queries, from_users):
    user_queries_filter = '("{}") from:{} -has:media'
    user_queries_list = list()
    for query in queries:
        for user in from_users:
            filter_ = user_queries_filter.format(query, user)
            user_queries_list.append(filter_)
    return user_queries_list


def get_user_specific_filter(from_users):
    user_specific_filter =  'from:{} -has:media'
    user_specific_list = list()
    # for query in queries:
    for user in from_users:
        filter_ = user_specific_filter.format(user)
        user_specific_list.append(filter_)
    return user_specific_list


def get_tweets(query_set, query_filter=None):
    tweets_list = list()
    for query in query_set:
        curr_month = "{}-{}".format(dt.now().year, 
                                    format_date_str(dt.now().month))
        _, curr_usage = get_api_usage(curr_month)
        if curr_usage >= 24999:
            print("Twitter API limit is about to exceed! Returning now ...\n")
            break
        if query_filter:
            q = '("{}") {}'.format(query, query_filter)
        else:
            q = "{}".format(query)
            print("No filter/Filter in query_set: {}".format(q))
        print("Collecting for {}".format(q))
        try:
            rule = gen_rule_payload(q, results_per_call=100)
            tweets = collect_results(rule,
                                    max_results=100,
                                    result_stream_args=premium_search_args)
            print(len(tweets))
            update_api_count(curr_month, len(tweets))
            tweets_list.append(tweets)

        except Exception as e:
            print("Exception occurred while fetching tweets: {}".format(e))
            break
    return tweets_list


In [0]:
user_specific_queries = get_user_specific_filter(from_users)
user_specific_queries

['from:lorepregliasco -has:media',
 'from:FerdiGiugliano -has:media',
 'from:AlbertoNardelli -has:media',
 'from:gavinjones10 -has:media']

### Process the tweets to produce a collection of tweet-text, hashtags, links, and the date-time when they were created.

In [0]:
def process_tweets(tweets_list):
    tweets_collection = pd.DataFrame()
    for tweets in tweets_list:
        for tweet in tweets:
            structured_tweets = structure_text(tweet.all_text)
            structured_tweets.update({"created_time":tweet.created_at_datetime})
            reqd_df = pd.DataFrame([structured_tweets])

            if tweets_collection.empty:
                tweets_collection = reqd_df
            else:
                tweets_collection = tweets_collection.append(reqd_df)
    return tweets_collection

### To save the output of the tweet-information into the categories, General or Italy

In [0]:
def save_output(tweets_collection, 
                curr_datetime,
                category="General", 
                subset="predictions"):
    if category in ["General", "Italy"]:
        file_name = 'relevant_{}_{}.csv'.format(subset, curr_datetime)
        tweets_collection.to_csv(
            os.path.join(tw_output_directory, category, file_name),
            index=False)
    else:
        print("Output Directory is not valid!")

In [0]:
def get_saved_versions(name, location):
    files = os.listdir(location)
    found_files = list()
    for f in files:
        match = re.search("({})\-(\d+)\.csv$".format(name), f)
        if match:
            f_groups = match.groups()
            saved_version = f_groups[-1]
            map_ = {"file": f, "version": int(saved_version)}
            found_files.append(map_)
    if found_files:
        versions_ = pd.DataFrame(found_files)
        return versions_
    else:
        return pd.DataFrame()


def save_output_version(data, name, location, version=None):
    filename_format = "{name}-{version}.csv"
    next_version = int(version) if version else 1
    all_versions = get_saved_versions(name, location)
    if not all_versions.empty:
        next_version = 1 + int(all_versions["version"].max())
        if version and version >= next_version:
            next_version = version
    new_name = filename_format.format(name=name, version=next_version)
    data.to_csv(os.path.join(location, new_name), index=False)
    return new_name


In [0]:
# get_saved_versions("it-it-tweets", os.path.join(tw_output_directory, "Italy"))
get_saved_versions("en-it-news", nh_output_directory)

Unnamed: 0,file,version
0,en-it-news-1.csv,1
1,en-it-news-2.csv,2


## Deployed model

### Endpoint URL to query the model in real time

In [0]:
enpoint_uri = "http://ac6a2064dee3c11e99ced0a13821e56d-733867741.ap-southeast-1.elb.amazonaws.com/sentiment/classifier"
headers = {"content-type": "application/json"}

### Get polarity of a piece of text

In [0]:
def get_polarity(sentence):
    data = json.dumps({"polarity": sentence})
    response = requests.post(enpoint_uri, data=data, headers=headers)
    return response.json() if response.status_code == 200 else ""

## Get and Process Tweets from a set of users

### Get tweets posted by a list of users

In [0]:
user_specific_tweets = get_tweets(query_set=user_specific_queries)

No filter/Filter in query_set: ("early election") from:lorepregliasco -has:media
No filter/Filter in query_set: ("early election") from:FerdiGiugliano -has:media
No filter/Filter in query_set: ("early election") from:AlbertoNardelli -has:media
No filter/Filter in query_set: ("early election") from:gavinjones10 -has:media
No filter/Filter in query_set: ("snap election") from:lorepregliasco -has:media
No filter/Filter in query_set: ("snap election") from:FerdiGiugliano -has:media
No filter/Filter in query_set: ("snap election") from:AlbertoNardelli -has:media
No filter/Filter in query_set: ("snap election") from:gavinjones10 -has:media
No filter/Filter in query_set: ("government collapse") from:lorepregliasco -has:media
No filter/Filter in query_set: ("government collapse") from:FerdiGiugliano -has:media
No filter/Filter in query_set: ("government collapse") from:AlbertoNardelli -has:media
No filter/Filter in query_set: ("government collapse") from:gavinjones10 -has:media


### Process the tweets

In [0]:
user_tweets_collection = process_tweets(user_specific_tweets)
user_tweets_collection

### Clean the DataFrame of tweets

In [0]:
user_tweets_collection = user_tweets_collection.reset_index().drop(columns=['index'])
duplicacy_subset = list(set(user_tweets_collection.columns) - set(["created_time"]))
user_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
user_tweets_collection.dropna(subset=["text"])
# user_tweets_collection.dropna(subset=["text"])

Unnamed: 0,created_time,handles,links,tags,text
0,2019-10-22 08:56:15,"@PaoloZanetto,@lorepregliasco,@Baselice,,@Agen...","https://t.co/8c4rOIxKIl,https://t.co/dhSP6tfxr...",,"Today in Rome, beautiful things! Inside Politi..."
1,2019-10-22 08:50:23,@LeFrecce,,,"9619 train, stopped for half an hour in the tu..."
2,2019-10-22 08:05:33,@LeFrecce,,,"Thank you, all right on board, but the 'delay'..."
3,2019-10-22 07:57:54,@StarWars:,"https://t.co/jjhAMAufLW,https://t.co/MLbzRXrCJ...","#TheRiseofSkywalker,#TheRiseOfSkywalker","The saga will end, the story lives forever. Wa..."
4,2019-10-22 07:55:39,,https://t.co/P26Mx5UtTM,,"Beautiful things, this afternoon in Rome!"
5,2019-10-22 07:46:41,@LeFrecce,,#neolingua,"""Travel time 20 minutes longer than expected""...."
6,2019-10-22 05:38:53,"@chedisagio,@JustinTrudeau,@realDonaldTrump",,,Colleges
7,2019-10-22 05:00:05,,https://t.co/4Orf3CJibI,"#Canada:,#canadavotes2019,#MaratonaYouTrend",Liberals and Democrats (NDP) would currently l...
8,2019-10-22 04:47:43,,,"#Canada:,#canadavotes2019,#MaratonaYouTrend","among the colleges to be assigned, the 2 most ..."
9,2019-10-22 04:44:45,@ElectionsCan_E,,"#Canada,#canadavotes2019,#MaratonaYouTrend",For this it would be the definitive distributi...


## Get and Process Tweets from Italy

### Clean the DataFrame of tweets from Italy

In [0]:
italy_tweets_collection = italy_tweets_collection.reset_index().drop(columns=['index'])
duplicacy_subset = list(set(italy_tweets_collection.columns) - set(["created_time"]))
italy_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
italy_tweets_collection

Unnamed: 0,text,links,handles,tags,created_time
0,How will an early election stop us crashing ou...,,"@DanielMRussell,@Boeing7G7",,2019-10-27 10:07:35
1,"People voted just a year and a half ago, there...",,"@blackhvndreds,@EuropeElects",,2019-10-24 08:56:38
2,Elezioni anticipate I saw this on the BBC and ...,https://t.co/yr67geg95b,,,2019-10-19 22:24:18
3,Appello I saw this on the BBC and thought you ...,https://t.co/gLzAV9I6eT,,,2019-11-14 06:47:18
4,Still waiting for a witness to state unequivoc...,"https://t.co/M4KI5OrAJB,https://t.co/iBKZRxamlN",,,2019-11-13 19:40:41
...,...,...,...,...,...
218,Governo di minoranza saw this on the BBC and t...,https://t.co/0H1q5ofGKO,,,2019-10-23 21:59:48
219,Big takeaway electoral reform. You cannot hav...,,,#cdnpoli,2019-10-22 07:55:28
220,Chance of a full blown coalition between Liber...,,@reicurran,,2019-10-22 03:06:55
221,New Coalition Is Already Fighting Over Its Nex...,https://t.co/P2JjsIHwR4,,#Italy's,2019-10-21 09:46:54


In [0]:
save_output_version(data=italy_tweets_collection,
                    name="en-it-tweets",
                    location=tw_output_directory)

'en-it-tweets-1.csv'

### Attach predictions of the tweets into the DataFrame

In [0]:
italy_tweets_collection["polarity_v1"] = italy_tweets_collection["text"].apply(get_polarity)

In [0]:
italy_tweets_collection

Unnamed: 0,created_time,handles,links,tags,text,polarity_v1
0,2019-10-19 22:24:18,,https://t.co/yr67geg95b,,Elezioni anticipate I saw this on the BBC and ...,positive
0,2019-09-29 07:39:34,,https://t.co/kkhsWN5HHV,#Austria,Un voto importante con incognite di coalizione...,positive
0,2019-09-29 07:25:09,@WoodleyNLP,,,I'd like to know how he's stumbled upon a magi...,negative


### Save output of the collection of tweets in Google Drive
Folder location: https://drive.google.com/open?id=1iX7i_jarE7hhOVf68p-KzUECrEG59u4T

In [0]:
save_output(italy_tweets_collection, 
            category="Italy",
            subset="predictions", 
            curr_datetime=today())

## Get generic tweets (relevant to the keywords)

### Get all tweets that contain the keywords like *snap election*, *early election*, *political instability*

In [0]:
generic_tweets = get_tweets(queries, generic_query_filter)

Collecting for ("early election") -has:media
Collecting for ("snap election") -has:media
Collecting for ("government collapse") -has:media
Collecting for ("government coalition") -has:media
Collecting for ("election") -has:media
Collecting for ("instability") -has:media
Collecting for ("uncertainty") -has:media
Collecting for ("crisis") -has:media
Collecting for ("coalition") -has:media


### Process all the generic tweets

In [0]:
generic_tweets_collection = process_tweets(generic_tweets)

clean_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
cleaner_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
clean_str = was weighing whether to push for an early or try again to pass his stalled after blocked a fast-track plan to approve his before...
cleaner_str = was weighing whether to push for an early or try again to pass his stalled after blocked a fasttrack plan to approve his before...
clean_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
cleaner_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
clean_str = On this, v interesting piece which says that the public actually want a general election, those who don't are (generally) politicians and political journalists who think the public don't either.
cleaner_str = On this, v interesting piece which says that the public actually want a general election, those who don't 

### Clean the DataFrame of generic tweets

In [0]:
generic_tweets_collection = generic_tweets_collection.reset_index().drop(columns=['index'])
duplicacy_subset = list(set(generic_tweets_collection.columns) - set(["created_time"]))
generic_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
generic_tweets_collection

Unnamed: 0,created_time,handles,links,tags,text
0,2019-10-23 16:18:11,,https://t.co/HGxdrMJWy8,,Salmond's November rape Trial could be a reaso...
1,2019-10-23 16:17:06,,https://t.co/ZOXs5XqLNb,"#British,#PrimeMinister,#BorisJohnson,#electio...",was weighing whether to push for an early or t...
3,2019-10-23 16:14:06,,https://t.co/pYf5OrjJ5k,,"On this, v interesting piece which says that t..."
4,2019-10-23 16:13:19,,https://t.co/P8zlqny5Cc,,Boris Johnson mulls early election over Brexit...
6,2019-10-23 16:12:01,"@LoyalDefender2K,@Ali_H_Smile,@SkyNews",,,An early election is important. Let's hope we ...
18,2019-10-23 15:56:28,,https://t.co/ZgKuNTEyCW,,Oh dear. It's not a matter of opinion the PM c...
24,2019-10-23 15:53:03,,,,I would love there to be an election before Xm...
25,2019-10-23 15:51:12,"@PoliticalDataUK,@davidallengreen",,,Not sure. It all depends on how you spin it to...
30,2019-10-23 15:47:37,@ABC,https://t.co/agPUy6B3wm,,UK prime minister mulls early election over Br...
38,2019-10-23 15:34:46,"@leebee999,@BlueArmyFaction","https://t.co/HGxdrMJWy8,https://t.co/jF3yXOpvxH",,Salmond's November rape Trial could be a reaso...


### Save the generic Twitter results to Google Drive

In [0]:
save_output(generic_tweets_collection, subset="tweets", curr_datetime=today())

### Merge the predictions of tweets in the DataFrame

In [0]:
generic_tweets_collection["polarity_v1"] = generic_tweets_collection["text"].apply(get_polarity)

### Save the predicted generic tweets in Google Drive
Folder location: https://drive.google.com/open?id=13FRvvDetM4Bcsdw9dblKuEr9Tyz219_A

In [0]:
save_output(generic_tweets_collection, 
            subset="predictions", 
            curr_datetime=today())

## Fetch tweets (en-it, it-it)

### Get and save tweets specific to the country of origin (English)

In [0]:
en_it_tweets = get_tweets(queries, it_query_filter)
en_it_tweets_collection = process_tweets(en_it_tweets)
en_it_tweets_collection = en_it_tweets_collection.reset_index().drop(columns=['index'])
duplicacy_subset = list(set(en_it_tweets_collection.columns) - set(["created_time"]))
en_it_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
en_it_tweets_collection

   2019-10-01T00:00:00.000Z  2019-11
0                     24000     7306
['2019-10-01T00:00:00.000Z', '2019-11']
Collecting for ("early election") -has:media place_country:IT
0
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     7306
['2019-10-01T00:00:00.000Z', '2019-11']
['2019-10-01T00:00:00.000Z', '2019-11']
Month present

api_usage = 0    7306
Name: 2019-11, dtype: int64
Updating new API usage...
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     7306
['2019-10-01T00:00:00.000Z', '2019-11']
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     7306
['2019-10-01T00:00:00.000Z', '2019-11']
API usage count updated! Current usage: 7306
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     7306
['2019-10-01T00:00:00.000Z', '2019-11']
Collecting for ("snap election") -has:media place_country:IT
0
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     7306
['2019-10-01T00:00:00.000Z', '2019-11']
['2019-10-01T

Unnamed: 0,text,links,handles,tags,created_time
0,"Just days after Sri Lanka's election, hundreds...","https://t.co/3kY5kdmdxL,https://t.co/ptynkU5UQ...",,,2019-11-28 05:45:35
1,So this isn't true is it then? It shows what t...,"https://t.co/AWFSFblTNt,https://t.co/ZXCeCPQvMq",,,2019-11-27 19:02:45
2,"Oh please, They had an impeachment hearing fil...",,"@MadBlackTitan,@DuaneGundrum,@RilsRislan,@ABC",,2019-11-27 07:23:30
3,Xiang and Kung were referred to the Taipei Dis...,https://t.co/F1o49mlWuf,,,2019-11-26 20:42:37
4,!!! ISN'T IT IRONIC ??? Boris' selfdeprication...,"https://t.co/7PNTneeQDN,https://t.co/zaYJhEc9qs",,"#Millenials,#ToryCoreVote:",2019-11-26 18:40:59
...,...,...,...,...,...
212,Global Coalition to Defeat ISIS weighs next st...,https://t.co/xFxHnvASfY,@ShareAmerica,,2019-11-14 07:24:09
213,I don't believe the will support a 2nd referen...,,"@aaronjamesf1,@BarbaraNairn","#LibDems,#Brexit,#Tory,#Beware",2019-11-11 20:02:06
214,!!! Need coalition with Tories to keep Brexit ...,,"@DMDent,@brexitparty_uk",,2019-11-07 08:06:42
215,!!! BLIND !!! ERG Tories CAN'T SEE that HUNG P...,https://t.co/CUar4Q7iCK,@brexitparty_uk,,2019-11-03 22:46:27


In [0]:
save_output_version(data=en_it_tweets_collection,
                    name="en-it-tweets",
                    location=tw_output_directory)

'en-it-tweets-3.csv'

### Get and save tweets from Italy (Italian)

In [0]:
it_it_tweets = get_tweets(queries_it, it_query_filter)
it_it_tweets_collection = process_tweets(it_it_tweets)
it_it_tweets_collection = it_it_tweets_collection.reset_index().drop(columns=['index'])
duplicacy_subset = list(set(it_it_tweets_collection.columns) - set(["created_time"]))
it_it_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
it_it_tweets_collection

   2019-10-01T00:00:00.000Z  2019-11
0                     24000     6953
['2019-10-01T00:00:00.000Z', '2019-11']
Collecting for ("elezioni anticipate") -has:media place_country:IT
12
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     6953
['2019-10-01T00:00:00.000Z', '2019-11']
['2019-10-01T00:00:00.000Z', '2019-11']
Month present

api_usage = 0    6965
Name: 2019-11, dtype: int64
Updating new API usage...
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     6965
['2019-10-01T00:00:00.000Z', '2019-11']
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     6965
['2019-10-01T00:00:00.000Z', '2019-11']
API usage count updated! Current usage: 6965
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     6965
['2019-10-01T00:00:00.000Z', '2019-11']
Collecting for ("elezioni anticipate") -has:media place_country:IT
12
   2019-10-01T00:00:00.000Z  2019-11
0                     24000     6965
['2019-10-01T00:00:00.000Z', '2019-11']


Unnamed: 0,text,links,handles,tags,created_time
0,"On. Forget that Bettini has1) theorized the ""c...",,"@cpetruccioli,@LaStampa,@pdnetwork,@nzingaretti","#ItaliaViva.,#pd??",2019-11-27 12:59:14
1,Bravo you reduced this incapable but cunning a...,,"@Mov5Stelle,@luigidimaio,@beppe_grillo","#Grillo,#Salvini,#DiMaio,#DiMaiocialtrone",2019-11-23 12:32:30
2,All this crowd and for early elections.,https://t.co/MNWtAS5MGV,,,2019-11-15 21:29:19
3,The League sees the early elections coming clo...,https://t.co/RJIfNZxf06,,#agi,2019-11-15 15:51:24
4,Yet another joke !!!! The M5S parliamentarians...,,@Corriere,,2019-11-13 08:33:51
...,...,...,...,...,...
347,"Have you heard any TV talk about this, yet Ber...",https://t.co/iGytQ1pkdD,,,2019-10-29 21:28:33
348,Report becomes the umpteenth firearm of a coal...,,"@GiorgiaMeloni,@reportrai3,@MarcelloFoa,@abara...",,2019-10-29 19:33:46
350,At the moment I don't think there is a coaliti...,,@matteosalvinimi....,,2019-10-29 16:27:06
351,And if you give these premises and correct to ...,,,,2019-10-29 14:36:38


In [0]:
save_output_version(data=it_it_tweets_collection,
                    name="it-it-tweets",
                    location=os.path.join(tw_output_directory, "Italy"))

'it-it-tweets-3.csv'

## Collect Data

### Correct Headers of files with inconsistent column names

In [0]:
def file_valid(file_path):
    if os.path.isfile(file_path):
        if re.search(".*\.csv$", file_path):
            return file_path
    return ""


def correct_files(file_collection):
    for filename in file_collection:
        if filename:
            df = pd.read_csv(filename, encoding="utf-8")
            df.rename(columns={df.columns[0]: "text"}, 
                    inplace=True)
            df.to_csv(filename, index=False)


files_to_correct = [f for f in map(lambda s:
    file_valid(os.path.join(tw_output_directory, s)),
    os.listdir(tw_output_directory))]

correct_files(files_to_correct)

# file_ = pd.read_csv(os.path.join(tw_output_directory, 
#                                  "relevant_tweets_output_2.csv"),
#                     encoding="utf-8")
# file_.rename(columns={"headline_text": "text"}, inplace=True)
# file_

### Fetch en-it News Headlines

In [0]:
en_fresh_headlines = fetch_news(queries, news_sources)
if not en_fresh_headlines.empty:
    en_news_collection = en_fresh_headlines.reset_index().drop(columns=["index"])
    en_news_collection.to_csv(os.path.join(nh_output_directory, 
                                        "news_{}.csv".format(today())),
                            index=False)
    print("en_news_collection")
    print(en_news_collection.columns)

  return func(self, *args, **kwargs)


Updating new API usage...
API usage count updated! Current usage: 0
Updating new API usage...
API usage count updated! Current usage: 1
Updating new API usage...
API usage count updated! Current usage: 2
Updating new API usage...
API usage count updated! Current usage: 3
Updating new API usage...
API usage count updated! Current usage: 4
Updating new API usage...
API usage count updated! Current usage: 5
Updating new API usage...
API usage count updated! Current usage: 6
Updating new API usage...
API usage count updated! Current usage: 0
Updating new API usage...
API usage count updated! Current usage: 1
en_news_collection
Index(['author', 'title', 'description', 'url', 'urlToImage', 'publishedAt',
       'source'],
      dtype='object')


In [0]:
save_output_version(data=en_news_collection, 
                    name="en-it-news", 
                    location=nh_output_directory)

'en-it-news-3.csv'

### Fetch it-it News Headlines

In [0]:
fresh_headlines = fetch_news(queries_it, news_sources)
if not fresh_headlines.empty:
    it_news_collection = fresh_headlines.reset_index().drop(columns=["index"])
    it_news_collection.to_csv(os.path.join(nh_output_directory, 
                                        "news_{}.csv".format(today())),
                            index=False)
    print("it_news_collection")
    print(it_news_collection.columns)

  return func(self, *args, **kwargs)


Updating new API usage...
API usage count updated! Current usage: 2
Updating new API usage...
API usage count updated! Current usage: 3
Updating new API usage...
API usage count updated! Current usage: 4
Updating new API usage...
API usage count updated! Current usage: 5
Updating new API usage...
API usage count updated! Current usage: 6
Updating new API usage...
API usage count updated! Current usage: 7
Updating new API usage...
API usage count updated! Current usage: 8
Updating new API usage...
API usage count updated! Current usage: 9
it_news_collection
Index(['author', 'title', 'description', 'url', 'urlToImage', 'publishedAt',
       'source'],
      dtype='object')


In [0]:
save_output_version(data=it_news_collection, 
                    name="it-it-news", 
                    location=nh_output_directory)

'it-it-news-3.csv'

### Collect and combine previously saved and fresh data from Twitter and NewsAPI (and also from Firebolt)

In [0]:
def get_previous_data(file_loc, column_list=None):
    # file_name = ""
    # file_loc = os.path.join(DRIVE_LOCATION, PREV_DATA_DIR, file_name)
    prev_data = pd.read_csv(file_loc)
    if not prev_data.empty:
        return prev_data[column_list] if column_list else prev_data
    return None


def get_fresh_data(tweets, headlines):
    processed_tweets = None
    news_collection = None
    if tweets:
        fresh_data = dict()
        # get fresh tweets:
        # tweets = get_tweets(queries, query_filter)
        processed_tweets = process_tweets(tweets)
        processed_tweets = processed_tweets.reset_index().drop(
            columns=['index'])
        print("processed_tweets")
        print(processed_tweets.columns)
        duplicacy_subset = list(
            set(processed_tweets.columns) - {"created_time"})
        processed_tweets.drop_duplicates(subset=duplicacy_subset, inplace=True)
        processed_tweets.to_csv(os.path.join(tw_output_directory, 
                                             "tweets_{}.csv".format(today())),
                                index=False)
    # get fresh news headlines:
    # headlines = fetch_news(queries, sources, ...)
    if not headlines.empty:
        news_collection = headlines.reset_index().drop(columns=["index"])
        news_collection.to_csv(os.path.join(nh_output_directory, 
                                            "news_{}.csv".format(today())),
                               index=False)
        print("news_collection")
        print(news_collection.columns)
    fresh_data["tweets"] = processed_tweets
    fresh_data["headlines"] = news_collection
    return fresh_data


def frame_files(files):
    final_df = pd.DataFrame()
    for f in files:
        if os.path.exists(f) and os.path.isfile(f):
            inter_df = pd.read_csv(f, encoding="UTF-8")
            final_df = final_df.append(inter_df)
    return final_df


def combine_data_offline(tweets_files, news_files, prev_files_map=None):
    processed_headlines = processed_tweets = prev_data_df = pd.DataFrame()
    # tweets_files = "it-it-tweets" if not tweets_files else tweets_files
    # news_files = "it-it-news" if not news_files else news_files

    if prev_files_map:
        for filename, column in prev_files_map.items():
            file_df = get_previous_data(filename, column)
            print("prev_data = {}".format(file_df.shape))
            if prev_data_df.empty:
                prev_data_df = file_df
            else:
                prev_data_df = prev_data_df.append(file_df)
        print("combined prev_data = {}".format(prev_data_df.shape))
        prev_data_df = prev_data_df.reset_index().drop(columns=["index"])
        print("post-reset prev_data = {}".format(prev_data_df.shape))
        prev_data_df.rename(columns={prev_data_df.columns[0]: "text"}, 
                            inplace=True)
        prev_data_df.drop_duplicates(inplace=True)
        print("post-removing duplicates prev_data = {}".format(prev_data_df.shape))
    print("final prev_data = {}".format(prev_data_df.shape))
    
    tw_text = nh_text = list()
    saved_tweets = get_saved_versions(tweets_files["type"], 
                                      tweets_files["path"])
    print("\nFetching it-it Tweets")
    if not saved_tweets.empty:
        tweets_ = list(saved_tweets["file"])
        processed_tweets = pd.DataFrame()
        for tweet_ in tweets_:
            print("reading {}".format(os.path.join(tweets_files["path"], tweet_)))
            df_ = pd.read_csv(os.path.join(tweets_files["path"], tweet_),
                              encoding="UTF-8")
            print("{}: shape = {}".format(tweet_, df_.shape))
            processed_tweets = processed_tweets.append(df_)
        if not processed_tweets.empty:
            tw_text = list(processed_tweets["text"])
    print("len(tw_text): {}".format(len(tw_text)))
    
    saved_news = get_saved_versions(news_files["type"], news_files["path"])
    print("\nFetching it-it News Headlines")
    if not saved_news.empty:
        news_ = list(saved_news["file"])
        processed_headlines = pd.DataFrame()
        for headline_ in news_:
            print("reading {}".format(os.path.join(news_files["path"], headline_)))
            df_ = pd.read_csv(os.path.join(news_files["path"], headline_),
                              encoding="UTF-8")
            print("{}: shape = {}".format(headline_, df_.shape))
            processed_headlines = processed_headlines.append(df_)
        if not processed_headlines.empty:
            nh_text = list(processed_headlines["url"])
    print("len(nh_text): {}".format(len(nh_text)))
    tw_text.extend(nh_text)
    if not prev_data_df.empty:
        base_ = list(prev_data_df["text"])
        base_.extend(tw_text)
    else:
        base_ = tw_text
    print("len(base_): {}".format(len(base_)))
    final_df = pd.DataFrame(base_, columns=["data"])
    final_df.drop_duplicates(inplace=True)
    return final_df


def combine_data(prev_files):
    prev_data_df = pd.DataFrame()
    for filename, column in prev_files.items():
        file_df = get_previous_data(filename, column)
        if prev_data_df.empty:
            prev_data_df = file_df
        else:
            prev_data_df = prev_data_df.append(file_df)
    prev_data_df = prev_data_df.reset_index().drop(columns=["index"])
    prev_data_df.rename(columns={prev_data_df.columns[0]: "text"}, inplace=True)
    prev_data_df.drop_duplicates(inplace=True)
    # return prev_data_df
    # get fresh data
    twitter_keywords = queries
    selected_filter = it_query_filter
    tweets = get_tweets(twitter_keywords, selected_filter)
    news_keywords = twitter_keywords
    # news_sources = []
    headlines = fetch_news(news_keywords, news_sources)
    fresh_data = get_fresh_data(tweets, headlines)
    combined_data = prev_data_df.append(fresh_data["tweets"]["text"]).append(
        fresh_data["headlines"]["url"])
    return combined_data


In [0]:
tweets_files_it = {"type": "it-it-tweets", 
                "path": os.path.join(tw_output_directory, "Italy")}
news_files_it = {"type": "it-it-news", 
                "path": nh_output_directory}
search_results_path = os.path.join(DRIVE_LOCATION, 
                                   "Search-Results-{}.csv".format(
                                       today().split("_")[0]))
search_results_map = {search_results_path: "link"}
search_combined_it_it = combine_data_offline(tweets_files_it, 
                                              news_files_it,
                                              search_results_map)
search_combined_it_it["rating"] = ""


prev_data = (105,)
combined prev_data = (105,)
post-reset prev_data = (105, 1)
post-removing duplicates prev_data = (105, 1)
final prev_data = (105, 1)

Fetching it-it Tweets
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/Italy/it-it-tweets-1.csv
it-it-tweets-1.csv: shape = (160, 5)
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/Italy/it-it-tweets-2.csv
it-it-tweets-2.csv: shape = (282, 5)
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/Italy/it-it-tweets-3.csv
it-it-tweets-3.csv: shape = (250, 5)
len(tw_text): 692

Fetching it-it News Headlines
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/it-it-news-1.csv
it-it-news-1.csv: shape = (86, 7)
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/it-it-news-2.csv
it-it-news-2.csv: shape = (86, 7)
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/it-it-news-3.csv
it-it-news-3.csv: shape = (77, 7)
le

In [0]:
search_combined_it_it.to_csv(
    os.path.join(DRIVE_LOCATION, "it-it", "Data-Collection-{}-v1.csv".format(
        today_date
    )), 
    index=False)

In [0]:
tweets_files_it = {"type": "it-it-tweets", 
                "path": os.path.join(tw_output_directory, "Italy")}
news_files_it = {"type": "it-it-news", 
                "path": nh_output_directory}
offline_combined_it_it = combine_data_offline(tweets_files_it, news_files_it)


final prev_data = (0, 0)

Fetching it-it Tweets
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/Italy/it-it-tweets-1.csv
it-it-tweets-1.csv: shape = (160, 5)
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/Italy/it-it-tweets-2.csv
it-it-tweets-2.csv: shape = (282, 5)
len(tw_text): 442

Fetching it-it News Headlines
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/it-it-news-1.csv
it-it-news-1.csv: shape = (86, 7)
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/it-it-news-2.csv
it-it-news-2.csv: shape = (86, 7)
len(nh_text): 172
len(base_): 614


Unnamed: 0,data
0,How will an early election stop us crashing ou...
1,"People voted just a year and a half ago, there..."
2,Elezioni anticipate I saw this on the BBC and ...
3,Appello I saw this on the BBC and thought you ...
4,Still waiting for a witness to state unequivoc...
...,...
569,http://www.ansa.it/sito/notizie/economia/2019/...
584,http://www.ansa.it/sito/notizie/topnews/2019/1...
608,http://www.ansa.it/sito/notizie/topnews/2019/1...
610,http://www.ansa.it/sito/notizie/topnews/2019/1...


In [0]:
offline_combined_it_it.to_csv(
    os.path.join(DRIVE_LOCATION, "it-it", "Data-Collection-{}.csv".format(
        today_date
    )), 
    index=False)

In [0]:
tweets_files_en = {"type": "en-it-tweets", 
                "path": tw_output_directory}
news_files_en = {"type": "en-it-news", 
                "path": nh_output_directory}
offline_combined_en_it = combine_data_offline(tweets_files_en, news_files_en)
offline_combined_en_it

final prev_data = (0, 0)

Fetching it-it Tweets
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/en-it-tweets-1.csv
en-it-tweets-1.csv: shape = (160, 5)
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/en-it-tweets-2.csv
en-it-tweets-2.csv: shape = (114, 5)
reading /gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/en-it-tweets-3.csv
en-it-tweets-3.csv: shape = (163, 5)
len(tw_text): 437

Fetching it-it News Headlines
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/en-it-news-1.csv
en-it-news-1.csv: shape = (220, 7)
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/en-it-news-2.csv
en-it-news-2.csv: shape = (226, 7)
reading /gdrive/My Drive/Political Risk Project/Test Data/News Results/en-it-news-3.csv
en-it-news-3.csv: shape = (229, 7)
len(nh_text): 675
len(base_): 1112


Unnamed: 0,data
0,How will an early election stop us crashing ou...
1,"People voted just a year and a half ago, there..."
2,Elezioni anticipate I saw this on the BBC and ...
3,Appello I saw this on the BBC and thought you ...
4,Still waiting for a witness to state unequivoc...
...,...
1097,https://www.reuters.com/article/us-yemen-secur...
1099,https://www.reuters.com/article/us-spain-polit...
1102,https://www.reuters.com/article/us-yemen-secur...
1105,https://www.reuters.com/article/us-ethiopia-po...


In [0]:
offline_combined_en_it.to_csv(
    os.path.join(DRIVE_LOCATION, "en-it", "Data-Collection-{}-v2.csv".format(
        today().split("_")[0]
    )), 
    index=False)

In [0]:
# prev_files = dict()
# prev_files.extend(os.listdir(nh_output_directory))

def get_prev_files():
    dir_col_map = {tw_output_directory: "text",
                   nh_output_directory: "url",
                   search_results: "link"
                   }
    prev_files = dict()

    def extract_csv_files(directory, source):
        # files_ = list()
        if os.path.isdir(directory):
            for part in os.listdir(directory):
                # print(part)
                if os.path.isdir(os.path.join(directory, part)):
                    extract_csv_files(os.path.join(directory, part), source)
                else:
                    if re.search(".*\.csv$", part) and \
                    part not in prev_files.keys():
                        prev_files[os.path.join(directory, part)] = source
        elif os.path.isfile(directory):
            if re.search(".*\.csv$", directory) \
            and directory not in prev_files.keys():
                prev_files[directory] = source
    
    for directory, column in dir_col_map.items():
        extract_csv_files(directory, column)
    
    return prev_files


get_prev_files()
# extract_csv_files(tw_output_directory, "text")
# extract_csv_files(nh_output_directory, "url")
# extract_csv_files(search_results, "link")
# print(prev_files)
# combine_data(prev_files)

{'/gdrive/My Drive/Firebolt/Political Risk/new_outputs.csv': 'link',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/News Headlines Predictions/newsheadlines_predictions_2019-10-24_05-40.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/all_results.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/all_results_new.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/en-it-news-1.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/everything_on_early_election.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/everything_on_government_collapse.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/everything_on_political_instability.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/everything_on_snap_election.csv': 'url',
 '/gdrive/My Drive/Political Risk Project/Test Data/News Results/it-i

### Combine Offline Data

In [0]:
# text_data = combine_data_offline(prev_files)
# collection = pd.DataFrame(text_data)
# collection["rating"] = ""

Unnamed: 0,data
0,Where now for Brexit Johnson's WAB cd pass bu...
1,That's nothing. I got MPs to vote on an early ...
2,"Yes vote workers rights away,then lets have an..."
3,I'm sure that I speak for a large number of Ca...
4,If parliament does pass a Brexit deal this wee...
...,...
3183,https://www.reuters.com/article/us-mideast-cri...
3184,https://www.reuters.com/article/us-north-maced...
3185,https://www.reuters.com/article/us-germany-pol...
3186,https://www.reuters.com/article/us-internet-ta...


In [0]:
collection.drop_duplicates(subset=["data"], inplace=True)

In [0]:
collection.to_csv(os.path.join(
    DRIVE_LOCATION, "DataCollection_{}.csv".format(today())), 
                  index=False)

## Redundant cells:

In [0]:
# tweets_data = pd.DataFrame(tweets)
# tweets_data
# print(tweets_data.loc[0]["text"])

# all_texts = [{"all_text": structure_text(t.all_text), 
#               "created_at": extract_datetime_string(t.created_at_datetime)}
#               for t in tweets]
# all_texts
# for tweet in tweets:
#     print(dir(tweet))

In [0]:
# str_ = ["Beyonc√© necesita ver esto. Que diosa @TiniStoessel üî•üî•üî• https://t.co/gadVJbehQZ", 
#         "When Beyonc√© adopts a dog üôåüèæ https://t.co/U571HyLG4F",
#         "Yup!!!! ‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏è #davechappelle https://t.co/ybSGNrQpYF"]

# c = [clean_str for clean_str in map(remove_emojis, str_)]
# s1 = "Beyonc√© necesita ver esto. Que diosa"
# s1 = "Yup!!!! ‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏è‚ù§Ô∏è #davechappelle #love https://t.co/ybSGNrQpYF"
# s1 += "@davarch1 @PKBook22 @davarch1 @TheStephenRalph https://t.co/gadVJbehQZ https://t.co/gadVJbehQZ https://t.co/U571HyLG4F"

In [0]:
# # detect_language(" ".join(['Un', 'voto', 'importante', 'con', 'incognite']))
# s1 = "Un voto importante con incognite di coalizione I saw this on the BBC and thought you should see it: votes in snap election after video sting scandal"
# # translate_keyword(" ".join(['Un', 'voto', 'importante', 'con', 'incognite']), "en")
# s1_arr = s1.split(" ")
# non_english = dict()
# for index, token in enumerate(s1_arr):
#     if detect_language(token) != "en":
#         non_english.update({index: token})

# # remaining_eng = s1_arr[list(non_english.keys())[-1]+1:]
# non_eng_text = " ".join(list(non_english.values()))
# translated_non_eng = translate_keyword(non_eng_text)
# s1t = s1.replace(non_eng_text, translated_non_eng)
# # list(non_english.keys())
# italian_tweets = pd.DataFrame(all_texts)
# italian_tweets
# # italian_tweets.to_csv(os.path.join(output_directory, 
# #                                    "italian_tweets_{}.csv".format(today)),
# #                       index=False)

In [0]:
# text_ = """This piece of text is taken from http://ansa.it. Una lunga maratona notturna sblocca l'impasse del governo sulla legge di bilancio e il decreto fiscale. Arriva il via libera salvo intese. Alle cinque del mattino, dopo un Consiglio dei ministri di quasi sei ore, il premier Giuseppe Conte e il ministro dell'Economia Roberto Gualtieri si mostrano stanchi ma soddisfatti: arriva una manovra da circa 30 miliardi, con lo stop all'aumento dell'Iva, tre miliardi per tagliare le tasse ai lavoratori, 600 milioni per la famiglia, la fine del superticket da settembre 2020 e il piano di lotta all'evasione Italia cashless voluto da Conte."""
# sample_text = re.split(r'\.\s+', text_)

# clean_tweets = [row for row in map(structure_text, sample_text)]
# # for tweet_row in clean_tweets:
# #     ix = clean_tweets.index(tweet_row)
# #     text = tweet_row["text"]
# #     lang = detect_language(text)
# #     if lang != "en":
# #         translated_text = unescape(
# #             translate_keyword(text, "en"))
# #         clean_tweets[ix]["text"] = translated_text
# pd.DataFrame(clean_tweets, index=None)

In [0]:

# # query = '("early election" OR "snap election OR government collapse") {}'.format(query_filter)
# query_2 = '("government collapse") {}'.format(query_filter)
# # query_url = parse.quote(query)

In [0]:
# def structure_text(entire_str):
#     separations = dict()
#     clean_str = remove_emojis(entire_str)
#     tokens, links, handles, tags = extract_components(clean_str)
#     clean_str = (" ".join(tokens)).strip()
#     print("clean_str = {}".format(clean_str))
#     cleaner_str = (re.subn("[:-]+", "", clean_str))[0].strip()
#     cleaner_str_split = cleaner_str.split(" ")
#     non_english = dict()
#     try:
#         for index, token in enumerate(cleaner_str_split):
#             if detect_language(token) != "en":
#                 non_english.update({index: token})
#         if non_english:
#             print("non_english: {}".format(non_english))
#             non_eng_text = " ".join(list(non_english.values()))
#             translated_non_eng = unescape(translate_keyword(non_eng_text))
#             cleaner_str = cleaner_str.replace(non_eng_text, translated_non_eng)
#     except Exception as e:
#         print("Exception occurred: {}".format(e))
#         cleaner_str = ""
#     print("cleaner_str = {}".format(cleaner_str))
#     separations['text'] = cleaner_str
#     separations['links'] = ",".join(links)
#     separations['handles'] = ",".join(handles)
#     separations['tags'] = ",".join(tags)
#     return separations

In [0]:
# def process_tweets(tweets_list):
#     tweets_collection = pd.DataFrame()
#     for tweets in tweets_list:
#         for tweet in tweets:
#             structured_tweets = structure_text(tweet.all_text)
#             structured_tweets.update({"created_time":tweet.created_at_datetime})
#             reqd_df = pd.DataFrame([structured_tweets])
#             # reqd_df.drop_duplicates(inplace=True)
#             # reqd_df["text"] = reqd_df["text"].apply(
#             #     lambda s: re.subn('(\s)', ' ', s)[0])
#             # reqd_df["text"] = reqd_df["text"].apply(structure_text)
#             # match_str = '({})'.format(query)
#             if tweets_collection.empty:
#                 tweets_collection = reqd_df
#             else:
#                 tweets_collection = tweets_collection.append(reqd_df)
#     # tweets_collection.drop_duplicates(inplace=True)
#     return tweets_collection

In [0]:
# rule = gen_rule_payload(query, results_per_call=100)
# r = gen_rule_payload(query_2, results_per_call=100)

In [0]:
# tweets_2 = collect_results(r,
#                          max_results=500,
#                          result_stream_args=premium_search_args)

In [0]:
# len(tweets_2)

In [0]:
# [print(tweet.all_text, end='\n\n') for tweet in tweets_2[0:10]]

In [0]:
# tweets_data = pd.DataFrame(tweets)

In [0]:
# type(tweets[3]["quoted_status"]) == type({})
# tweets_data_2

In [0]:
# nested_columns = []
# for tweet in tweets[:2]:
#     print("tweet {}".format(tweets.index(tweet)))
#     for col in tweet.keys():
        
#         if tweet[col] and (type(tweet[col]) == type({}) or type(tweet[col]) == type([])):
#             print("\tcolumn: {}\tvalue: {}".format(col, tweet[col]))

In [0]:
# # all_text = tweets_data["text"].drop_duplicates()
# all_text = pd.Series([tweet.all_text for tweet in tweets])
# all_text.drop_duplicates(inplace=True)

In [0]:
# # import codecs
# # codecs.getdecoder("unicode_escape")(all_text.loc[8])[0]
# all_text_2

In [0]:
# all_text_refined = all_text.apply(lambda s: re.subn('(\s)', ' ', s)[0])

In [0]:
# all_text_refined.loc[454]

In [0]:
# match_str = '(government collapse)'
# reqd_df = all_text_refined[all_text_refined.str.contains(match_str)]
# # reqd_df.drop_duplicates(subset=['headline_text'], inplace=True)

In [0]:
# reqd_df_2

In [0]:
# def combine_data_offline(tweets_files, news_files, prev_files_map=None):
#     processed_headlines = processed_tweets = prev_data_df = pd.DataFrame()
    
#     if prev_files_map:
#         for filename, column in prev_files_map.items():
#             file_df = get_previous_data(filename, column)
#             if prev_data_df.empty:
#                 prev_data_df = file_df
#             else:
#                 prev_data_df = prev_data_df.append(file_df)
#         prev_data_df = prev_data_df.reset_index().drop(columns=["index"])
#         prev_data_df.rename(columns={prev_data_df.columns[0]: "text"}, 
#                             inplace=True)
#         prev_data_df.drop_duplicates(inplace=True)
#     processed_tweets = pd.read_csv(os.path.join(tw_output_directory,
#                                                 "tweets_2019-11-11_11-41.csv"),
#                                    encoding="UTF-8")
    
#     tw_text = list(processed_tweets["text"])
#     processed_headlines = pd.read_csv(os.path.join(nh_output_directory, 
#                                           "news_2019-11-11_11-41.csv"),
#                              encoding="UTF-8")
#     nh_text = list(processed_headlines["url"])
#     tw_text.extend(nh_text)
#     if not prev_data_df.empty:
#         base_ = list(prev_data_df["text"])
#         base_.extend(tw_text)
#     else:
#         base_ = tw_text
#     final_df = pd.DataFrame(base_, columns=["data"])
#     return final_df