<a href="https://colab.research.google.com/github/nishantparmar24/Political-Risk/blob/master/TwitterSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis - Twitter 

### Install required packages

In [1]:
!pip install searchtweets unidecode

Collecting searchtweets
  Downloading https://files.pythonhosted.org/packages/51/d7/7dd296ba9469e046bad23583aaa0d36b18c7d6e4df9fd2acfb433d1c7ee2/searchtweets-1.7.4-py3-none-any.whl
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 6.0MB/s 
[?25hCollecting tweet-parser
  Downloading https://files.pythonhosted.org/packages/4b/ea/cb82efb43dbcb115ea447313fbd287ff66349b34bdfdb4a78e25d3a42cb0/tweet_parser-1.13.2-py3-none-any.whl
Installing collected packages: tweet-parser, searchtweets, unidecode
Successfully installed searchtweets-1.7.4 tweet-parser-1.13.2 unidecode-1.1.1


### Import all required packages

In [0]:
from datetime import datetime as dt
from google.cloud import translate
from google.colab import drive
from html import unescape
from numpy import nan
from searchtweets import load_credentials, gen_rule_payload, collect_results
from time import sleep
from unidecode import unidecode
from urllib import parse
import json
import os
import pandas as pd
import re
import requests
import unicodedata

### Enable Google Drive in this notebook:

In [3]:
# Mount Google Drive at /gdrive; the paths used throughout this notebook live under it.
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
# Drive folder that receives the CSV results (one sub-folder per category).
output_directory = '/gdrive/My Drive/Political Risk Project/Test Data/Twitter Results/'
# JSON file read and written by the API usage monitor functions.
API_usage_file = os.path.join(output_directory, 'api_usage.json')

### Fetch Google Translate API credentials

In [0]:
# Point the Google Cloud client library at the service-account key stored on Drive.
translate_api_dir = "/gdrive/My Drive/Firebolt/API Keys"
translate_api_key = os.path.join(translate_api_dir, 'eiu-searchautomation.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = translate_api_key

try:
    translate_client = translate.Client()
except Exception as e:
    # Keep the notebook running when the client cannot be created; the helpers
    # below raise a descriptive error if they are called without a client.
    translate_client = None
    print("Exception occurred while instantiating "
          "Google Cloud Translate: {}".format(e))


def translate_keyword(keyword, target_lang="en"):
    """Translate ``keyword`` into ``target_lang`` with Google Cloud Translate.

    Args:
        keyword: Text to translate.
        target_lang: Language code of the desired translation.

    Returns:
        The ``"translatedText"`` entry of the API response, or ``""`` when the
        response is empty.

    Raises:
        RuntimeError: If the translate client could not be instantiated.
    """
    if translate_client is None:
        raise RuntimeError("Google Cloud Translate client is not available")
    response = translate_client.translate(keyword,
                                          target_language=target_lang)
    if response:
        return response["translatedText"]
    return ""


def detect_language(text_string):
    """Return the ``"language"`` entry reported by the API for ``text_string``.

    Raises:
        RuntimeError: If the translate client could not be instantiated.
    """
    if translate_client is None:
        raise RuntimeError("Google Cloud Translate client is not available")
    result = translate_client.detect_language(text_string)
    return result["language"]


### Get the shortest text from a collection of text values

In [0]:
def phrase_min_length(text_sample):
    """Return the shortest text in ``text_sample``.

    The original implementation computed the shortest entries but never
    returned them, so every call produced ``None``.

    Args:
        text_sample: Iterable of strings.

    Returns:
        The first string with the minimum length, or ``None`` when
        ``text_sample`` is empty.
    """
    texts = list(text_sample)
    if not texts:
        return None
    # min() keeps the first candidate on ties, so the result is deterministic.
    return min(texts, key=len)

### Function to remove emojis from a text string

In [0]:
def remove_emojis(input_string):
    """Return ``input_string`` with every non-ASCII character transliterated.

    ASCII characters are kept unchanged. Each non-ASCII character (emojis, but
    also accented letters) is passed through ``unidecode``; the replacement is
    kept unless it is empty or the ``'[?]'`` placeholder. Surrounding
    whitespace is stripped from both the input and the result.
    """
    pieces = []
    for char in input_string.strip():
        if ord(char) < 128:
            # Plain ASCII: nothing to transliterate.
            pieces.append(char)
            continue
        transliterated = unidecode(str(char))
        if transliterated not in ('', '[?]'):
            pieces.append(transliterated)
    return "".join(pieces).strip()

### Function to extract (and segregate) tweet-components: links, user-handles and hashtags

In [0]:
def extract_components(clean_string):
    """Split a tweet into plain words, links, user handles and hashtags.

    The text is split on whitespace. A token starting with ``http://`` or
    ``https://`` is a link, one starting with ``@`` is a handle and one
    starting with ``#`` is a hashtag; everything else is a plain word.

    Args:
        clean_string: Tweet text.

    Returns:
        A tuple ``(tokens, urls, handles, hashtags)`` of lists. Plain tokens
        keep their duplicates; the other three lists contain each distinct
        value once, in order of first appearance.
    """
    tokens = []
    url_collection = []
    handle_collection = []
    trend_collection = []
    for token in re.split(r"\s+", clean_string.strip()):
        # Prefix checks replace the former anchored regexes, whose non-raw
        # literals contained invalid escape sequences ('\/' and '\@').
        if token.startswith(("http://", "https://")):
            bucket = url_collection
        elif token.startswith("@"):
            bucket = handle_collection
        elif token.startswith("#"):
            bucket = trend_collection
        else:
            # Splitting an empty string yields a single '' token; skip it.
            if token.strip():
                tokens.append(token)
            continue
        if token not in bucket:
            bucket.append(token)
    return (tokens, url_collection, handle_collection, trend_collection)


### Function to clean and structure the text data

In [0]:
def structure_text(entire_str):
    """Clean a tweet, translate its text and separate its components.

    Steps: transliterate non-ASCII characters, pull out links, handles and
    hashtags, remove runs of ``:`` and ``-`` from the remaining words, then
    translate the result with ``translate_keyword`` and HTML-unescape it.
    Translation is retried once after a 5 second pause; if the retry also
    fails, the text is set to ``""``.

    Args:
        entire_str: Raw tweet text.

    Returns:
        A dict with the keys ``'text'``, ``'links'``, ``'handles'`` and
        ``'tags'``; the last three are comma-joined strings.
    """
    clean_str = remove_emojis(entire_str)
    tokens, links, handles, tags = extract_components(clean_str)
    clean_str = " ".join(tokens).strip()
    print("clean_str = {}".format(clean_str))
    cleaner_str = re.sub("[:-]+", "", clean_str).strip()
    try:
        cleaner_str = unescape(translate_keyword(cleaner_str))
    except Exception as e:
        print("Exception occurred: {}".format(e))
        sleep(5)
        try:
            cleaner_str = unescape(translate_keyword(cleaner_str))
        except Exception as retry_error:
            # Report the second failure as well instead of dropping it
            # silently; the tweet text is still blanked (best effort).
            print("Exception occurred: {}".format(retry_error))
            cleaner_str = ""
    print("cleaner_str = {}".format(cleaner_str))
    return {
        'text': cleaner_str,
        'links': ",".join(links),
        'handles': ",".join(handles),
        'tags': ",".join(tags),
    }

### Function to zero-pad date values less than 10

In [0]:
def format_date_str(s):
    """Return ``s`` as a decimal string zero-padded to at least two digits.

    Args:
        s: An integer, or a string that ``int()`` accepts.

    Returns:
        ``"05"`` for ``5``, ``"12"`` for ``12``. An already padded string such
        as ``"07"`` stays ``"07"`` (the previous lambda produced ``"007"``).
    """
    return "{:02d}".format(int(s))

### Function to format datetime as string (time separated with a ":")

In [0]:
def extract_datetime_string(datetime_obj):
    """Format ``datetime_obj`` as ``YYYY-MM-DD HH:MM:SS``.

    Month, day, hour, minute and second are zero-padded through
    ``format_date_str``; the year is written as-is.

    Returns:
        The formatted string, or ``None`` when ``datetime_obj`` is falsy.
    """
    if not datetime_obj:
        return None
    fields = (datetime_obj.month, datetime_obj.day, datetime_obj.hour,
              datetime_obj.minute, datetime_obj.second)
    month, day, hour, minute, second = [format_date_str(f) for f in fields]
    return "{}-{}-{} {}:{}:{}".format(datetime_obj.year, month, day,
                                      hour, minute, second)

### Function to return a formatting of current datetime (for file names)

In [0]:
def today():
    """Return the current local date and time as ``YYYY-MM-DD_HH-MM``.

    The clock is read once so that every field comes from the same instant;
    reading it separately per field could mix values from either side of a
    minute, day or month boundary.
    """
    now = dt.now()
    return "{}-{:02d}-{:02d}_{:02d}-{:02d}".format(
        now.year, now.month, now.day, now.hour, now.minute)


### Functions to create and validate custom dates

In [0]:
def get_month_days_map(year, month=None):
    """Return the number of days in a month, or the whole month-length table.

    Args:
        year: Calendar year; decides whether February has 28 or 29 days using
            the full Gregorian rule (century years are leap years only when
            divisible by 400).
        month: Month number. ``0`` is treated as December (the month before
            January). ``None`` requests the full table.

    Returns:
        The day count for ``month`` when it is a known month number; otherwise
        a dict mapping tuples of month numbers to their day count.
    """
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    month_days_map = {
        (1, 3, 5, 7, 8, 10, 12): 31,
        (4, 6, 9, 11): 30,
        (2,): 29 if is_leap else 28,
    }
    month = 12 if month == 0 else month
    if month:
        for months, n_days in month_days_map.items():
            if month in months:
                return n_days
    return month_days_map

In [0]:
def validate_custom_date(year, month, day):
    """Check that a date exists on the calendar and is not in the future.

    The entered date is printed first, followed by a message naming the
    offending field when validation fails.

    Args:
        year: Calendar year.
        month: Month number, 1-12.
        day: Day of the month.

    Returns:
        ``True`` when the date is valid and not later than today, else
        ``False``.
    """
    print("Date entered: {}-{}-{}".format(year,
                                          format_date_str(month),
                                          format_date_str(day)))
    if month not in range(1, 13):
        print("Invalid month entered!")
        return False
    n_days = get_month_days_map(year, month=month)
    if day not in range(1, n_days + 1):
        print("Invalid day entered!")
        return False
    # Read the clock once so all comparisons refer to the same instant.
    now = dt.now()
    if year > now.year:
        if month > now.month and day > now.day:
            print("Wrong date entered! Please enter correct date values.")
        else:
            print("Invalid year entered!")
        return False
    if year == now.year:
        if month > now.month:
            print("Invalid month entered!")
            return False
        # The day only matters within the current month; comparing it for
        # earlier months wrongly rejected past dates such as the 29th of last
        # month when today is the 23rd.
        if month == now.month and day > now.day:
            print("Invalid day entered!")
            return False
    return True

In [0]:
def get_custom_date(year=None, month=None, day=None):
    """Return the date 30 days before the given date as ``YYYY-MM-DD``.

    Args:
        year, month, day: The reference date. Each field that is ``None``
            defaults to today's value, resolved at call time (the previous
            defaults were frozen when the function was defined).

    Returns:
        The formatted date 30 days earlier, or ``""`` when
        ``validate_custom_date`` rejects the reference date.
    """
    from datetime import date, timedelta

    now = dt.now()
    year = now.year if year is None else year
    month = now.month if month is None else month
    day = now.day if day is None else day

    if not validate_custom_date(year, month, day):
        return ""
    # Calendar arithmetic replaces the hand-rolled modulo logic, which gave
    # wrong results whenever the 30-day window crossed February (for example
    # 2019-03-02 produced "2019-02-00").
    from_date = date(year, month, day) - timedelta(days=30)
    print("from_day = {}".format(from_date.day))
    return "{}-{:02d}-{:02d}".format(from_date.year,
                                     from_date.month,
                                     from_date.day)

# get_custom_date(year=2017, month=11, day=15)

### Fetch Twitter Premium Search API credentials

In [13]:
# YAML file on Drive that holds the Twitter API credentials.
creds_file = '/gdrive/My Drive/Political Risk Project/TwitterCredentials/twitter_creds.yaml'
# Key of the entry inside the YAML file to load.
search_tweets_api = 'search_tweets_30_day_dev'
# NOTE(review): env_overwrite=False presumably stops environment variables from
# overriding the file values - confirm against the searchtweets documentation.
premium_search_args = load_credentials(filename=creds_file,
                                       yaml_key=search_tweets_api,
                                       env_overwrite=False)

Grabbing bearer token from OAUTH


### Defining all the filters, queries and user handles

In [0]:
# Filter strings appended to each keyword query by get_tweets().
it_query_filter = "-has:media place_country:IT"
# Not referenced by any other visible cell of this notebook.
it_en_query_filter = "-has:media place_country:IT lang:en"
generic_query_filter = "-has:media"
# Keywords searched for; each one becomes a separate API query.
queries = ["early election", "snap election", "government collapse", 
           "government coalition", "election", "instability", "uncertainty",
           "crisis", "coalition"]
# Twitter user names used to build the "from:<user>" queries.
from_users = ["lorepregliasco", "FerdiGiugliano", "AlbertoNardelli", 
              "gavinjones10"]

### API Usage Monitor

In [0]:
def get_api_usage(date):
    """Read the API usage record and look up the count stored for ``date``.

    The loaded table and its column names are printed as a side effect.

    Args:
        date: Column label to look up (callers pass a ``YYYY-MM`` string).

    Returns:
        ``(usage_frame, count)``. ``count`` is ``0`` when the record has no
        column for ``date``. Both values are ``None`` when the usage file does
        not exist.
    """
    if not os.path.lexists(API_usage_file):
        return None, None
    api_usage = pd.read_json(API_usage_file, orient='records')
    print(api_usage)
    print(list(api_usage.columns))
    usage_for_date = api_usage[date].squeeze() if date in api_usage.columns else 0
    return api_usage, usage_for_date

def update_api_count(date, count_add):
    """Add ``count_add`` to the usage stored for ``date`` and persist it.

    After writing the usage file the record is read back; if the stored value
    does not match, Google Drive is remounted and the record is read again.

    Args:
        date: Column label of the usage record (callers pass ``YYYY-MM``).
        count_add: Number of API results to add to the stored count.
    """
    api_usage, _ = get_api_usage(date)
    if api_usage is None or api_usage.empty:
        # get_api_usage returns None when no usage file exists; start a new
        # record instead of failing on the missing frame.
        print("No API usage recorded yet; starting a new record\n")
        api_usage = pd.DataFrame({date: [count_add]})
    elif date in api_usage.columns:
        print("Month present\n")
        api_usage[date] += count_add
    else:
        # A new month starts at the number of results just fetched (the
        # previous code stored a constant 1 here).
        print("Month absent\n")
        api_usage[date] = count_add
    updated_count = api_usage[date].squeeze()
    print("api_usage = {}".format(api_usage))
    api_usage.to_json(API_usage_file, orient='records', date_format='iso')
    # NOTE(review): presumably gives Drive time to sync the write - confirm.
    sleep(3)
    print("Updating new API usage...")
    _, new_count = get_api_usage(date)
    if new_count != updated_count:
        print("...")
        drive.mount("/gdrive", force_remount=True)
        _, new_count = get_api_usage(date)
    print("API usage count updated! Current usage: {}".format(new_count))


In [0]:
# with open(API_usage_file) as api_file:
#     api_usage = json.load(api_file)
# usage_df = pd.DataFrame([api_usage])
# curr_month = "{}-{}".format(dt.now().year, format_date_str(dt.now().month))
# get_api_usage(curr_month)
# update_api_count(date=curr_month, count_add=0)

### Functions to fetch tweets by querying the Twitter API with the given queries and filters

In [0]:
def get_user_queries_filter(queries, from_users):
    """Build one search rule per (keyword, user) pair.

    Rules are ordered by keyword first, then by user, and have the form
    ``("<keyword>") from:<user> -has:media``.
    """
    template = '("{}") from:{} -has:media'
    return [template.format(query, user)
            for query in queries
            for user in from_users]


def get_user_specific_filter(from_users):
    """Build one ``from:<user> -has:media`` search rule per user, in order."""
    template = 'from:{} -has:media'
    return [template.format(user) for user in from_users]


def get_tweets(query_set, query_filter=None):
    """Run one search per entry of ``query_set`` and collect the results.

    Before each search the recorded API usage for the current month is
    checked; collection stops once it has reached the limit. After each search
    the number of returned tweets is added to the usage record.

    Args:
        query_set: Iterable of keywords (or complete rules when
            ``query_filter`` is not given).
        query_filter: Optional filter string; when given, each query is sent
            as ``("<query>") <query_filter>``.

    Returns:
        A list with one entry per executed query, each entry being the result
        of ``collect_results`` for that query.
    """
    usage_limit = 24999
    tweets_list = list()
    for query in query_set:
        # Read the clock once so year and month belong to the same instant.
        now = dt.now()
        curr_month = "{}-{}".format(now.year, format_date_str(now.month))
        _, curr_usage = get_api_usage(curr_month)
        # get_api_usage yields None when no usage file exists yet; treat that
        # as "nothing used" instead of comparing None with a number.
        if curr_usage is not None and curr_usage >= usage_limit:
            print("Twitter API limit is about to exceed! Returning now ...\n")
            break
        if query_filter:
            q = '("{}") {}'.format(query, query_filter)
        else:
            q = "{}".format(query)
            print("No filter/Filter in query_set: {}".format(q))
        print("Collecting for {}".format(q))
        rule = gen_rule_payload(q, results_per_call=100)
        tweets = collect_results(rule,
                                 max_results=100,
                                 result_stream_args=premium_search_args)
        print(len(tweets))
        update_api_count(curr_month, len(tweets))
        tweets_list.append(tweets)
    return tweets_list


In [0]:
# One "from:<user> -has:media" rule per account in from_users; displayed below.
user_specific_queries = get_user_specific_filter(from_users)
user_specific_queries

['from:lorepregliasco -has:media',
 'from:FerdiGiugliano -has:media',
 'from:AlbertoNardelli -has:media',
 'from:gavinjones10 -has:media']

### Process the tweets to produce a collection of tweet-text, hashtags, links, and the date-time when they were created.

In [0]:
def process_tweets(tweets_list):
    """Turn fetched tweets into a table of cleaned text and components.

    Each tweet's ``all_text`` is run through ``structure_text`` and its
    ``created_at_datetime`` is stored under ``created_time``.

    Args:
        tweets_list: Iterable of tweet collections, one per query.

    Returns:
        A DataFrame with one row per tweet, or an empty DataFrame when there
        are no tweets. Rows are numbered 0..n-1; the previous row-by-row
        ``DataFrame.append`` (quadratic, and removed in pandas 2.0) left every
        row with index 0.
    """
    records = []
    for tweets in tweets_list:
        for tweet in tweets:
            record = structure_text(tweet.all_text)
            record["created_time"] = tweet.created_at_datetime
            records.append(record)
    return pd.DataFrame(records)

### Save the tweet information to the output folder of a category (General or Italy)

In [0]:
def save_output(tweets_collection,
                curr_datetime,
                category="General",
                subset="predictions"):
    """Write ``tweets_collection`` as a CSV file into a category folder.

    The file is named ``relevant_<subset>_<curr_datetime>.csv`` and placed in
    ``<output_directory>/<category>``. Only the categories ``"General"`` and
    ``"Italy"`` are accepted; for any other value a message is printed and
    nothing is written.
    """
    if category not in ("General", "Italy"):
        print("Output Directory is not valid!")
        return
    file_name = 'relevant_{}_{}.csv'.format(subset, curr_datetime)
    destination = os.path.join(output_directory, category, file_name)
    tweets_collection.to_csv(destination, index=False)

## Deployed model

### Endpoint URL to query the model in real time

In [0]:
# URL of the deployed sentiment classifier. The name is spelled "enpoint_uri"
# (sic); get_polarity() refers to it by this exact name. Note the plain-HTTP scheme.
enpoint_uri = "http://ac6a2064dee3c11e99ced0a13821e56d-733867741.ap-southeast-1.elb.amazonaws.com/sentiment/classifier"
# Request headers: the payload is sent as JSON.
headers = {"content-type": "application/json"}

### Get polarity of a piece of text

In [0]:
def get_polarity(sentence):
    """Ask the deployed classifier for the polarity of ``sentence``.

    The text is posted as ``{"polarity": <sentence>}``. The function is used
    with ``Series.apply``, so every failure is reported and mapped to ``""``
    rather than aborting the whole column.

    Args:
        sentence: Text to classify.

    Returns:
        The decoded JSON body of a 200 response; ``""`` for any other status,
        a network error, a timeout, or a body that is not valid JSON.
    """
    payload = json.dumps({"polarity": sentence})
    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.post(enpoint_uri, data=payload, headers=headers,
                                 timeout=30)
    except requests.RequestException as e:
        print("Polarity request failed: {}".format(e))
        return ""
    if response.status_code != 200:
        return ""
    try:
        return response.json()
    except ValueError as e:
        print("Polarity response is not valid JSON: {}".format(e))
        return ""

## Get and Process Tweets from Italy

### Get tweets specific to the country of origin

In [0]:
# One API query per keyword in `queries`, each combined with `it_query_filter`.
italy_tweets = get_tweets(queries, it_query_filter)

### Process the tweets from Italy

In [0]:
# Clean, translate and tabulate the fetched tweets; structure_text prints each text before and after.
italy_tweets_collection = process_tweets(italy_tweets)

clean_str = Elezioni anticipate I saw this on the BBC and thought you should see it: North Macedonia calls snap election after EU talks setback -
cleaner_str = Elezioni anticipate I saw this on the BBC and thought you should see it North Macedonia calls snap election after EU talks setback
clean_str = Un voto importante con incognite di coalizione I saw this on the BBC and thought you should see it: votes in snap election after video sting scandal -
cleaner_str = Un voto importante con incognite di coalizione I saw this on the BBC and thought you should see it votes in snap election after video sting scandal
clean_str = I'd like to know how he's stumbled upon a magic money tree right before a snap election. Also is austerity suddenly over?
cleaner_str = I'd like to know how he's stumbled upon a magic money tree right before a snap election. Also is austerity suddenly over?


### Clean the DataFrame of tweets from Italy

In [0]:
# Renumber the rows, then discard the old index that reset_index() moved into an 'index' column.
italy_tweets_collection = italy_tweets_collection.reset_index().drop(columns=['index'])
# Duplicates are detected on every column except created_time.
duplicacy_subset = list(set(italy_tweets_collection.columns) - set(["created_time"]))
# Removes the duplicate rows from the frame in place.
italy_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
italy_tweets_collection

Unnamed: 0,created_time,handles,links,tags,text
0,2019-10-19 22:24:18,,https://t.co/yr67geg95b,,Elezioni anticipate I saw this on the BBC and ...
1,2019-09-29 07:39:34,,https://t.co/kkhsWN5HHV,#Austria,Un voto importante con incognite di coalizione...
2,2019-09-29 07:25:09,@WoodleyNLP,,,I'd like to know how he's stumbled upon a magi...


### Attach predictions of the tweets into the DataFrame

In [0]:
# One request to the sentiment endpoint per row; the answer is stored in a new column.
italy_tweets_collection["polarity_v1"] = italy_tweets_collection["text"].apply(get_polarity)

In [0]:
italy_tweets_collection

Unnamed: 0,created_time,handles,links,tags,text,polarity_v1
0,2019-10-19 22:24:18,,https://t.co/yr67geg95b,,Elezioni anticipate I saw this on the BBC and ...,positive
0,2019-09-29 07:39:34,,https://t.co/kkhsWN5HHV,#Austria,Un voto importante con incognite di coalizione...,positive
0,2019-09-29 07:25:09,@WoodleyNLP,,,I'd like to know how he's stumbled upon a magi...,negative


### Save output of the collection of tweets in Google Drive
Folder location: https://drive.google.com/open?id=1iX7i_jarE7hhOVf68p-KzUECrEG59u4T

In [0]:
# Written to <output_directory>/Italy/relevant_predictions_<timestamp>.csv.
save_output(italy_tweets_collection, 
            category="Italy",
            subset="predictions", 
            curr_datetime=today())

## Get and Process Tweets from a set of users

### Get tweets posted by a list of users

In [0]:
# No query_filter is passed, so each entry of user_specific_queries is sent as a complete rule.
# NOTE(review): the saved output below lists keyword+user rules, which does not match
# user_specific_queries as built above - the output looks stale; confirm by re-running.
user_specific_tweets = get_tweets(query_set=user_specific_queries)

No filter/Filter in query_set: ("early election") from:lorepregliasco -has:media
No filter/Filter in query_set: ("early election") from:FerdiGiugliano -has:media
No filter/Filter in query_set: ("early election") from:AlbertoNardelli -has:media
No filter/Filter in query_set: ("early election") from:gavinjones10 -has:media
No filter/Filter in query_set: ("snap election") from:lorepregliasco -has:media
No filter/Filter in query_set: ("snap election") from:FerdiGiugliano -has:media
No filter/Filter in query_set: ("snap election") from:AlbertoNardelli -has:media
No filter/Filter in query_set: ("snap election") from:gavinjones10 -has:media
No filter/Filter in query_set: ("government collapse") from:lorepregliasco -has:media
No filter/Filter in query_set: ("government collapse") from:FerdiGiugliano -has:media
No filter/Filter in query_set: ("government collapse") from:AlbertoNardelli -has:media
No filter/Filter in query_set: ("government collapse") from:gavinjones10 -has:media


### Process the tweets

In [0]:
# Build the table of user tweets and display it as the cell result.
user_tweets_collection = process_tweets(user_specific_tweets)
user_tweets_collection

### Clean the DataFrame of tweets

In [0]:
# Renumber the rows, then discard the old index that reset_index() moved into an 'index' column.
user_tweets_collection = user_tweets_collection.reset_index().drop(columns=['index'])
# Duplicates are detected on every column except created_time.
duplicacy_subset = list(set(user_tweets_collection.columns) - set(["created_time"]))
# Removes the duplicate rows from the frame in place.
user_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
# The dropna() result is only displayed, not assigned: user_tweets_collection
# itself still contains any rows whose text is missing.
user_tweets_collection.dropna(subset=["text"])
# user_tweets_collection.dropna(subset=["text"])

Unnamed: 0,created_time,handles,links,tags,text
0,2019-10-22 08:56:15,"@PaoloZanetto,@lorepregliasco,@Baselice,,@Agen...","https://t.co/8c4rOIxKIl,https://t.co/dhSP6tfxr...",,"Today in Rome, beautiful things! Inside Politi..."
1,2019-10-22 08:50:23,@LeFrecce,,,"9619 train, stopped for half an hour in the tu..."
2,2019-10-22 08:05:33,@LeFrecce,,,"Thank you, all right on board, but the 'delay'..."
3,2019-10-22 07:57:54,@StarWars:,"https://t.co/jjhAMAufLW,https://t.co/MLbzRXrCJ...","#TheRiseofSkywalker,#TheRiseOfSkywalker","The saga will end, the story lives forever. Wa..."
4,2019-10-22 07:55:39,,https://t.co/P26Mx5UtTM,,"Beautiful things, this afternoon in Rome!"
5,2019-10-22 07:46:41,@LeFrecce,,#neolingua,"""Travel time 20 minutes longer than expected""...."
6,2019-10-22 05:38:53,"@chedisagio,@JustinTrudeau,@realDonaldTrump",,,Colleges
7,2019-10-22 05:00:05,,https://t.co/4Orf3CJibI,"#Canada:,#canadavotes2019,#MaratonaYouTrend",Liberals and Democrats (NDP) would currently l...
8,2019-10-22 04:47:43,,,"#Canada:,#canadavotes2019,#MaratonaYouTrend","among the colleges to be assigned, the 2 most ..."
9,2019-10-22 04:44:45,@ElectionsCan_E,,"#Canada,#canadavotes2019,#MaratonaYouTrend",For this it would be the definitive distributi...


## Get generic tweets (relevant to the keywords)

### Get all tweets that contain keywords such as *snap election*, *early election*, *political instability*

In [29]:
# One API query per keyword in `queries`, each combined with `generic_query_filter`.
generic_tweets = get_tweets(queries, generic_query_filter)

Collecting for ("early election") -has:media
Collecting for ("snap election") -has:media
Collecting for ("government collapse") -has:media
Collecting for ("government coalition") -has:media
Collecting for ("election") -has:media
Collecting for ("instability") -has:media
Collecting for ("uncertainty") -has:media
Collecting for ("crisis") -has:media
Collecting for ("coalition") -has:media


### Process all the generic tweets

In [0]:
# Clean, translate and tabulate the fetched tweets; structure_text prints each text before and after.
generic_tweets_collection = process_tweets(generic_tweets)

clean_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
cleaner_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
clean_str = was weighing whether to push for an early or try again to pass his stalled after blocked a fast-track plan to approve his before...
cleaner_str = was weighing whether to push for an early or try again to pass his stalled after blocked a fasttrack plan to approve his before...
clean_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
cleaner_str = Salmond's November rape Trial could be a reason SNP may agree to an early election
clean_str = On this, v interesting piece which says that the public actually want a general election, those who don't are (generally) politicians and political journalists who think the public don't either.
cleaner_str = On this, v interesting piece which says that the public actually want a general election, those who don't 

### Clean the DataFrame of generic tweets

In [0]:
# Renumber the rows, then discard the old index that reset_index() moved into an 'index' column.
generic_tweets_collection = generic_tweets_collection.reset_index().drop(columns=['index'])
# Duplicates are detected on every column except created_time.
duplicacy_subset = list(set(generic_tweets_collection.columns) - set(["created_time"]))
# Removes the duplicate rows from the frame in place.
generic_tweets_collection.drop_duplicates(subset=duplicacy_subset, inplace=True)
generic_tweets_collection

Unnamed: 0,created_time,handles,links,tags,text
0,2019-10-23 16:18:11,,https://t.co/HGxdrMJWy8,,Salmond's November rape Trial could be a reaso...
1,2019-10-23 16:17:06,,https://t.co/ZOXs5XqLNb,"#British,#PrimeMinister,#BorisJohnson,#electio...",was weighing whether to push for an early or t...
3,2019-10-23 16:14:06,,https://t.co/pYf5OrjJ5k,,"On this, v interesting piece which says that t..."
4,2019-10-23 16:13:19,,https://t.co/P8zlqny5Cc,,Boris Johnson mulls early election over Brexit...
6,2019-10-23 16:12:01,"@LoyalDefender2K,@Ali_H_Smile,@SkyNews",,,An early election is important. Let's hope we ...
18,2019-10-23 15:56:28,,https://t.co/ZgKuNTEyCW,,Oh dear. It's not a matter of opinion the PM c...
24,2019-10-23 15:53:03,,,,I would love there to be an election before Xm...
25,2019-10-23 15:51:12,"@PoliticalDataUK,@davidallengreen",,,Not sure. It all depends on how you spin it to...
30,2019-10-23 15:47:37,@ABC,https://t.co/agPUy6B3wm,,UK prime minister mulls early election over Br...
38,2019-10-23 15:34:46,"@leebee999,@BlueArmyFaction","https://t.co/HGxdrMJWy8,https://t.co/jF3yXOpvxH",,Salmond's November rape Trial could be a reaso...


### Save the generic Twitter results to Google Drive

In [0]:
# Default category "General": written to <output_directory>/General/relevant_tweets_<timestamp>.csv.
save_output(generic_tweets_collection, subset="tweets", curr_datetime=today())

### Merge the predictions of tweets in the DataFrame

In [0]:
# One request to the sentiment endpoint per row; the answer is stored in a new column.
generic_tweets_collection["polarity_v1"] = generic_tweets_collection["text"].apply(get_polarity)

### Save the predicted generic tweets in Google Drive
Folder location: https://drive.google.com/open?id=13FRvvDetM4Bcsdw9dblKuEr9Tyz219_A

In [0]:
# Default category "General": written to <output_directory>/General/relevant_predictions_<timestamp>.csv.
save_output(generic_tweets_collection, 
            subset="predictions", 
            curr_datetime=today())

## Redundant cells:

In [0]:
# tweets_data = pd.DataFrame(tweets)
# tweets_data
# print(tweets_data.loc[0]["text"])

# all_texts = [{"all_text": structure_text(t.all_text), 
#               "created_at": extract_datetime_string(t.created_at_datetime)}
#               for t in tweets]
# all_texts
# for tweet in tweets:
#     print(dir(tweet))

In [0]:
# str_ = ["Beyoncé necesita ver esto. Que diosa @TiniStoessel 🔥🔥🔥 https://t.co/gadVJbehQZ", 
#         "When Beyoncé adopts a dog 🙌🏾 https://t.co/U571HyLG4F",
#         "Yup!!!! ❤️❤️❤️❤️ #davechappelle https://t.co/ybSGNrQpYF"]

# c = [clean_str for clean_str in map(remove_emojis, str_)]
# s1 = "Beyoncé necesita ver esto. Que diosa"
# s1 = "Yup!!!! ❤️❤️❤️❤️ #davechappelle #love https://t.co/ybSGNrQpYF"
# s1 += "@davarch1 @PKBook22 @davarch1 @TheStephenRalph https://t.co/gadVJbehQZ https://t.co/gadVJbehQZ https://t.co/U571HyLG4F"

In [0]:
# # detect_language(" ".join(['Un', 'voto', 'importante', 'con', 'incognite']))
# s1 = "Un voto importante con incognite di coalizione I saw this on the BBC and thought you should see it: votes in snap election after video sting scandal"
# # translate_keyword(" ".join(['Un', 'voto', 'importante', 'con', 'incognite']), "en")
# s1_arr = s1.split(" ")
# non_english = dict()
# for index, token in enumerate(s1_arr):
#     if detect_language(token) != "en":
#         non_english.update({index: token})

# # remaining_eng = s1_arr[list(non_english.keys())[-1]+1:]
# non_eng_text = " ".join(list(non_english.values()))
# translated_non_eng = translate_keyword(non_eng_text)
# s1t = s1.replace(non_eng_text, translated_non_eng)
# # list(non_english.keys())
# italian_tweets = pd.DataFrame(all_texts)
# italian_tweets
# # italian_tweets.to_csv(os.path.join(output_directory, 
# #                                    "italian_tweets_{}.csv".format(today)),
# #                       index=False)

In [0]:
# text_ = """This piece of text is taken from http://ansa.it. Una lunga maratona notturna sblocca l'impasse del governo sulla legge di bilancio e il decreto fiscale. Arriva il via libera salvo intese. Alle cinque del mattino, dopo un Consiglio dei ministri di quasi sei ore, il premier Giuseppe Conte e il ministro dell'Economia Roberto Gualtieri si mostrano stanchi ma soddisfatti: arriva una manovra da circa 30 miliardi, con lo stop all'aumento dell'Iva, tre miliardi per tagliare le tasse ai lavoratori, 600 milioni per la famiglia, la fine del superticket da settembre 2020 e il piano di lotta all'evasione Italia cashless voluto da Conte."""
# sample_text = re.split(r'\.\s+', text_)

# clean_tweets = [row for row in map(structure_text, sample_text)]
# # for tweet_row in clean_tweets:
# #     ix = clean_tweets.index(tweet_row)
# #     text = tweet_row["text"]
# #     lang = detect_language(text)
# #     if lang != "en":
# #         translated_text = unescape(
# #             translate_keyword(text, "en"))
# #         clean_tweets[ix]["text"] = translated_text
# pd.DataFrame(clean_tweets, index=None)

In [0]:

# # query = '("early election" OR "snap election OR government collapse") {}'.format(query_filter)
# query_2 = '("government collapse") {}'.format(query_filter)
# # query_url = parse.quote(query)

In [0]:
# def structure_text(entire_str):
#     separations = dict()
#     clean_str = remove_emojis(entire_str)
#     tokens, links, handles, tags = extract_components(clean_str)
#     clean_str = (" ".join(tokens)).strip()
#     print("clean_str = {}".format(clean_str))
#     cleaner_str = (re.subn("[:-]+", "", clean_str))[0].strip()
#     cleaner_str_split = cleaner_str.split(" ")
#     non_english = dict()
#     try:
#         for index, token in enumerate(cleaner_str_split):
#             if detect_language(token) != "en":
#                 non_english.update({index: token})
#         if non_english:
#             print("non_english: {}".format(non_english))
#             non_eng_text = " ".join(list(non_english.values()))
#             translated_non_eng = unescape(translate_keyword(non_eng_text))
#             cleaner_str = cleaner_str.replace(non_eng_text, translated_non_eng)
#     except Exception as e:
#         print("Exception occurred: {}".format(e))
#         cleaner_str = ""
#     print("cleaner_str = {}".format(cleaner_str))
#     separations['text'] = cleaner_str
#     separations['links'] = ",".join(links)
#     separations['handles'] = ",".join(handles)
#     separations['tags'] = ",".join(tags)
#     return separations

In [0]:
# def process_tweets(tweets_list):
#     tweets_collection = pd.DataFrame()
#     for tweets in tweets_list:
#         for tweet in tweets:
#             structured_tweets = structure_text(tweet.all_text)
#             structured_tweets.update({"created_time":tweet.created_at_datetime})
#             reqd_df = pd.DataFrame([structured_tweets])
#             # reqd_df.drop_duplicates(inplace=True)
#             # reqd_df["text"] = reqd_df["text"].apply(
#             #     lambda s: re.subn('(\s)', ' ', s)[0])
#             # reqd_df["text"] = reqd_df["text"].apply(structure_text)
#             # match_str = '({})'.format(query)
#             if tweets_collection.empty:
#                 tweets_collection = reqd_df
#             else:
#                 tweets_collection = tweets_collection.append(reqd_df)
#     # tweets_collection.drop_duplicates(inplace=True)
#     return tweets_collection

In [0]:
# rule = gen_rule_payload(query, results_per_call=100)
# r = gen_rule_payload(query_2, results_per_call=100)

In [0]:
# tweets_2 = collect_results(r,
#                          max_results=500,
#                          result_stream_args=premium_search_args)

In [0]:
# len(tweets_2)

In [0]:
# [print(tweet.all_text, end='\n\n') for tweet in tweets_2[0:10]]

In [0]:
# tweets_data = pd.DataFrame(tweets)

In [0]:
# type(tweets[3]["quoted_status"]) == type({})
# tweets_data_2

In [0]:
# nested_columns = []
# for tweet in tweets[:2]:
#     print("tweet {}".format(tweets.index(tweet)))
#     for col in tweet.keys():
        
#         if tweet[col] and (type(tweet[col]) == type({}) or type(tweet[col]) == type([])):
#             print("\tcolumn: {}\tvalue: {}".format(col, tweet[col]))

In [0]:
# # all_text = tweets_data["text"].drop_duplicates()
# all_text = pd.Series([tweet.all_text for tweet in tweets])
# all_text.drop_duplicates(inplace=True)

In [0]:
# # import codecs
# # codecs.getdecoder("unicode_escape")(all_text.loc[8])[0]
# all_text_2

In [0]:
# all_text_refined = all_text.apply(lambda s: re.subn('(\s)', ' ', s)[0])

In [0]:
# all_text_refined.loc[454]

In [0]:
# match_str = '(government collapse)'
# reqd_df = all_text_refined[all_text_refined.str.contains(match_str)]
# # reqd_df.drop_duplicates(subset=['headline_text'], inplace=True)

In [0]:
# reqd_df_2