Importing packages

In [None]:
import tweepy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
from sentiment_analysis_spanish import sentiment_analysis
from requests.sessions import dispatch_hook
from collections import Counter

import time

import json

import plotly.express as px
from datetime import datetime

import yaml
import requests
import os

from datetime import datetime, timedelta

In [None]:
# for analysis of text in Spanish
sentiment_spanish = sentiment_analysis.SentimentAnalysisSpanish()

# Downloading data from Twitter

In [None]:
# insert your connection information below
# https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api
#
# Each credential is read from an environment variable when that variable is
# set, so real keys do not have to be saved inside the notebook. The
# placeholder strings are only a fallback: either export the variables before
# starting Jupyter or replace the placeholders locally (and do not commit them).

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'YYY')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'ZZZ')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'WWW')

Authentication

In [None]:
# OAuth handler built from the application (consumer) key pair defined above
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Attach the access token pair to the same handler
auth.set_access_token(access_token, access_token_secret)
# API client used by the search cells below. Per the tweepy documentation,
# wait_on_rate_limit=True makes the client wait for the rate limit to
# replenish instead of failing.
api = tweepy.API(auth, wait_on_rate_limit=True)

Define your query

In [None]:
query = 'Quart de Poblet'

To get the data for the last 7 days (available with free Twitter API)

In [None]:
# Upper bound on the number of tweets the cursor will yield
count = 1000
# Cursor over api.search_tweets for `query`, requesting tweet_mode="extended";
# nothing is consumed here, the items are pulled when `tweets` is iterated.
search_cursor = tweepy.Cursor(api.search_tweets, q=query, tweet_mode="extended")
tweets = search_cursor.items(count)

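To get the data for the last 30 days instead (this uses the premium 30-day search endpoint of the Twitter API, which is not part of the free standard search), run the cell below in place of the previous one. `label` must contain the label of the dev environment that you created for your application in the Twitter developer portal.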

In [None]:
# Upper bound on the number of tweets the cursor will yield
count = 1000
# `api.search_30_day` needs both the dev-environment label and the search
# query; without `query` the call fails and the notebook's query is ignored.
tweets = tweepy.Cursor(
    api.search_30_day,
    label='dev30days',
    query=query,
).items(count)

In [None]:
# Consume the cursor and keep the raw JSON dictionary of every tweet
tweets_list = [tweet._json for tweet in tweets]
print('Dowloaded {} tweets'.format(len(tweets_list)))
tweet_df = pd.DataFrame(tweets_list)
# An empty download produces a frame without columns, so only convert the
# timestamps when the column exists. The explicit format is the same one the
# "Adding day and time" section uses for these strings.
if 'created_at' in tweet_df.columns:
    tweet_df['created_at'] = pd.to_datetime(
        tweet_df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y'
    )
# The raw tweets are stored as JSON in a file named after the query
file_name = '{}.txt'.format(query)
with open(file_name, 'w') as out_f:
    json.dump(tweets_list, out_f)
print('The tweets were saved into {}'.format(file_name))

# Reading tweets stored in files

In [None]:
# this file contains tweets collected for the query "Quart de Poblet"
file_name = 'Quart de Poblet.txt'

In [None]:
with open(file_name) as in_file:
    tweets_json = json.load(in_file)

In [None]:
tweets_df = pd.DataFrame(tweets_json)

Get full text of tweets

In [None]:
#tweets_df

In [None]:
# Build a 'full_text' column for data that only has the (possibly truncated)
# 'text' field: whenever a row carries an 'extended_tweet' dictionary, its
# 'full_text' replaces the short text.
if 'text' in tweets_df.columns:
    # Work on a copy: `.values` can expose the column's own buffer, and writing
    # into it would silently overwrite the original 'text' column as well.
    text_arr = tweets_df['text'].values.copy()
    # The column may be missing altogether; then there is nothing to substitute.
    if 'extended_tweet' in tweets_df.columns:
        ext_tweet_arr = tweets_df['extended_tweet'].values
    else:
        ext_tweet_arr = [None] * len(text_arr)
    num_changed = 0
    for i, ext_tweet in enumerate(ext_tweet_arr):
        # Rows without an extended payload hold a non-dict placeholder (NaN/None)
        if isinstance(ext_tweet, dict) and 'full_text' in ext_tweet:
            text_arr[i] = ext_tweet['full_text']
            num_changed += 1
    print('changed {} of {}'.format(num_changed, len(text_arr)))
    tweets_df['full_text'] = text_arr

In [None]:
len(tweets_df)

## Adding sentiment analysis results

In [None]:
# Score each tweet text with the Spanish sentiment model, keeping row order
sentiment_results = []
for tweet_text in tweets_df['full_text'].values:
    sentiment_results.append(sentiment_spanish.sentiment(tweet_text))
len(sentiment_results)

In [None]:
tweets_df['Sentiment'] = sentiment_results
tweets_df.head(2)

Here we can set thresholds for labeling tweets as positive, negative and neutral

In [None]:
# Thresholds on the sentiment score; change them here to tune the labelling.
POS_THRESHOLD = 0.99   # scores strictly above this are labelled 'POS'
NEU_THRESHOLD = 0.001  # scores strictly above this (up to POS_THRESHOLD) are 'NEU'


def classify_sentiment(score, pos_threshold=POS_THRESHOLD, neu_threshold=NEU_THRESHOLD):
    """Map a sentiment score to 'POS', 'NEU' or 'NEG'.

    Parameters
    ----------
    score : float
        Sentiment score of one tweet.
    pos_threshold : float
        Scores strictly greater than this value are positive.
    neu_threshold : float
        Scores strictly greater than this value (and not positive) are neutral.

    Returns
    -------
    str
        'POS', 'NEU' or 'NEG'. Anything that passes neither comparison
        (including NaN) is labelled 'NEG'.
    """
    if score > pos_threshold:
        return 'POS'
    if score > neu_threshold:
        return 'NEU'
    return 'NEG'


# One label per tweet, in row order
sent_class_arr = [classify_sentiment(res) for res in tweets_df['Sentiment'].values]

In [None]:
tweets_df['Sentiment_class'] = sent_class_arr

In [None]:
tweets_df.groupby('Sentiment_class')['created_at'].count()

## Get user location

In [None]:
# Pull the 'location' entry out of every tweet's user dictionary, in row order
user_loc_arr = []
for user in tweets_df['user'].values:
    user_loc_arr.append(user['location'])
len(user_loc_arr)

In [None]:
tweets_df['user_loc'] = user_loc_arr
tweets_df.head(2)

Analyzing user location

In [None]:
unique_locs = tweets_df['user_loc'].unique()
print(len(unique_locs))

In [None]:
print('Users with location')
# Only a non-blank string is a usable location: None, NaN and empty or
# whitespace-only strings are all treated as "no location".
print(sum(isinstance(loc, str) and loc.strip() != '' for loc in user_loc_arr))

In [None]:
user_loc_arr[:10]

As we can see, the user location is not standardised. It can be standardised, however, with other tools such as Google Maps. Below, we load a file in which these locations have already been mapped to Google Maps queries. From these queries, we can extract geographical coordinates.

## Adding geo information

In [None]:
df_loc_name = pd.read_csv('location_name.txt', sep='\t', header=None)
df_loc_name.columns = ['name']
df_loc_name.head()

In [None]:
# One url per line, expected in the same order as the names in df_loc_name
df_loc_url = pd.read_csv('location_url.txt', sep='\t', header=None)
# The two files are joined purely by row position, so a length mismatch would
# silently pair names with missing urls; fail early with a clear message.
if len(df_loc_url) != len(df_loc_name):
    raise ValueError(
        'location_url.txt has {} rows but location_name.txt has {}'.format(
            len(df_loc_url), len(df_loc_name)
        )
    )
df_loc_name['url'] = df_loc_url[0]
df_loc_name.head()

Parsing the urls of the locations

In [None]:
# Parallel accumulators: one entry per location url
name_arr, lat_arr, lon_arr = [], [], []

# NOTE(review): the indexing below assumes urls shaped like
# https://<host>/maps/place/<name>/@<lat>,<lon>,<zoom>/... - confirm against the file.
for url in df_loc_name['url'].values:
    segments = url.split('/')
    # path segment 5 is kept verbatim as the display name
    name_arr.append(segments[5])
    # segment 6 without its first character and its last three, split on commas
    coords = segments[6][1:-3].split(',')
    lat_arr.append(float(coords[0]))
    lon_arr.append(float(coords[1]))

In [None]:
df_loc_name['disp_name'] = name_arr
df_loc_name['lat'] = lat_arr
df_loc_name['lon'] = lon_arr
df_loc_name.head()

Adding location information to tweets_df

In [None]:
# Build a name -> (display name, lat, lon) lookup once instead of filtering
# df_loc_name for every single tweet. setdefault keeps the first row of a
# repeated name, which is the row the previous `.values[0]` lookup returned.
loc_lookup = {}
for name, disp, lat, lon in zip(df_loc_name['name'], df_loc_name['disp_name'],
                                df_loc_name['lat'], df_loc_name['lon']):
    if pd.isna(name):
        # a missing name can never equal a user location
        continue
    loc_lookup.setdefault(name, (disp, lat, lon))

# Tweets whose user location is unknown (or None) get empty geo fields
no_match = (None, None, None)
matched = [loc_lookup.get(user_loc, no_match) for user_loc in tweets_df['user_loc'].values]

disp_name = [entry[0] for entry in matched]
lat_arr = [entry[1] for entry in matched]
lon_arr = [entry[2] for entry in matched]

tweets_df['user_loc_disp_name'] = disp_name
tweets_df['user_loc_lat'] = lat_arr
tweets_df['user_loc_lon'] = lon_arr

tweets_df.head(2)

## Adding day and time

In [None]:
# Layout of the 'created_at' strings; the '+0000' offset is matched literally
created_at_format = '%a %b %d %H:%M:%S +0000 %Y'

# Parallel accumulators: one entry per tweet
day_arr, day_hour_arr, time_arr, hour_arr = [], [], [], []
for val in tweets_df['created_at'].values:
    datetime_object = datetime.strptime(val, created_at_format)
    clock_time = datetime_object.time()
    day_arr.append(datetime_object.date())
    time_arr.append(clock_time)
    hour_arr.append(clock_time.hour)
    # timestamp truncated to the start of its hour
    day_hour_arr.append(datetime_object.replace(minute=0, second=0))

In [None]:
datetime_object

In [None]:
tweets_df['day'] = day_arr
tweets_df['time'] = time_arr
tweets_df['hour'] = hour_arr
tweets_df['day_hour'] = day_hour_arr

tweets_df.head(2)

# Plotting

## Statistics

In [None]:
tot = len(tweets_df)
print('Number of tweets: {}'.format(tot))
print('Starting date   : {}'.format(tweets_df['day'].min()))
print('End date        : {}'.format(tweets_df['day'].max()))
# Count tweets per class; reindex so that a class with no tweets shows up as 0
# instead of raising KeyError in the report below.
sent_class = (
    tweets_df.groupby(['Sentiment_class'])['day'].count()
    .reindex(['NEG', 'NEU', 'POS'], fill_value=0)
)
report_lines = [
    ('Negative        : {} ({}%)', 'NEG'),
    ('Neutral         : {} ({}%)', 'NEU'),
    ('Positive        : {} ({}%)', 'POS'),
]
for template, cls in report_lines:
    # an empty frame has no meaningful share; report 0.0 instead of dividing by zero
    share = np.round(sent_class[cls] / tot * 100, 2) if tot else 0.0
    print(template.format(sent_class[cls], share))

## Choosing a time period

In [None]:
# Analysis window; both boundary days are included
start_date = datetime.strptime('2021-12-20', '%Y-%m-%d').date()
end_date = datetime.strptime('2022-01-19', '%Y-%m-%d').date()

# Boolean mask of the tweets whose day falls inside [start_date, end_date]
in_period = tweets_df['day'].between(start_date, end_date)
tweets_df_tmp = tweets_df[in_period]

chosen_share = np.round(len(tweets_df_tmp) / len(tweets_df) * 100, 2)
print('Chosen tweets: {} ({}%)'.format(len(tweets_df_tmp), chosen_share))

In [None]:
tmp = tweets_df_tmp.groupby('day')['created_at'].count().reset_index()
tmp.columns = ['day', '# of tweets']
tmp.head()

### Number of tweets per day

In [None]:
px.line(data_frame=tmp, x='day', y=['# of tweets'], title='Number of tweets per day')

In [None]:
# Tweets per day and sentiment class, one column per class.
# Reindexing the columns guarantees that 'NEG', 'NEU' and 'POS' are all
# present even when a class does not occur in the chosen period; the previous
# hard-coded four-name column assignment failed in that case.
sentiment_order = ['NEG', 'NEU', 'POS']
tmp = (
    tweets_df_tmp
    .groupby(['day', 'Sentiment_class'])['created_at']
    .count()
    .unstack('Sentiment_class')
    .reindex(columns=sentiment_order)
    # day/class combinations without tweets become zeros instead of NaN
    .fillna(0)
    # drop the 'Sentiment_class' label from the column axis so the frame has
    # plain columns: day, NEG, NEU, POS
    .rename_axis(columns=None)
    .reset_index()
)

In [None]:
px.line(data_frame=tmp, x='day', title='Number of tweets per day', y=[  
    'NEU',
    'NEG',
    #'POS',
])

In [None]:
px.bar(data_frame=tmp, x='day', title='Number of tweets per day', y=[  
    'NEU',
    'NEG',
    'POS',
])

### Number of tweets per hour (during the day)

In [None]:
tmp = tweets_df_tmp.groupby('hour')['created_at'].count().reset_index()
tmp.columns = ['hour', '# tweets']
px.line(data_frame=tmp, x='hour', title='Total number of tweets per hour', y=['# tweets'])

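Hours in which no tweet was posted are missing from the grouped data. Below we insert them with a count of 0, so that the line plot shows the gaps as zeros.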

In [None]:
tmp = tweets_df_tmp.groupby('day_hour')['created_at'].count().reset_index()
tmp.columns = ['day_hour', '# tweets']
#px.line(data_frame=tmp, x='day_hour', title='Total number of tweets per hour', y=['# tweets'])

In [None]:
# Hourly tweet counts indexed by the hour they belong to
hourly_counts = tmp.set_index('day_hour')['# tweets']

if hourly_counts.empty:
    # nothing to fill; keep the two expected columns so the plot cell still runs
    tmp_new = pd.DataFrame({'date': [], '# tweets': []})
else:
    # every hour between the first and the last observed hour, inclusive
    all_hours = pd.date_range(
        hourly_counts.index.min(), hourly_counts.index.max(), freq=timedelta(hours=1)
    )
    # hours without tweets are inserted with a count of 0
    tmp_new = (
        hourly_counts
        .reindex(all_hours, fill_value=0)
        .rename_axis('date')
        .reset_index()
    )

In [None]:
#tmp_new.tail(15)

In [None]:
px.line(data_frame=tmp_new, x='date', title='Total number of tweets per hour', y=['# tweets'])

### Geographical visualization

In [None]:
# Columns that identify one mapped user location
geo_keys = ['user_loc_disp_name', 'user_loc_lat', 'user_loc_lon']

# Number of tweets per mapped location within the chosen period
tweets_to_geo_df = (
    tweets_df_tmp
    .groupby(by=geo_keys)['created_at']
    .count()
    .reset_index(name='# tweets')
)

num_with_geo = tweets_to_geo_df['# tweets'].sum()
geo_share = np.round(num_with_geo / len(tweets_df_tmp) * 100, 2)
print('Number of tweets with geo information {} ({}%)'.format(num_with_geo, geo_share))

In [None]:
print('Top geo locations:')
tweets_to_geo_df.sort_values(by='# tweets', ascending=False)[:9]

In [None]:
fig = px.scatter_geo(tweets_to_geo_df, lat="user_loc_lat", lon='user_loc_lon',
                     color="# tweets",
                     hover_name="user_loc_disp_name", 
                     #size="created_at", 
                     projection="natural earth"
                    )
fig.show()

#### Per sentiment class

In [None]:
# Sentiment classes to show on the map; uncomment entries to include them
sentiment_classes = [
    'NEG',
    #'NEU',
    #'POS',
]

# Columns that identify one mapped user location
geo_keys = ['user_loc_disp_name', 'user_loc_lat', 'user_loc_lon']

# Keep only the tweets of the selected classes, then count them per location
in_selected_classes = tweets_df_tmp['Sentiment_class'].isin(sentiment_classes)
tweets_to_geo_df = (
    tweets_df_tmp[in_selected_classes]
    .groupby(by=geo_keys)['created_at']
    .count()
    .reset_index(name='# tweets')
)

num_with_geo = tweets_to_geo_df['# tweets'].sum()
print('Number of tweets with geo information {}'.format(num_with_geo))

In [None]:
print('Top geo locations:')
tweets_to_geo_df.sort_values(by='# tweets', ascending=False)[:9]

In [None]:
fig = px.scatter_geo(tweets_to_geo_df, lat="user_loc_lat", lon='user_loc_lon',
                     color="# tweets",
                     hover_name="user_loc_disp_name", 
                     #size="created_at", 
                     projection="natural earth",
                     title='Number of tweets with sentiment in {}'.format(sentiment_classes)
                    )
fig.show()

# Image classification

In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions

model = ResNet50(weights='imagenet')

The images associated with tweets are stored in the `extended_entities` column

In [None]:
print('Ids of tweets that contain media:')
# Row labels of the tweets whose 'extended_entities' entry is not missing
has_media = tweets_df['extended_entities'].notna()
possible_ids = has_media[has_media].index.tolist()
possible_ids

Let us analyse the image attached to one of these tweets: the third entry of `possible_ids` (this was the tweet with id=94 in the original run).

In [None]:
tweet_id = possible_ids[2]

In [None]:
# First media item attached to the selected tweet
media = tweets_df.loc[tweet_id, 'extended_entities']['media'][0]
# Prefer the https variant of the url when the entry provides one
image_url = media.get('media_url_https') or media['media_url']

# Download the image; fail loudly on HTTP errors instead of saving an error
# page as a .jpg, and do not hang forever on a stalled connection.
image_source = requests.get(image_url, timeout=30)
image_source.raise_for_status()
image_file = "sample_image.jpg"
# the context manager closes the file even if the write fails
with open(image_file, "wb") as image_out:
    image_out.write(image_source.content)

# Load the image at 224x224 and turn it into a batch of one sample
img = image.load_img(image_file, target_size=(224, 224))

x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

preds = model.predict(x)
# decode the results into a list of tuples (class, description, probability)
# (one such list for each sample in the batch)
print('Predicted:')
for el in decode_predictions(preds, top=3)[0]:
    print('\t {} - {}'.format(el[1], el[2]))
img