In [1]:
# LIBRARIES:

import snscrape.modules.twitter as sntwitter
import pandas as pd
import re
import json

import psycopg2
from psycopg2 import sql

import spacy
nlp = spacy.load('es_core_news_sm')

import warnings
warnings.filterwarnings('ignore')

2022-03-02 15:43:33.623377: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-02 15:43:33.623423: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# DB PARAMETERS

schema = 'smi_schema'
queries_path = '../build/queries/'

with open('../../../../context/SMI/config/postgres.config') as config_file:
    db_config = json.load(config_file)

# Local database deployment
conn = psycopg2.connect(
                        dbname=db_config['db_name'],
                        user=db_config['db_user'],
                        host='localhost',
                        port=db_config['db_port'],
                        password=db_config['db_password'],
                        options=db_config['db_options']
                        )
conn.autocommit = True
cur = conn.cursor()

In [3]:
## FUNCTIONS:

def fetchall_SQL(self, path):
    """
    Function to fetch all observations from a query to databasee:
    params:
        - path: relative path to the file.
    """
    # Read the SQL query from .sql file:
    with open(path, 'r') as f: 
        query = f.read().format(schema=self.schema)

    # Initialize SQL cursor:
    cur = self.conn.cursor()

    try:

        #Execute query
        cur.execute(query)
        db_fetch = cur.fetchall()
        
        return(db_fetch)

    except (Exception, psycopg2.DatabaseError) as error:
        self.conn.rollback()
        self.api_logger.exception(error)
    cur.close()

def get_stops(stops, path):
    
    with open(path + stops[0]) as f:
        stopw_1 = f.read().splitlines()
    with open(path + stops[1]) as f:
        stopw_2 = f.read().splitlines()

    stopw_1[0] = stopw_1[0].replace('\ufeff', '')
    stopw = stopw_1 + stopw_2
    stopw = [accent_rem(word) for word in stopw]
    return(stopw)

def get_ecofilter(path, files):
    '''
    Function to get the list of words to filter the tweets
    '''
    eco_filter = pd.DataFrame()
    for file in files:
        eco_filter = pd.concat([eco_filter, pd.read_excel(path + file, header=None)], axis=0)
    eco_filter.drop_duplicates(inplace=True)
    eco_filter = eco_filter.iloc[:,0].to_list()
    #eco_filter = [lemmatize(word) for word in eco_filter]
    return [accent_rem(word) for word in eco_filter]

def accent_rem(name):
    '''
    Function to remove accents from an alphanumeric string:
    params:
        - name: character string.
    Output: string without accents.
    '''
    #Define replacements (possible accents or other special char)
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
        ("ñ", 'n'),
        ("à", "a"),
        ("è", "e"),
        ("ì", "i"),
        ("ò", "o"),
        ("ù", "u"),
        ("ä", 'a'),
        ("ë", "e"),
        ("ï", "i"),
        ("ö", "o"),
        ("ü", "u"),
    )
    #Replace with tuple:
    for a, b in replacements:
        name = name.replace(a, b).replace(a.upper(), b.upper())
    return(name)

def tweet_cleaner(tweet, stopw, ecol):
    '''
    Function to treat the text of a tweet.
    params:
        - tweet: the document itself.
    output: the tweet cleaned.
    '''
    # Remove urls (http in advance)
    tweet = re.sub(r'http.*',"", tweet)
    tweet = re.sub(r'pic.twitter\S+', '', tweet)
    # Remove mentions and hastags.
    tweet = re.sub(r'#\S+', '', tweet)
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove spanish vowel accents.
    tweet = ' '.join([accent_rem(word) for word in tweet.split()])
    # Remove special characters.
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet).split())
    # Lower captions.
    tweet = tweet.lower()
    # Remove numbers.
    tweet = ''.join([i for i in tweet if not i.isdigit()])
    # Remove white spaces.
    tweet = re.sub(' +', ' ', tweet)
    # Remove stopwords.
    tweet = ' '.join([word for word in tweet.split() if not word in stopw])
    # Filter words length (< 1 and > 15).
    tweet = ' '.join([word for word in tweet.split() if len(word) > 1 and len(word) <= 15])
    # Filter ecolist.
    commons = [word for word in ecol if word in tweet]
    if len(commons) < 1:
        tweet = ''
    #if len(tweet) > 1:
    #   #Lemmatize:
    #    tweet = ' '.join([tok.lemma_.lower() for tok in nlp(tweet)])
    return tweet

def treat_text(df, text_col, stopw = [], ecol =[], date_col = 'date', sent_col = 'sentiment'):
    '''
    Function to treat text columns:
    params:
        - df: dataframe to treat.
        - text_col: name of the text columns to treat.
        - ecolist
    Output: Dataframe treated.
    '''
    # Sanity checks :
    df = df.fillna('')
    
    # Formatting corpus columns:
    if date_col == 'date':
        print('Text mining job: Format date column.')
        df[date_col] = pd.to_datetime(df[date_col])
    if sent_col == 'sentiment':
        print('Text mining job: Format sentiment column.')
        df[sent_col] = df[sent_col].replace(',','', regex=True)
        df[sent_col] = df[sent_col].apply(lambda r: r.split('AGREEMENT')[0])
        df[sent_col] = df[sent_col].apply(lambda r: r.split('DI')[0])

    #Column text treatment:
    print('Text mining job: Treat text column.')
    df[text_col] = df[text_col].fillna(' ')
    df[text_col] = df[text_col].apply(lambda r: tweet_cleaner(r, stopw, ecol))
    df = df[df[text_col] != '']
    if text_col == 'text':
        df = df[['username', 'date', 'text']]
    df = df.reset_index(drop=True)
    return(df)

def get_tweets(user, date_ini, date_end, stopw, ecolist):
    '''
    Function to get tweets from a user given a period range.
    params:
        - user: twitter user name.
        - date_ini: first day of time window to retrieve tweets.
        - date_end: last date of time window to retrieve tweets.
    '''
    # Tweets list:
    twts_ls = []

    # Twitter scrapper:
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:' + user + ' since:' + date_ini + ' until:' + date_end).get_items()):
        twts_ls.append([tweet.user.username, tweet.date, tweet.content])
        
    # Tweets dataframe: 
    df = pd.DataFrame(twts_ls, columns=['username', 'date', 'text'])
    df = treat_text(df, 'text', stopw, ecolist, date_col = 'date', sent_col = None)
    return df

In [4]:
# FILTERS

path = '../../../../context/SMI/data/utils'
stops = ['/db_stopwords_spanish_1.txt', '/db_stopwords_spanish_2.txt']
stopw = get_stops(stops, path)
ecofile = '/ecofilter.xlsx'
ecofiles = [ecofile]
eco_filter = get_ecofilter(path, ecofiles)

In [None]:
## Select a user from users 

In [6]:
# Prueba para descargar tweets de 1 usuario

user = 'dresponsable'
date_ini = '2012-01-21'
date_end = '2012-01-22'

df_tweets = get_tweets(user, date_ini, date_end, stopw, eco_filter)

Text mining job: Format date column.
Text mining job: Treat text column.


In [8]:
df_tweets

Unnamed: 0,username,date,text
0,dresponsable,2012-01-21 23:43:12+00:00,mola gt gt emprendedores sociales mundo estare...
1,dresponsable,2012-01-21 20:41:26+00:00,os habeis llama publi
2,dresponsable,2012-01-21 20:13:28+00:00,facilon rosa corazon opacidad intereses oculto...
3,dresponsable,2012-01-21 19:28:31+00:00,os gusta bolso tomate regalado chica


In [12]:
tuple(df_tweets[['date', 'text']].iloc[1,:])

(Timestamp('2012-01-21 20:41:26+0000', tz='UTC'), 'os habeis llama publi')

In [40]:
# From a tweets backup file, get all the unique dates from each user, in order to avoid to scrap these dates again.

df_prueba_agg = df_prueba[['username', 'date']].drop_duplicates().groupby(['username'], as_index=False).agg({'date': ', '.join})

# Insert these dates for each users into DB (smi_date_tweets)

for i in range(df_prueba_agg.shape[0]):
    insert_new_users_into_db(tuple(df_prueba_agg.iloc[i, :]))

##
## insert into smi_schema.smi_date_tweets (username, date_tweets)
## values ('manu', '1995, 1996')
## on conflict (username) do update set date_tweets = smi_date_tweets.date_tweets || ', ' || excluded.date_tweets;

In [46]:
df_prueba_agg.head(5)

Unnamed: 0,username,date
0,005anibal,2015-10-22
1,007DoctorNo,2015-10-22
2,007Nela,2015-10-22
3,007martin,2015-10-22
4,00Be_Happy,2015-10-22


In [45]:
df_prueba_agg[df_prueba_agg['username'] == 'EqOVER']['date'].tolist()

['2015-10-22, 2015-10-23']

In [18]:
df_prueba_t = treat_text(df_prueba, stopw, eco_filter)

In [19]:
df_prueba_t

Unnamed: 0,username,date,text
0,COMBarcelona,2015-10-22 14:00:00,setmana parlarem economia social amb
1,COMBarcelona,2015-10-22 09:10:00,forum inversio healthcare social empreses
2,EqOVER,2015-10-23 00:56:00,buscar maneras mostrar vision mision empresa m...
3,Xaviicastro_,2015-10-22 22:34:00,manda orgullo reina desgracia
4,Xaviicastro_,2015-10-22 22:20:00,mercedes cambiate pareces abestruz
...,...,...,...
86470,OnasisZarate,2015-10-23 01:45:00,encuentro reunion dirigentes prd
86471,OnasisZarate,2015-10-23 00:28:00,quiero felicitar amiga nombramiento presidenta...
86472,OnasisZarate,2015-10-22 15:56:00,amiga felicidades tu cumpleanos gracias tu ami...
86473,OnasisZarate,2015-10-22 04:44:00,lideres izquierda


In [66]:
def dates_range(date_ini):
    '''
    Function to get all dates between a initial date and today.
    params:
        - date_ini: initial date of the range.
    output: string with all dates within the study interval (initial date and today).
    '''
    date_end = pd.to_datetime("today")
    return [str(date.date()) for date in pd.date_range(date_ini, date_end, freq='D')]

In [103]:
#Generate dates range from init date to now:
dates = dates_range('2012-01-01')

#From a given user, get all the tweets posting dates on the DB:
user_to_add_dates = 'EqOVER'
dates_with_tweets = df_prueba_agg[df_prueba_agg['username'] == user_to_add_dates]['date'].values[0].split(', ')

#Dates to scrap from user:
dates_to_scrap = list(set(dates) - set(dates_with_tweets))

#Scrap tweets;

#Persist scrapped dates on DB:

In [106]:
dates_to_scrap

['2013-01-16',
 '2012-08-17',
 '2015-02-01',
 '2014-07-04',
 '2021-02-16',
 '2018-02-19',
 '2016-12-27',
 '2019-11-30',
 '2016-09-09',
 '2018-09-06',
 '2015-05-03',
 '2016-02-04',
 '2017-08-29',
 '2021-07-14',
 '2014-03-25',
 '2016-06-28',
 '2016-04-23',
 '2017-11-25',
 '2016-02-21',
 '2021-01-01',
 '2016-08-24',
 '2013-02-07',
 '2016-09-13',
 '2018-06-22',
 '2013-02-20',
 '2018-01-20',
 '2014-08-02',
 '2018-04-29',
 '2019-06-22',
 '2018-04-12',
 '2014-07-20',
 '2017-03-26',
 '2019-01-03',
 '2016-06-14',
 '2012-08-01',
 '2014-08-29',
 '2017-11-27',
 '2019-10-31',
 '2019-09-07',
 '2017-05-13',
 '2019-09-12',
 '2020-10-09',
 '2018-04-25',
 '2016-10-21',
 '2016-01-28',
 '2016-08-04',
 '2014-02-25',
 '2014-11-09',
 '2013-10-18',
 '2013-08-16',
 '2015-10-08',
 '2015-12-24',
 '2014-11-21',
 '2012-06-12',
 '2014-02-14',
 '2013-02-01',
 '2020-03-18',
 '2022-02-22',
 '2012-03-26',
 '2014-03-13',
 '2016-05-17',
 '2018-10-05',
 '2021-02-13',
 '2016-01-19',
 '2019-07-28',
 '2020-06-08',
 '2019-09-

In [104]:
date_tweets_row = tuple(,)

NameError: name 'date_tweets_row' is not defined

In [98]:
with open(queries_path + 'SMI_insert_date_tweets.sql') as f:
    cur.execute(
        sql.SQL(f.read()).format(schema=sql.Identifier(schema)),
        date_tweets_row
    )

['2015-10-22', '2015-10-23']