In [1]:
# LIBRARIES:

import snscrape.modules.twitter as sntwitter
import pandas as pd
import re
import json
import random
import time
from io import StringIO

import psycopg2
from psycopg2 import sql

import spacy
nlp = spacy.load('es_core_news_sm')

import warnings
warnings.filterwarnings('ignore')

2022-03-03 09:56:45.060455: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-03 09:56:45.060529: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# DB PARAMETERS

schema = 'smi_schema'
queries_path = '../build/queries/'

with open('../../../../context/SMI/config/postgres.config') as config_file:
    db_config = json.load(config_file)

# Local database deployment
conn = psycopg2.connect(
                        dbname=db_config['db_name'],
                        user=db_config['db_user'],
                        host='localhost',
                        port=db_config['db_port'],
                        password=db_config['db_password'],
                        options=db_config['db_options']
                        )
conn.autocommit = True
cur = conn.cursor()

In [225]:
## FUNCTIONS:

def fetchall_SQL(path, schema):
    """
    Function to fetch all observations from a query to databasee:
    params:
        - path: relative path to the file.
    """

    with open(path, 'r') as f: 
        query = f.read().format(schema=schema)

    try:
        #Execute query
        cur = conn.cursor()
        cur.execute(query)
        db_fetch = cur.fetchall()
        cur.close()
        return(db_fetch)

    except (Exception, psycopg2.DatabaseError) as error:
        conn.rollback()

def insert_datetweets_into_db(path, user_row, schema, conn):
    '''
    Function to update tweet dates on DB.
    params:
        - user_row: row to insert into the db.
    '''

    try:
        cur = conn.cursor()
        with open(path) as f:
            cur.execute(
                sql.SQL(f.read()).format(schema=sql.Identifier(schema)),
                user_row
            )

            conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        conn.rollback()

def df_to_postgres(df, table, conn):
    """
    Function to save dataframe into postgres with copy_from:
    params:
        - conn: database connection.
        - df: pandas dataframe.
        - table: database table.
    """
    #Buffering the dataframe into memory:
    buffer = StringIO()
    df.to_csv(buffer, header=False, index=False)
    buffer.seek(0)
    cur = conn.cursor()

    try:
        #Copy cached dataframe into postgres:
        cur.copy_from(buffer, table, sep=",")
        conn.commit()

    except (Exception, psycopg2.DatabaseError) as error:
        conn.rollback()
        cur.close()

    cur.close()

def query_SQL(path, schema, conn):
    """
    Function to make a query to database:
    params:
        - path: relative path to the file.
    """
    # Read the SQL query from .sql file:
    with open(path, 'r') as f:
        query = f.read().format(schema=schema)
    cur = conn.cursor()

    try:
        cur.execute(query)

    except (Exception, psycopg2.DatabaseError) as error:
        conn.rollback()

    cur.close()

def get_stops(stops, path):
    
    with open(path + stops[0]) as f:
        stopw_1 = f.read().splitlines()
    with open(path + stops[1]) as f:
        stopw_2 = f.read().splitlines()

    stopw_1[0] = stopw_1[0].replace('\ufeff', '')
    stopw = stopw_1 + stopw_2
    stopw = [accent_rem(word) for word in stopw]
    return(stopw)

def get_ecofilter(path, files):
    '''
    Function to get the list of words to filter the tweets
    '''
    eco_filter = pd.DataFrame()
    for file in files:
        eco_filter = pd.concat([eco_filter, pd.read_excel(path + file, header=None)], axis=0)
    eco_filter.drop_duplicates(inplace=True)
    eco_filter = eco_filter.iloc[:,0].to_list()
    #eco_filter = [lemmatize(word) for word in eco_filter]
    return [accent_rem(word) for word in eco_filter]

def accent_rem(name):
    '''
    Function to remove accents from an alphanumeric string:
    params:
        - name: character string.
    Output: string without accents.
    '''
    #Define replacements (possible accents or other special char)
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
        ("ñ", 'n'),
        ("à", "a"),
        ("è", "e"),
        ("ì", "i"),
        ("ò", "o"),
        ("ù", "u"),
        ("ä", 'a'),
        ("ë", "e"),
        ("ï", "i"),
        ("ö", "o"),
        ("ü", "u"),
    )
    #Replace with tuple:
    for a, b in replacements:
        name = name.replace(a, b).replace(a.upper(), b.upper())
    return(name)

def tweet_cleaner(tweet, stopw, ecol):
    '''
    Function to treat the text of a tweet.
    params:
        - tweet: the document itself.
    output: the tweet cleaned.
    '''
    # Remove urls (http in advance)
    tweet = re.sub(r'http.*',"", tweet)
    tweet = re.sub(r'pic.twitter\S+', '', tweet)
    # Remove mentions and hastags.
    tweet = re.sub(r'#\S+', '', tweet)
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove spanish vowel accents.
    tweet = ' '.join([accent_rem(word) for word in tweet.split()])
    # Remove special characters.
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet).split())
    # Lower captions.
    tweet = tweet.lower()
    # Remove numbers.
    tweet = ''.join([i for i in tweet if not i.isdigit()])
    # Remove white spaces.
    tweet = re.sub(' +', ' ', tweet)
    # Remove stopwords.
    tweet = ' '.join([word for word in tweet.split() if not word in stopw])
    # Filter words length (< 1 and > 15).
    tweet = ' '.join([word for word in tweet.split() if len(word) > 1 and len(word) <= 15])
    # Filter ecolist.
    commons = [word for word in ecol if word in tweet]
    if len(commons) < 1:
        tweet = ''
    #if len(tweet) > 1:
    #   #Lemmatize:
    #    tweet = ' '.join([tok.lemma_.lower() for tok in nlp(tweet)])
    return tweet

def treat_text(df, text_col, stopw = [], ecol =[], date_col = 'date', sent_col = 'sentiment'):
    '''
    Function to treat text columns:
    params:
        - df: dataframe to treat.
        - text_col: name of the text columns to treat.
        - ecolist
    Output: Dataframe treated.
    '''
    # Sanity checks :
    df = df.fillna('')
    
    # Formatting corpus columns:
    if date_col == 'date':
        print('Text mining job: Formatting the date column.')
        df[date_col] = pd.to_datetime(df[date_col])
    if sent_col == 'sentiment':
        print('Text mining job: Formatting the sentiment column.')
        df[sent_col] = df[sent_col].replace(',','', regex=True)
        df[sent_col] = df[sent_col].apply(lambda r: r.split('AGREEMENT')[0])
        df[sent_col] = df[sent_col].apply(lambda r: r.split('DI')[0])

    #Column text treatment:
    print('Text mining job: Treat text column.')
    df[text_col] = df[text_col].fillna(' ')
    df[text_col] = df[text_col].apply(lambda r: tweet_cleaner(r, stopw, ecol))
    df = df[df[text_col] != '']
    if text_col == 'text':
        df = df[['username', 'date', 'text']]
    df = df.reset_index(drop=True)
    return(df)

def get_tweets(user, date_ini, date_end, stopw, ecolist):
    '''
    Function to get tweets from a user given a period range.
    params:
        - user: twitter user name.
        - date_ini: first day of time window to retrieve tweets.
        - date_end: last date of time window to retrieve tweets.
    '''
    # Tweets list:
    twts_ls = []

    # Twitter scrapper:
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper('from:' + user + ' since:' + date_ini + ' until:' + date_end).get_items()):
        twts_ls.append([tweet.user.username, tweet.date, tweet.content])
        
    # Tweets dataframe: 
    df = pd.DataFrame(twts_ls, columns=['username', 'date', 'text'])
    df = treat_text(df, 'text', stopw, ecolist, date_col = 'date', sent_col = None)
    return df

In [30]:
# FILTERS

path = '../../../../context/SMI/data/utils'
stops = ['/db_stopwords_spanish_1.txt', '/db_stopwords_spanish_2.txt']
stopw = get_stops(stops, path)
ecofile = '/ecofilter.xlsx'
ecofiles = [ecofile]
eco_filter = get_ecofilter(path, ecofiles)

In [228]:
##get_tweets script body:

#while True:

## Select user randomly from date tweets table (SQL query) "user":
print('Database job: Select user randomly from DB')
user = fetchall_SQL(queries_path + 'SMI_get_random_user.sql', schema)[0]

## Remove comma on the head of the vectorized version of the column "smi_str_datetweets" of the user and call it "user_date_tweets"
## Create object with username called user_screename:
user = list(user)
if len(user[1]) > 0:
    if user[1].split()[0] == ',':
        user[1] = ', '.join(user[1].split(', ')[1:])

user_screename = user[0]
user_date_tweets = user[1]

## Create vector of dates [2012-01-01, today] called "all_dates":
all_dates = [str(date.date()) for date in pd.date_range('2012-01-01', pd.to_datetime("today"), freq='D')]

## Create vector of disjoint dates between "all_dates" and "user_date_tweets" and call it "dates_to_scrap":
dates_to_scrap = list(set(all_dates) - set(user_date_tweets))

## Select randomly one of "dates_to_scrap":
ini_date = random.choice(dates_to_scrap)

## Create end_date as initial_date + 1 day:
end_date = (pd.to_datetime(ini_date) + pd.DateOffset(days=1)).date().strftime('%Y-%m-%d')

## Scrap the tweets of the user on that dates range:
print('Scrapping job: Retrieving tweets from user')
df_tweets = get_tweets(user_screename, ini_date, end_date, stopw, eco_filter)
print('Scrapping job: Number of scrapped tweets: ' + str(df_tweets.shape[0]))

## Insert the tweets in the tweets table (df_to_postgres) if there are tweets, otherwise do nothing
print('Database job: Inserting scrapped tweets on DB')
df_to_postgres(df_tweets, 'smi_tweets', conn)
print('Database job: Scrapped tweets inserted on DB')

## Execute SQL query to remove duplicated entries on the tweets table:
print('Database job: Removing duplicated entries')
query_SQL(queries_path + 'SMI_remove_dup_tweets.sql', schema, conn)
print('Database job: Duplicated entries removed')

## Update new retrieval date on smi_date_tweets table on DB
print('Database job: Inserting new scrapped date tweets into DB')
user_to_insert = user.copy()
user_to_insert[1] = ', '.join(user_to_insert[1].split(', ') + [ini_date])
insert_datetweets_into_db(queries_path + 'SMI_insert_date_tweets.sql', tuple(user_to_insert), schema, conn)
print('Database job: Scrapped date tweets inserted into DB')

## SLEEP n seconds, choose n randomly in the interval [60, 120]
time.sleep(random.randint(60, 120))

Database job: Select user randomly from DB
Scrapping job: Retrieving tweets from user
Text mining job: Formatting the date column.
Text mining job: Treat text column.
Scrapping job: Number of scrapped tweets: 0
Database job: Inserting scrapped tweets on DB
Database job: Scrapped tweets inserted on DB
Database job: Removing duplicated entries
Database job: Duplicated entries removed
Database job: Inserting new scrapped date tweets into DB
Database job: Scrapped date tweets inserted into DB


In [202]:
df_tweets

Unnamed: 0,username,date,text
0,MeteOrihuela,2016-02-29 14:27:10+00:00,interior sureste peninsular inmediaciones imag...
1,MeteOrihuela,2016-02-29 10:04:55+00:00,estabilidad atmosferica amanecer imagen andres...
