In [1]:
# importing libraries and packages
import snscrape.modules.twitter as sntwitter
import pandas as pd
import re

import spacy
nlp = spacy.load('es_core_news_sm')

import warnings
warnings.filterwarnings('ignore')

2022-02-24 21:04:34.255845: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-24 21:04:34.255907: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [52]:
def accent_rem(name):
    '''
    Function to strip accents from a string: every vowel carrying an acute,
    grave or diaeresis mark, plus "ñ", is mapped to its plain letter, in both
    lowercase and uppercase.
    params:
        - name: character string.
    Output: the string with the mapped characters replaced; every other
            character is left untouched.
    '''
    # Accented lowercase characters and, position by position, their plain form.
    accented = "áéíóúñàèìòùäëïöü"
    plain = "aeiounaeiouaeiou"
    # A single translation table covers the lowercase and uppercase variants.
    table = str.maketrans(accented + accented.upper(), plain + plain.upper())
    return name.translate(table)

def get_stops(stops, path):
    '''
    Function to load the stop words used to clean the tweets:
    params:
        - stops: list of stop-word file names; each one is appended to `path`
                 as-is. Any number of files is accepted.
        - path: directory prefix where the files live.
    Output: list with the lines of every file, in file order, with accents
            removed.
    '''
    stopw = []
    for stop_file in stops:
        # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark, so no
        # file contributes a '\ufeff'-prefixed first word and the result does
        # not depend on the platform's default encoding.
        with open(path + stop_file, encoding='utf-8-sig') as f:
            stopw.extend(f.read().splitlines())
    return [accent_rem(word) for word in stopw]

def get_ecofilter(path, files):
    '''
    Function to get the list of words used to filter the tweets:
    params:
        - path: directory prefix where the Excel files live.
        - files: list of Excel file names; each one is appended to `path`
                 as-is. The files are read without a header row and only
                 their first column is used.
    Output: list of the first-column values as accent-free strings, without
            repetitions and in order of first appearance.
    '''
    # Read every file first and concatenate once; growing a DataFrame with
    # pd.concat inside the loop re-copies the accumulated rows on each pass.
    frames = [pd.read_excel(path + file, header=None) for file in files]
    if not frames:
        # pd.concat raises on an empty list; no files means no filter words.
        return []
    words = pd.concat(frames, axis=0).iloc[:, 0].dropna()
    # Missing cells are dropped above and the remaining values are cast to str
    # because accent_rem only works on strings. Deduplicating after the accent
    # removal collapses variants that differ only in accents; dict.fromkeys
    # keeps the first-seen order.
    return list(dict.fromkeys(accent_rem(str(word)) for word in words))

def remove_stops(tweet, stopw):
    '''
    Function to drop the stop words from a tweet:
    params:
        - tweet: the document itself.
        - stopw: collection of stop words.
    Output: the whitespace-separated words of the tweet that are not in
            stopw, joined by single spaces.
    '''
    kept = []
    for token in tweet.split():
        if token in stopw:
            continue
        kept.append(token)
    return ' '.join(kept)

def filter_noneco(tweet, ecolist, min_matches = 2):
    '''
    Function to check whether the tweet has economic topic or not:
    params:
        - tweet: the document itself.
        - ecolist: list of words related to economy and politics.
        - min_matches: minimum number of tweet words that must be in the
                       ecolist for the tweet to be kept. A repeated word
                       counts once per occurrence. The default of 2 is the
                       threshold this function has always applied; pass 1 to
                       keep any tweet with a single match.
    output: the tweet unchanged if it reaches min_matches, otherwise ''.
    '''
    matches = sum(1 for word in tweet.split() if word in ecolist)
    return tweet if matches >= min_matches else ''

def get_ecotweets(df, ecolist, text_col = 'text'):
    '''
    Function to keep only the tweets with economic topic:
    params:
        - df: dataframe with the tweets.
        - ecolist: list of words related to economy and politics.
        - text_col: name of the column holding the tweet text.
    Output: new dataframe with the rows whose text passes filter_noneco,
            keeping their original index. The input dataframe is not modified.
    '''
    # Build a boolean mask rather than assigning the filtered text to
    # df[text_col]: that assignment would blank the rejected tweets in the
    # caller's dataframe as a side effect.
    keep = df[text_col].apply(lambda r: filter_noneco(r, ecolist)) != ''
    return df[keep]

def lemmatize(tweet):
    '''
    Function to lemmatize a tweet with the spaCy pipeline loaded in `nlp`:
    params:
        - tweet: the document itself.
    Output: the lower-cased lemma of every token, joined by single spaces.
    '''
    return ' '.join(token.lemma_.lower() for token in nlp(tweet))

def trail_ws(tweet):
    '''
    Function to collapse repeated spaces:
    params:
        - tweet: the document itself.
    Output: the tweet with every run of consecutive spaces replaced by a
            single space; other whitespace characters are left as they are.
    '''
    space_run = re.compile(' +')
    return space_run.sub(' ', tweet)

def remove_num(tweet):
    '''
    Function to remove the digits from a tweet:
    params:
        - tweet: the document itself.
    Output: the tweet without the characters for which str.isdigit() is true.
    '''
    return ''.join(filter(lambda ch: not ch.isdigit(), tweet))

# Matches, in this order: handles (@user), any character that is not an ASCII
# letter, digit, space or tab, and URLs (scheme://...). Written as a raw string
# so the regex escapes reach `re` untouched. The handle class includes "_"
# because Twitter user names may contain it; a class without it would leave
# the part after the underscore behind as a stray word.
_TWEET_NOISE = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")


def _clean_tweet(text, stopw):
    '''Normalise one document: accents, noise, case, digits and stop words.'''
    # Accents go first: afterwards every non-ASCII character becomes a space.
    text = accent_rem(text)
    text = ' '.join(_TWEET_NOISE.sub(' ', text).split())
    text = text.lower()
    # Dropping digits can leave double spaces behind; collapse them.
    text = trail_ws(remove_num(text))
    return remove_stops(text, stopw)


def treat_text(df, text_col, stopw = None, ecolist = None, date_col = 'date', sent_col = 'sentiment'):
    '''
    Function to treat the corpus columns:
    params:
        - df: dataframe to treat (not modified; a treated copy is returned).
        - text_col: name of the column with the documents.
        - stopw: iterable of stop words to drop from the documents
                 (none by default).
        - ecolist: not used by this function; kept so existing calls keep
                   working.
        - date_col: name of the column to parse as datetime. Skipped when it
                    is None or the column is not in df.
        - sent_col: name of the column from which commas are removed. Skipped
                    when it is None or the column is not in df.
    Output: Dataframe treated, without the rows whose document ends up empty
            and with a fresh positional index.
    '''
    # Sanity checks:
    df = df.fillna('')
    # One set shared by every row gives remove_stops constant-time lookups.
    stopw = frozenset(stopw) if stopw is not None else frozenset()

    # Formatting corpus columns (whatever column the caller names, if present):
    if date_col is not None and date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col])
    if sent_col is not None and sent_col in df.columns:
        df[sent_col] = df[sent_col].replace(',', '', regex=True)

    # Columns treatment:
    df[text_col] = df[text_col].apply(lambda r: _clean_tweet(r, stopw))

    # Filter:
    df = df[df[text_col] != '']

    return df.reset_index(drop=True)

def get_tweets(user, date_ini, date_end, stopw, ecolist):
    '''
    Function to get the treated tweets from a user given a period range.
    params:
        - user: twitter user name.
        - date_ini: first day of time window to retrieve tweets.
        - date_end: last date of time window to retrieve tweets.
        - stopw: stop words handed to treat_text.
        - ecolist: word list handed to treat_text.
    Output: dataframe with columns username, date and text, as returned by
            treat_text.
    '''
    # Search query for the scraper:
    query = ''.join(('from:', user, ' since:', date_ini, ' until:', date_end))

    # Twitter scraper (one [username, date, content] row per tweet):
    rows = [
        [tweet.user.username, tweet.date, tweet.content]
        for tweet in sntwitter.TwitterSearchScraper(query).get_items()
    ]

    # Tweets dataframe:
    raw = pd.DataFrame(rows, columns=['username', 'date', 'text'])
    return treat_text(raw, 'text', stopw, ecolist, date_col = 'date', sent_col = None)

In [53]:
# Relative location of the auxiliary word lists.
path = '../../../../context/SMI/data/utils'

# File names carry the leading '/' because they are appended to `path` as-is.
stops = ['/db_stopwords_spanish_1.txt', '/db_stopwords_spanish_2.txt']
ecofile = '/ecofilter.xlsx'
ecofiles = [ecofile]

# Stop words and economic vocabulary used by the cells below.
stopw = get_stops(stops, path)
eco_filter = get_ecofilter(path, ecofiles)

In [54]:
# TODO: 
# Add stop words to database
# Add ecolist to database

In [55]:
# Test: download and clean the tweets of one account over a short window.

user = 'dresponsable'
date_ini = '2015-10-21'
date_end = '2015-10-23'

df_tweets = get_tweets(
    user=user,
    date_ini=date_ini,
    date_end=date_end,
    stopw=stopw,
    ecolist=eco_filter,
)

In [56]:
# Show the cleaned tweets.
df_tweets

Unnamed: 0,username,date,text
0,dresponsable,2015-10-22 23:58:14+00:00,reciben semanal rse rsc quieres apuntar puedes...
1,dresponsable,2015-10-22 22:08:44+00:00,incita rse comprar diario responsable
2,dresponsable,2015-10-22 18:14:20+00:00,carmen alvarez arenas pp q rse
3,dresponsable,2015-10-22 17:56:34+00:00,directo periscope partidos politicos rse obser...
4,dresponsable,2015-10-22 17:48:40+00:00,carmen alvarez arenas rse
5,dresponsable,2015-10-22 17:45:42+00:00,directo periscope partidos politicos rse
6,dresponsable,2015-10-22 17:37:34+00:00,equipo siguiendo propuestas partidos politicos...
7,dresponsable,2015-10-22 17:37:05+00:00,vision rse
8,dresponsable,2015-10-22 17:28:34+00:00,vision
9,dresponsable,2015-10-22 17:22:58+00:00,llenazo acto observatorio rsc


In [57]:
# Show only the tweets that pass the economic-vocabulary filter.
get_ecotweets(df_tweets, eco_filter, 'text')

Unnamed: 0,username,date,text
3,dresponsable,2015-10-22 17:56:34+00:00,directo periscope partidos politicos rse obser...
5,dresponsable,2015-10-22 17:45:42+00:00,directo periscope partidos politicos rse
6,dresponsable,2015-10-22 17:37:34+00:00,equipo siguiendo propuestas partidos politicos...
14,dresponsable,2015-10-21 18:33:29+00:00,guia x afrontar contribucion empresas ods rse
