In [None]:
# Imports
import csv
import os
import time

import dotenv
import pandas as pd
from geotext import GeoText
from tweety import Twitter, filters

In [None]:
# Create the tweety client used by every search below. "session" is the name
# handed to Twitter(); presumably tweety uses it to persist login state between
# runs — TODO confirm against the tweety documentation.
app = Twitter("session")

In [None]:
# Pull variables from a local .env file into the process environment.
dotenv.load_dotenv()

# Either value is None when its variable is not set.
# Do not print these: cell outputs are stored inside the notebook file.
username = os.getenv('twitter_username')
password = os.getenv('twitter_password')

In [None]:
# Log in with the credentials read from the environment; either may be None if unset.
app.sign_in(username=username, password=password)

In [None]:
def search_tweets(query, label_text, year, pages=1, wait_time=2):
    """Search Spanish-language tweets that mention @policiaecuador in one calendar year.

    The search string sent to the module-level ``app`` client is::

        {query} {mentions} {exclusions} lang:es since:{year}-01-01 until:{year + 1}-01-01

    and results are requested with the "Latest" filter. No pagination cursor is
    passed, so every call starts a fresh search.

    Args:
        query: Search terms, e.g. several words joined with ``OR``.
        label_text: Category label of the query. Not used by the search itself;
            accepted so existing callers keep working.
        year: Calendar year to cover (an int or a numeric string).
        pages: Number of result pages to request (roughly 20 tweets per page).
        wait_time: Seconds to wait between page requests.

    Returns:
        Whatever ``app.search`` returns for the composed keyword.
    """
    # `until:` is an exclusive bound, so ending at December 31 would drop that
    # day. Use January 1 of the following year to cover the whole year.
    since = f'{year}-01-01'
    until = f'{int(year) + 1}-01-01'
    temporal_range = f'since:{since} until:{until}'

    language = 'es'

    # Accounts that must be mentioned, and accounts whose own tweets are excluded.
    mention = ['@policiaecuador']
    not_from = ['@policiaecuador']

    query_mention = ' OR '.join(mention)
    query_not_from = ' '.join(f'-from:{account}' for account in not_from)

    keyword = f'{query} {query_mention} {query_not_from} lang:{language} {temporal_range}'

    return app.search(
        keyword=keyword,
        filter_=filters.SearchFilters.Latest(),
        pages=pages,
        wait_time=wait_time,
    )

In [None]:
def load_into_dataframe(tweets, label_text) -> pd.DataFrame:
    """Flatten an iterable of tweets into a DataFrame with one row per tweet.

    Columns, in order: ``id`` (converted to int), ``date``, ``text`` (newlines
    replaced by spaces), ``url``, ``location`` and ``label``.

    ``location`` always holds a list of place names:
      * the cities GeoText finds in the tweet text, when there are any;
      * otherwise ``[tweet.place.name]`` when the tweet carries a place;
      * otherwise an empty list.

    Args:
        tweets: Iterable of objects exposing ``id``, ``date``, ``text``, ``url``
            and ``place`` attributes.
        label_text: Label written into the ``label`` column of every row.

    Returns:
        The assembled DataFrame; it has zero rows when ``tweets`` is empty.
    """
    # Plural names keep the builtin `id` usable inside the function.
    ids, dates, texts, urls, locations, labels = [], [], [], [], [], []

    for tweet in tweets:
        ids.append(int(tweet.id))
        dates.append(tweet.date)
        texts.append(str(tweet.text).replace("\n", " "))
        urls.append(tweet.url)
        labels.append(label_text)

        cities = GeoText(tweet.text).cities
        if cities:
            locations.append(cities)
        elif tweet.place is not None:
            # Wrapped in a list so every cell of the column has the same type.
            locations.append([tweet.place.name])
        else:
            locations.append([])

    return pd.DataFrame(
        data={
            "id": ids,
            "date": dates,
            "text": texts,
            "url": urls,
            "location": locations,
            "label": labels,
        }
    )

In [None]:
def save_tweets_as_csv(dataframe, label_text, keywords, path):
    """Write a label's tweets and the search queries that produced them.

    Two files are created inside ``path`` (the directory is created when missing):
      * ``tweets - {label_text}.csv``: the DataFrame without its index, with
        every non-numeric field quoted;
      * ``tweets - {label_text}.txt``: one search query per line, UTF-8 encoded.

    Args:
        dataframe: DataFrame to export.
        label_text: Label used in both file names.
        keywords: Iterable of search query strings.
        path: Target directory, relative or absolute.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    # os.path.join keeps absolute paths intact (prefixing "./" would not).
    base_name = os.path.join(path, f'tweets - {label_text}')

    dataframe.to_csv(f'{base_name}.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

    # Queries contain accented characters, so the encoding is fixed explicitly
    # instead of depending on the platform default.
    with open(f'{base_name}.txt', 'w', encoding='utf-8') as keyword_file:
        keyword_file.writelines(f'{keyword}\n' for keyword in keywords)

    # JSON alternative:
    # dataframe.to_json(f'{base_name}.json', orient='records', force_ascii=False, date_format='iso')

In [None]:
def read_tweets_from_csv(path, label_text):
    """Load ``tweets - {label_text}.csv`` from ``path`` and print its summary.

    Args:
        path: Directory holding the CSV, relative or absolute.
        label_text: Label that is part of the file name.

    Returns:
        The loaded DataFrame. No date parsing is requested from ``read_csv``;
        apply ``pd.to_datetime`` to the ``date`` column if timestamps are needed.
    """
    # os.path.join keeps absolute paths intact (prefixing "./" would not).
    csv_path = os.path.join(path, f'tweets - {label_text}.csv')
    tweets_df = pd.read_csv(csv_path)
    tweets_df.info()
    return tweets_df

In [None]:
# Search terms per crime category (the dict key is the label stored with each tweet).
# Commented-out categories are disabled for the current run.
search_keywords = {
    'robo': 'robo OR robando OR robaron OR robarle OR robado OR robada OR hurto OR hurtaron OR hurtado OR hurtada OR asalto OR asaltado OR asaltada OR asaltaron OR atraco OR atracaron OR atracado OR atracada',
    # 'asesinato' :'matan OR mataron OR asesinato OR asesinan OR asesinaron OR asesinado OR asesinada OR asesinadas OR asesinados OR femicidio OR sicariato OR acribillan OR acribillaron OR acribillado OR acribillada OR acribilladas OR acribillados OR homicidio',
    # 'secuestro' :'secuestraron OR secuestro OR secuestrada OR secuestrado OR secuestradas OR secuestrados',
    # 'terrorismo' :'terrorismo bomba OR explosivo OR bombas OR explosivos OR bombas OR explosivos OR atentados OR atentado OR terrorista OR terroristas OR tiroteo OR tiroteos',
    # 'extorsion' :'extorsión OR extorsion OR extorsionaron OR extorsionado OR extorsionada OR extorsionadas OR extorsionados OR vacuna OR vacunas',
}

# Results per label: {'tweets': DataFrame, 'keywords': [search string used for each year]}.
data = {}
years = range(2021, 2024)  # 2021-2023: the years with the highest crime rates
wait_time = 5  # seconds
pages = 2

for label_text, query in search_keywords.items():
    tweets = []
    keywords = []
    for year in years:
        print('year:', year, '\nlabel:', label_text, '\nquery:', query)
        result = search_tweets(query, label_text, year, pages=pages, wait_time=wait_time)
        # Pause between consecutive searches, on top of the per-page wait.
        time.sleep(wait_time)
        tweets.extend(result)
        keywords.append(result.keyword)
    data[label_text] = {
        'tweets': load_into_dataframe(tweets, label_text),
        'keywords': keywords,
    }
    # Report a count only; printing `data` would dump every DataFrame in full.
    print(f'{label_text}: {len(data[label_text]["tweets"])} tweets collected')

In [None]:
# Output directory for the not-yet-reviewed datasets (relative to the working directory).
path = "./../datasets/non-reviewed"

for label_text in search_keywords.keys():
    # DataFrame.info() prints its own summary and returns None, so wrapping it
    # in print() would add a stray "None" line.
    data[label_text]['tweets'].info()

    # Tip: pd.set_option('display.max_colwidth', None) shows full tweet text
    # when inspecting data[label_text]['tweets'].head().

    # Save the tweets and the search queries that produced them.
    save_tweets_as_csv(data[label_text]['tweets'], label_text, data[label_text]['keywords'], path)

In [None]:
# Read each saved CSV back and print its summary as a round-trip check.
for label_text in search_keywords:
    read_tweets_from_csv(path, label_text)