# Creates Geo-Spatial Dengue Dataset

In [None]:
%cd /home/manoelribeiro/PycharmProjects/GeoDiseaseTwitter/
from scripts.dengue_dataset_utils import str_to_indexes, load_data, match_all, treat_data_nn, path_leaf
from keras.models import load_model
from nnet.data_utils import Data
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import json
import glob
import time
import csv

- Config

In [None]:
# Column names for the tweet records. Not referenced by the visible cells;
# presumably the schema of the raw city files -- TODO confirm against load_data.
names = ["id_tweet", "timestamp", "geolocation", "id_user", "n_friends", "n_followers", "text", "reg1", "reg2", "state", "city"]

# Keywords handed to match_all() when flagging tweets of interest.
keywords = ["dengue", "aedes"]

# Trained Keras classifier used to label keyword tweets.
model = load_model("./nnet/logs/backup_model_best.hdf5")

# Network configuration; the cells below read config["data"]["alphabet"] and
# config["data"]["input_size"]. A context manager is used so the file handle
# is closed deterministically instead of being left to the garbage collector.
with open("./nnet/dengue.json") as config_file:
    config = json.load(config_file)

# Class names indexed by the argmax of the model output, so the order here
# must match the order of the model's output units.
labels = ["Campaign", "Personal", "Information", "Opinion", "Joke"]

# Maps the one-letter codes of the `class` column in the labeled TSV to class names.
labels_dict = {"c":"Campaign", "e":"Personal", "i":"Information", "o":"Opinion", "p":"Joke"}

# Character -> integer index lookup built from the configured alphabet.
# Indexes start at 1, leaving 0 unused by any alphabet character.
dict_alphabet = {
    char: idx
    for idx, char in enumerate(config["data"]["alphabet"], start=1)
}

## Makes processed files

In [None]:
############ CHANGE THIS ############
# Position of the first raw city file to process; files before it are skipped.
start = 0

# Master switch for the "make city files" loop over ./data/cidades_total/.
do_it = False
# Inside that loop: run the network and write ./data/cities_processed/<city>.csv.
do_nn = False
# Inside that loop: append "<city>,<count>" lines to ./data/city_count.
do_text = False

# Build ./data/_all_check.csv, the file to be annotated by hand.
make_check = False
# Merge the annotations from ./data/all_check.csv into ./data/cities/.
do_incorporate = True
#####################################

- Make city files 

In [None]:
# Builds one processed CSV per raw city file and (optionally) logs how many
# tweets of interest each city has.
#
# NOTE(review): ./data/city_count is appended to without a header row, while
# the plotting cell sorts that file by a "number" column -- confirm that a
# header is added by hand (or elsewhere) before plotting.
if do_it:
    # glob.glob returns paths in arbitrary order; sorting makes the position
    # of each file stable, so `start` skips the same files on every run.
    for count, city in enumerate(sorted(glob.glob("./data/cidades_total/*"))):
        if count < start:
            continue

        # Loads the data
        df = load_data(city)

        # Flags each tweet with the result of matching its text against `keywords`
        df.loc[:, "keyword"] = df.text.apply(lambda x: match_all(x, keywords)).values

        # Drop column "state"
        df = df.drop("state", axis=1)

        # Create "user_keywords" field: True for every tweet whose author has
        # at least one keyword tweet in this city file.
        user_keywords = df.groupby("id_user")["keyword"].aggregate(lambda x: np.any(x.values))
        user_keywords = user_keywords[user_keywords]
        # Vectorised membership test against the ids kept above (the Series
        # index), replacing a Python-level row-by-row apply.
        df.loc[:, "user_keywords"] = df["id_user"].isin(user_keywords.index)

        # Number of tweets written by users of interest (computed once, used twice).
        n_relevant = int(df["user_keywords"].sum())

        # Creates new files with neural networks
        if n_relevant > 0 and do_nn:
            # Encode every keyword tweet into the fixed-size integer input of the network.
            inputs = [
                list(treat_data_nn(text, config["data"]["input_size"], dict_alphabet))
                for text in df.loc[df["keyword"], "text"]
            ]
            inputs = np.asarray(inputs, dtype='int64')
            y_score = model.predict(inputs)

            # Non-keyword tweets are labelled "Other"; keyword tweets get the
            # class with the highest score (first one on ties).
            df.loc[:, "kw_type"] = "Other"
            df.loc[df["keyword"], "kw_type"] = [labels[i] for i in y_score.argmax(axis=1)]
            df.to_csv("./data/cities_processed/" + path_leaf(city) + ".csv")

        # Writes cities and number of relevant tweets into a file!
        if do_text:
            with open("./data/city_count", "a") as f:
                f.write("{0},{1}\n".format(city, n_relevant))

- Make file to be manually annotated

In [None]:
# Gathers every classified (non-"Other") tweet from the processed city files
# into a single CSV that can be reviewed and corrected by hand.
if make_check:
    # The empty seed frame fixes the leading column order (kw_type, text, city)
    # and keeps the result well-defined when no processed file exists.
    frames = [pd.DataFrame(columns=["kw_type","text", "city"])]
    for city in glob.glob("./data/cities_processed/*"):
        df = pd.read_csv(city, index_col=0)
        # Only tweets that received a class from the network need checking.
        df.drop(df[df.kw_type == "Other"].index.values, axis=0, inplace=True)
        # Remember the source file so annotations can be matched back later.
        df.loc[:, "city"] = city
        frames.append(df)
    # Concatenate once: calling pd.concat inside the loop re-copies the
    # accumulated frame on every iteration.
    df_all = pd.concat(frames, sort=False)
    # The index (row label inside the city file) is written out on purpose:
    # it is the key used to write annotations back into each city file.
    df_all.sort_values(["kw_type", "text"], axis=0).to_csv("./data/_all_check.csv", index=True)

- Incorporates manual annotations

In [None]:
# Overwrites the network labels with the manually checked ones and writes the
# final per-city files; also counts the distinct users across all cities.
users = set()
if do_incorporate:
    # Manually annotated file, indexed by the row label each tweet has inside
    # its own processed city file.
    df_all = pd.read_csv("./data/all_check.csv", index_col=0)
    for city in glob.glob("./data/cities_processed/*"):
        df = pd.read_csv(city, index_col=0)
        # In-place update avoids rebuilding the whole set for every city.
        users.update(df.id_user.values)
        # Annotations that belong to this city file (matched on the stored path).
        df_all_city = df_all.loc[df_all.city == city, :]
        df.loc[df_all_city.index.values, "kw_type"] = df_all_city["kw_type"]
        # Drop column "text" so the output files do not contain the tweet text
        df = df.drop("text", axis=1)
        df.to_csv("./data/cities/" + path_leaf(city) )
print(len(users))

- Final Adjustments to the Keyword Dataset

In [None]:
def _class_name(code):
    """Translate a one-letter class code into its full class name."""
    return labels_dict[code]


def _date_to_epoch_str(date_str):
    """Convert a "YYYY-MM-DD" string into a Unix timestamp string.

    time.mktime interprets the date as local time, so the result depends on
    the timezone of the machine running the notebook.
    """
    parsed = datetime.datetime.strptime(date_str.strip(), "%Y-%m-%d")
    return str(int(time.mktime(parsed.timetuple())))


# Load the labeled keyword tweets, align the column names with the city
# files, and discard the tweet text.
df = (
    pd.read_csv("./data/dengue/classificados_all_header.tsv", sep="\t")
    .rename(columns={"class": "kw_type", "id": "tweet_id", "date":"timestamp"})
    .drop("text", axis=1)
)

# Replace class codes with class names and dates with epoch-second strings.
df.loc[:, "kw_type"] = df.kw_type.apply(_class_name)
df.loc[:, "timestamp"] = df.timestamp.apply(_date_to_epoch_str)

df.to_csv("./data/keywords_final.csv")

## Plots number of tweets of interest per city

In [None]:
# Rank the cities by their number of tweets of interest (largest first).
# The file is expected to have a header row that includes a "number" column.
cities = pd.read_csv("./data/city_count")
cities = cities.sort_values(by="number", ascending=False).reset_index(drop=True)

# Rank of each city, used as the x coordinate of the scatter plot.
cities["numeric"] = cities.index

# Scatter plot of count vs. rank, with a logarithmic y axis.
ax = cities.plot(
    x="numeric",
    y="number",
    kind="scatter",
    figsize=(4.5,4),
    xticks=list(range(0, 301, 30)),
    xlim=[0,300],
    logy=True,
    yticks=[10**exponent for exponent in range(1, 7)],
    alpha=0.8,
)
ax.set_title("Number of Tweets of Interest per City")
ax.set_xlabel("i-th city")
ax.set_ylabel("#tweets of interest")

# Summary figures for the text.
print("Total number of cities:", len(cities))
print("Number of cities with any tweet of interest", (cities["number"] > 0).sum())

# Save the figure next to the data.
fig = ax.get_figure()
fig.savefig('./data/full_figure.pdf')

## Computes the prevalence of each class in the labeled data

- Keyword Tweets

In [None]:
from collections import Counter

# Labeled keyword tweets; the "class" column holds the one-letter class code.
df = pd.read_csv("./data/dengue/classificados_all_header.tsv", sep="\t")

# Absolute number of tweets per class.
tmp = dict(Counter(df["class"].values))

# Fraction of tweets per class.
total = sum(tmp.values())
tmp_p = {key: val / total for key, val in tmp.items()}

print(tmp)
print(tmp_p)

- Geolocated Tweets

In [None]:
from collections import Counter

# Manually checked geolocated tweets; "kw_type" holds the class name.
df = pd.read_csv("./data/all_check.csv")

# Absolute number of tweets per class.
tmp = dict(Counter(df["kw_type"].values))

# Fraction of tweets per class.
total = sum(tmp.values())
tmp_p = {key: val / total for key, val in tmp.items()}

print(tmp)
print(tmp_p)

---