In [1]:
import pandas as pd
import glob
from difflib import SequenceMatcher
import re

# Helper functions

In [2]:
def addToPrev(prev, index, text):
    prev[index] = text
    return prev

################################

def inPrev(prev, text, tot_sims):
    for message in prev:
        if SequenceMatcher(None, text, message).ratio() >= 0.7:
            tot_sims.setdefault(message, []).append(text)
            return True, tot_sims
    return False, tot_sims

################################

def reduce_tweets(tweets, reduceBy): 
    prev = [""]*reduceBy
    index = 0
    tot_sims = {}
    filtered_tweets = []
    
    for _tweet in tweets:
        in_prev, tot_sims = inPrev(prev, _tweet, tot_sims)
        if not in_prev:
            prev = addToPrev(prev, index, _tweet)
            index = (index + 1) % reduceBy
            filtered_tweets.append(_tweet)
    
    return filtered_tweets

################################

def check_sim(check_word, sims):
    words = ["mosquito", "protect", "repel", "spray"]
    for word in words:
        ratio = SequenceMatcher(None, check_word, word).ratio()
        if ratio >= 0.7:
            sims[check_word] = ratio
            return True, sims
    return False, sims

################################

def check_sim_v2(check_word, sims, lang):
    if re.compile('m[a-z]+sq[a-z]+to').match(check_word):
        sims.append(check_word)
        return True, sims
    
    words = ['repel', 'protect', 'spray']
    words_es = ["proteger", "repeler", "rociar"]
    words_pt = ["proteger", "repelir", "spary"]
    if lang == "es":
        words = words_es
    elif lang == "pt":
        words = words_pt
        
    for word in words:
        if re.compile(word).search(check_word):
            sims.append(check_word)
            return True, sims
    
    return False, sims

# Store Tweets

In [3]:
path = "/Users/datacsv/*.csv"

In [4]:
tweets_es = []
tweets_pt = []
regex = re.compile('[^a-zA-Z]')
sims_es = []
sims_pt = []
for file in glob.glob(path):
    df = pd.read_csv(file, sep=',', usecols=['Message', 'Language'])
    df_tweets_es = df[df["Language"] == "es"]["Message"]
    df_tweets_pt = df[df["Language"] == "pt"]["Message"]
    
    for tweet in df_tweets_es:
        if tweet[0:2] != "RT":
            for word in tweet.lower().split():
                isSimilar, sims_es = check_sim_v2(regex.sub('', word), sims_es, "es")
                if(isSimilar):
                    tweets_es.append(tweet)
                    break
                    
    for tweet in df_tweets_pt:
        if tweet[0:2] != "RT":
            for word in tweet.lower().split():
                isSimilar, sims_pt = check_sim_v2(regex.sub('', word), sims_pt, "pt")
                if(isSimilar):
                    tweets_pt.append(tweet)
                    break

In [5]:
print("Number of tweets that contain mosquito or a similar word is:")
print("es: {}".format(len(tweets_es)))
print("pt: {}".format(len(tweets_pt)))


Number of tweets that contain mosquito or a similar word is:
es: 121373
pt: 43401


In [6]:
pd.DataFrame({"tweets_es": tweets_es}).to_csv("all_tweets_es.csv")
pd.DataFrame({"tweets_pt": tweets_pt}).to_csv("all_tweets_pt.csv")

### Remove redundant tweets

In [7]:
filtered_tweets_es = reduce_tweets(tweets_es, 10)
filtered_tweets_pt = reduce_tweets(tweets_pt, 10)

In [8]:
print("Number of tweets after prev 10 algorithm:")
print("es: {}".format(len(filtered_tweets_es)))
print("pt: {}".format(len(filtered_tweets_pt)))

Number of tweets after prev 10 algorithm:
es: 59810
pt: 26770


In [9]:
pd.DataFrame({"filtered_tweets_es": filtered_tweets_es}).to_csv("filtered_tweets_es.csv")
pd.DataFrame({"filtered_tweets_pt": filtered_tweets_pt}).to_csv("filtered_tweets_pt.csv")


### Sims

In [10]:
pd.DataFrame({"sims_es": list(set(sims_es))}).to_csv("sims_es.csv")
pd.DataFrame({"sims_pt": list(set(sims_pt))}).to_csv("sims_pt.csv")