In [13]:
import pandas as pd
import glob
from difflib import SequenceMatcher
import time
import re

In [2]:
path = "/Users/datacsv/*.csv"
test = "/Users/datacsv/tweets_2016_09_28.csv"

## Helper functions

In [8]:
def check_sim(check_word, lang):
    words = ["mosquito", "protect", "repel", "spary"]
    words_es = ["mosquito", "proteger", "repeler", "rociar"]
    words_pt = ["mosquito", "proteger", "repelir", "spary"]
    
    if lang == "es":
        words = words_es
    elif lang == "pt":
        words = words_pt
    
    for word in words:
        if SequenceMatcher(None, check_word, word).ratio() >= 0.7:
            return True
    return False

def get_tweets(path, lang):
    tweets = []
    sims = []
    regex = re.compile('[^a-zA-Z]')
    for file in glob.glob(path):
        df = pd.read_csv(file)
        messages = df[df['Language'] == lang]['Message']
        
        for tweet in messages:
            if tweet[0:2] != "RT":
                for word in tweet.lower().split():
                    if check_sim(regex.sub('', word), lang):
                        tweets.append(tweet)
                        sims.append(regex.sub('', word))
                        break
    return tweets, sims

def addToPrev(prev, index, text):
    prev[index] = text
    return prev

def inPrev(prev, text, tot_sims):
    for message in prev:
        if SequenceMatcher(None, text, message).ratio() >= 0.7:
            tot_sims.setdefault(message, []).append(text)
            return True, tot_sims
    return False, tot_sims

def reduce_tweets(tweets, reduceBy): 
    prev = [""]*reduceBy
    index = 0
    tot_sims = {}
    filtered_tweets = []
    
    for _tweet in tweets:
        in_prev, tot_sims = inPrev(prev, _tweet, tot_sims)
        if not in_prev:
            prev = addToPrev(prev, index, _tweet)
            index = (index + 1) % reduceBy
            filtered_tweets.append(_tweet)
    
    return filtered_tweets

## Reading and storing data

In [4]:
tweets_es, sims_es = get_tweets(path, "es")
tweets_pt, sims_pt = get_tweets(path, "pt")

  if self.run_code(code, result):


In [5]:
print("Number of Spanish tweets: {}".format(len(tweets_es)))
print("Number of Portuguese tweets: {}".format(len(tweets_pt)))

Number of Spanish tweets: 147169
Number of Portuguese tweets: 82380


In [6]:
pd.DataFrame({"tweets": tweets_es}).to_csv("tweets_es.csv")
pd.DataFrame({"tweets": tweets_pt}).to_csv("tweets_pt.csv")
pd.DataFrame({"sims": sims_es}).to_csv("sims_es.csv")
pd.DataFrame({"sims": sims_pt}).to_csv("sims_pt.csv")

## Remove redundancy

In [9]:
filtered_tweets_es = reduce_tweets(tweets_es, 10)
filtered_tweets_pt = reduce_tweets(tweets_pt, 10)

In [10]:
print("Number of tweets after prev 10 algorithm:\nSpanish: {}\nPortuguese: {}".format(len(filtered_tweets_es), len(filtered_tweets_pt)))


Number of tweets after prev 10 algorithm:
Spanish: 74332
Portuguese: 57448


In [11]:
pd.DataFrame({"tweets": filtered_tweets_es}).to_csv("filtered_tweets_es.csv")
pd.DataFrame({"tweets": filtered_tweets_pt}).to_csv("filtered_tweets_pt.csv")