In [1]:
import glob
import re
from collections import deque
from difflib import SequenceMatcher

import pandas as pd

# Helper functions

In [2]:
def addToPrev(prev, index, text):
    """Overwrite slot ``index`` of ``prev`` with ``text`` and return ``prev``.

    ``prev`` is modified in place; the returned object is the same list that
    was passed in, not a copy.
    """
    prev[index] = text
    return prev

################################

def inPrev(prev, text, tot_sims):
    """Report whether ``text`` closely resembles any entry of ``prev``.

    Entries are tried in order and the search stops at the first one whose
    ``SequenceMatcher`` ratio against ``text`` is at least 0.7. On a hit,
    ``text`` is appended to the list stored under that entry in ``tot_sims``
    (the list is created on first use).

    Returns a ``(found, tot_sims)`` pair; ``tot_sims`` is the same dict object
    that was passed in.
    """
    first_hit = next(
        (
            candidate
            for candidate in prev
            if SequenceMatcher(None, text, candidate).ratio() >= 0.7
        ),
        None,
    )
    if first_hit is None:
        return False, tot_sims

    tot_sims.setdefault(first_hit, []).append(text)
    return True, tot_sims

################################

def reduce_tweets(tweets, reduceBy, threshold=0.7):
    """Drop tweets that nearly duplicate one of the most recently kept tweets.

    Every tweet is compared, via ``difflib.SequenceMatcher(...).ratio()``,
    against a sliding window holding the last ``reduceBy`` tweets that were
    kept. A tweet whose ratio against any window entry is ``>= threshold`` is
    discarded; otherwise it is kept and pushed into the window, evicting the
    oldest entry once the window is full.

    Parameters
    ----------
    tweets : iterable of str
        Tweets in the order they should be scanned.
    reduceBy : int
        Size of the sliding window. ``0`` disables filtering (every tweet is
        kept); a negative value raises ``ValueError``.
    threshold : float, default 0.7
        Minimum similarity ratio for a tweet to count as a duplicate.

    Returns
    -------
    list of str
        The tweets that survived, in their original order.
    """
    # A bounded deque starts empty, so no "" placeholders are needed: an empty
    # tweet can no longer "match" a placeholder, and a window size of 0 simply
    # never holds anything instead of failing on the first insert.
    recent = deque(maxlen=reduceBy)
    filtered_tweets = []

    for _tweet in tweets:
        is_duplicate = any(
            SequenceMatcher(None, _tweet, kept).ratio() >= threshold
            for kept in recent
        )
        if not is_duplicate:
            recent.append(_tweet)
            filtered_tweets.append(_tweet)

    return filtered_tweets

################################

def check_sim(check_word, sims):
    """Fuzzy-match ``check_word`` against the fixed keyword list.

    The keywords "mosquito", "protect", "repel" and "spray" are tried in that
    order. For the first one whose ``SequenceMatcher`` ratio against
    ``check_word`` reaches 0.7, the ratio is stored in ``sims`` under
    ``check_word`` and the search stops.

    Returns a ``(matched, sims)`` pair; ``sims`` is the dict that was passed in.
    """
    for keyword in ("mosquito", "protect", "repel", "spray"):
        score = SequenceMatcher(None, check_word, keyword).ratio()
        if score < 0.7:
            continue
        sims[check_word] = score
        return True, sims
    return False, sims

################################

def check_sim_v2(check_word, sims):
    """Regex-based keyword test for a single word.

    ``check_word`` counts as a hit when it either

    * starts with something matching ``m[a-z]+sq[a-z]+to`` (anchored at the
      beginning of the word only), or
    * contains 'repel', 'protect' or 'spray' anywhere.

    On a hit the word is appended to the ``sims`` list. Returns a
    ``(matched, sims)`` pair; ``sims`` is the list that was passed in.
    """
    matched = re.match('m[a-z]+sq[a-z]+to', check_word) is not None
    if not matched:
        matched = any(
            re.search(keyword, check_word) is not None
            for keyword in ('repel', 'protect', 'spray')
        )

    if matched:
        sims.append(check_word)
    return matched, sims

# Collect keyword-matching tweets

In [3]:
# Glob pattern for the CSV files that the loading cell below reads.
# NOTE(review): this is an absolute, machine-specific location — adjust it
# before running the notebook on another machine.
path = "/Users/datacsv/*.csv"

In [4]:
# Collect every English, non-retweet message that contains at least one
# keyword-like word (as decided by check_sim_v2).
tweets = []   # original text of each matching tweet, in scan order
sims = []     # every cleaned word that triggered a match (duplicates included)
regex = re.compile('[^a-zA-Z]')  # strips everything but ASCII letters from a token

# glob.glob returns paths in arbitrary order. The later near-duplicate filter
# works on a sliding window and is therefore order-dependent, so sort the
# files to make the whole pipeline reproducible.
for file in sorted(glob.glob(path)):
    df = pd.read_csv(file, sep=',', usecols=['Message', 'Language'])
    # Empty Message cells are read as NaN (a float) and would break the string
    # operations below, so drop them and make sure the rest are real strings.
    df_tweets = df.loc[df["Language"] == "en", "Message"].dropna().astype(str)

    for tweet in df_tweets:
        # Skip retweets (messages beginning with "RT").
        if tweet.startswith("RT"):
            continue
        for word in tweet.lower().split():
            isSimilar, sims = check_sim_v2(regex.sub('', word), sims)
            if isSimilar:
                # One matching word is enough; keep the tweet once and move on.
                tweets.append(tweet)
                break

In [5]:
# Report how many tweets were collected by the keyword scan.
# NOTE(review): an earlier note here read "#357052"; it does not agree with the
# output recorded for this cell, so it is presumably from an older run.
matched_count = len(tweets)
print("Number of tweets that contain mosquito or a similar word is: {}".format(matched_count))

Number of tweets that contain mosquito or a similar word is: 454661


In [6]:
# Persist every keyword-matching tweet (before near-duplicate removal) as a
# single-column CSV; the row index is written as well (pandas default).
all_tweets_df = pd.DataFrame({"tweets": tweets})
all_tweets_df.to_csv("all_tweets.csv")

### Remove redundant tweets

In [7]:
# Remove near-duplicates by comparing each tweet with the 10 most recently
# kept ones, then report how many tweets remain.
filtered_tweets = reduce_tweets(tweets, reduceBy=10)
remaining_count = len(filtered_tweets)
print("Number of tweets after prev 10 algorithm: {}".format(remaining_count))


Number of tweets after prev 10 algorithm: 260654


In [8]:
# Persist the de-duplicated tweets as a single-column CSV; the row index is
# written as well (pandas default).
filtered_df = pd.DataFrame({"filtered_tweets": filtered_tweets})
filtered_df.to_csv("filtered_tweets.csv")


### Save the distinct matched words (sims)

In [13]:
# Write the distinct matched words to disk. A set cannot be used directly as a
# DataFrame column (pandas rejects it as unordered), and its iteration order
# is not reproducible anyway, so export a sorted list of the unique words.
pd.DataFrame({"sims": sorted(set(sims))}).to_csv("sims.csv")