In [None]:
import pandas as pd
import glob
from difflib import SequenceMatcher
import re

# Helper functions

In [None]:
def addToPrev(prev, index, text):
    """Overwrite slot ``index`` of ``prev`` with ``text`` and return ``prev``.

    The list is modified in place; the returned object is the very same list
    that was passed in, so callers may write ``prev = addToPrev(prev, ...)``.
    """
    prev[index] = text
    return prev

def inPrev(prev, text, tot_sims):
    """Check whether ``text`` closely resembles any entry of ``prev``.

    Entries are examined in list order. For the first entry whose
    ``SequenceMatcher`` ratio against ``text`` is at least 0.7, ``text`` is
    appended to ``tot_sims[entry]`` (the list is created on demand) and the
    search stops.

    Returns:
        A ``(found, tot_sims)`` tuple. ``found`` is True when a similar entry
        was located; ``tot_sims`` is the same dict that was passed in.
    """
    for candidate in prev:
        similarity = SequenceMatcher(None, text, candidate).ratio()
        if similarity < 0.7:
            continue
        # Group the new text under the stored message it resembles.
        bucket = tot_sims.setdefault(candidate, [])
        bucket.append(text)
        return True, tot_sims
    return False, tot_sims

def reduce_tweets(tweets, reduceBy):
    """Drop tweets that closely resemble one of the most recently kept tweets.

    Each tweet is compared (via ``inPrev``) against a sliding window holding
    up to ``reduceBy`` previously kept tweets. A tweet that is not similar to
    any of them is kept and replaces the oldest entry of the window.

    Args:
        tweets: Iterable of tweet strings, processed in order.
        reduceBy: Size of the comparison window. A value <= 0 means there is
            nothing to compare against, so every tweet is kept.

    Returns:
        A new list with the kept tweets in their original order.
    """
    if reduceBy <= 0:
        # An empty window cannot detect duplicates; previously this case
        # failed with IndexError as soon as the first tweet was stored.
        return list(tweets)

    # Start with an empty window instead of "" placeholders: an empty string
    # compared with an empty placeholder has ratio 1.0 and would be wrongly
    # discarded as a duplicate of a slot that was never filled.
    prev = []
    index = 0
    tot_sims = {}  # filled by inPrev; not returned by this function
    filtered_tweets = []

    for _tweet in tweets:
        in_prev, tot_sims = inPrev(prev, _tweet, tot_sims)
        if in_prev:
            continue
        if len(prev) < reduceBy:
            # Window still filling up.
            prev.append(_tweet)
        else:
            # Window full: overwrite the oldest slot (ring buffer).
            prev = addToPrev(prev, index, _tweet)
            index = (index + 1) % reduceBy
        filtered_tweets.append(_tweet)

    return filtered_tweets

def check_sim(check_word, sims):
    """Test whether ``check_word`` is close to one of the target keywords.

    The keywords are tried in a fixed order. For the first keyword whose
    ``SequenceMatcher`` ratio against ``check_word`` reaches 0.7, that ratio
    is stored in ``sims[check_word]`` and the search ends.

    Returns:
        A ``(matched, sims)`` tuple; ``sims`` is the dict that was passed in.
    """
    for keyword in ("mosquito", "protect", "repel", "spray"):
        score = SequenceMatcher(None, check_word, keyword).ratio()
        if score < 0.7:
            continue
        sims[check_word] = score
        return True, sims
    return False, sims

# Store Tweets

In [None]:
# Glob pattern selecting the input CSV files to load.
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
path = "/Users/datacsv/*.csv"

In [None]:
# Collect every English, non-retweet message that contains at least one word
# close to a target keyword (see check_sim). `sims` accumulates the matched
# words together with their similarity ratio.
tweets = []
regex = re.compile('[^a-zA-Z]')  # strips everything except ASCII letters
sims = {}
# glob returns files in a filesystem-dependent order; sort so the tweet order
# (and therefore the later windowed de-duplication) is reproducible.
for file in sorted(glob.glob(path)):
    df = pd.read_csv(file, sep=',', usecols=['Message', 'Language'])
    # Empty Message cells are read as NaN (a float); drop them so the string
    # operations below cannot raise TypeError.
    df_tweets = df.loc[df["Language"] == "en", "Message"].dropna().astype(str)

    for _tweet in df_tweets:
        # Skip retweets.
        if _tweet.startswith("RT"):
            continue
        for word in _tweet.lower().split():
            isSimilar, sims = check_sim(regex.sub('', word), sims)
            if isSimilar:
                # One matching word is enough; keep the tweet once.
                tweets.append(_tweet)
                break

In [None]:
# Report how many tweets passed the keyword filter. Note that check_sim also
# accepts words close to "protect", "repel" and "spray", not only "mosquito".
print("Number of tweets that contain mosquito or a similar word is: {}".format(len(tweets)))
# 357052 — presumably the count printed by an earlier run; TODO confirm

In [None]:
# Persist the keyword-matched tweets as a single-column CSV in the working
# directory (to_csv also writes the DataFrame index unless index=False is given).
pd.DataFrame({"tweets": tweets}).to_csv("all_tweets.csv")

### Reduce Redundancy

In [None]:
# Remove near-duplicates: each tweet is compared against a window of 10
# previously kept tweets and dropped when it is too similar to one of them.
filtered_tweets = reduce_tweets(tweets, 10)
print("Number of tweets after prev 10 algorithm: {}".format(len(filtered_tweets)))


In [None]:
# Persist the de-duplicated tweets as a single-column CSV in the working directory.
pd.DataFrame({"filtered_tweets": filtered_tweets}).to_csv("filtered_tweets.csv")


### Similarity scores of matched words (sims)

In [None]:
# Rank the recorded words from highest to lowest similarity ratio.
# An ascending sort followed by a reversal (rather than reverse=True) is used
# so that words with equal ratios come out in reverse insertion order.
keys = sorted(sims, key=sims.get)[::-1]
# Ratios aligned position-by-position with `keys`.
values = [sims[word] for word in keys]

In [None]:
# Persist the matched words ("keys") and their similarity ratios ("values"),
# ordered from highest to lowest ratio, as a CSV in the working directory.
pd.DataFrame({"keys": keys, "values": values}).to_csv("sims.csv")