In [1]:
import pandas as pd
import numpy as np
import glob
from difflib import SequenceMatcher
import re

# Helper functions

In [2]:
def addToPrev(prev, index, text):
    """Store ``text`` at position ``index`` of ``prev`` and return ``prev``.

    ``prev`` is modified in place; the returned object is the same one that
    was passed in.
    """
    prev[index] = text
    return prev

################################

def inPrev(prev, text, tot_sims, threshold=0.7):
    """Check whether ``text`` closely resembles any message in ``prev``.

    Args:
        prev: Iterable of previously kept messages (strings).
        text: Candidate message to compare against each entry of ``prev``.
        tot_sims: Dict mapping a kept message to the list of candidates that
            matched it. Updated in place when a match is found.
        threshold: Minimum ``SequenceMatcher`` ratio (inclusive) for two
            messages to count as similar. Defaults to 0.7.

    Returns:
        ``(matched, tot_sims)`` where ``matched`` is True as soon as the first
        entry of ``prev`` reaches the threshold, otherwise False.
    """
    for message in prev:
        matcher = SequenceMatcher(None, text, message)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so a pair that fails either bound cannot reach the
        # threshold and the expensive ratio() call is skipped.
        if (matcher.real_quick_ratio() >= threshold
                and matcher.quick_ratio() >= threshold
                and matcher.ratio() >= threshold):
            tot_sims.setdefault(message, []).append(text)
            return True, tot_sims
    return False, tot_sims

################################

def reduce_tweets(tweets, reduceBy):
    """Drop tweets that are near-duplicates of recently kept tweets.

    A fixed-size window of ``reduceBy`` slots (initially empty strings) holds
    the most recently kept tweets and is overwritten in round-robin order.
    A tweet is kept only when ``inPrev`` reports no similar entry in that
    window.

    Args:
        tweets: Iterable of tweet texts, processed in order.
        reduceBy: Number of slots in the comparison window.

    Returns:
        List of the tweets that were kept, in their original order.
    """
    window = [""] * reduceBy
    slot = 0
    similar = {}
    kept = []

    for candidate in tweets:
        is_duplicate, similar = inPrev(window, candidate, similar)
        if is_duplicate:
            continue
        # Overwrite the current slot, then advance it cyclically.
        window = addToPrev(window, slot, candidate)
        slot = (slot + 1) % reduceBy
        kept.append(candidate)

    return kept

################################

def check_sim(check_word, sims):
    """Fuzzy-match ``check_word`` against a fixed list of keywords.

    The word is compared with each keyword in turn using
    ``SequenceMatcher``; the first keyword whose ratio is at least 0.7 wins.

    Args:
        check_word: Word to test.
        sims: Dict updated in place with ``check_word -> ratio`` on a match.

    Returns:
        ``(True, sims)`` on the first match, otherwise ``(False, sims)``.
    """
    for keyword in ("mosquito", "protect", "repel", "spray"):
        score = SequenceMatcher(None, check_word, keyword).ratio()
        if score < 0.7:
            continue
        sims[check_word] = score
        return True, sims
    return False, sims

################################

def check_sim_v2(check_word, sims):
    """Regex-based keyword test for a single word.

    A word matches when it starts with the pattern ``m[a-z]+sq[a-z]+to``
    or when it contains any of the listed English/Spanish keywords anywhere.

    Args:
        check_word: Word to test.
        sims: List that receives ``check_word`` (appended in place) on a match.

    Returns:
        ``(True, sims)`` when the word matches, otherwise ``(False, sims)``.
    """
    keywords = ('repel', 'protect', 'spray',
                "proteger", "repeler", "rociar", "repelir")

    # The anchored mosquito pattern is tried first, then the keyword
    # substrings in their listed order.
    matched = bool(re.match('m[a-z]+sq[a-z]+to', check_word)) or any(
        re.search(keyword, check_word) for keyword in keywords
    )
    if matched:
        sims.append(check_word)
    return matched, sims

# Stats

In [3]:
# Glob pattern selecting the tweet CSV files that the processing loop reads.
path = "/Users/datacsv/*.csv"

In [4]:
# Template file: only its columns are used, to fix the layout of filtered_df.
temp_df = pd.read_csv("/Users/datacsv/tweets_2015_08_16.csv")
data = {}             # (city, state, country) -> dict of per-location counters
regex = re.compile('[^a-zA-Z]')  # removes every character that is not an ASCII letter
sims = []             # words that triggered a keyword match, in encounter order
matched_frames = []   # matching rows of each file; concatenated once after the loop

for file in glob.glob(path):
    df = pd.read_csv(file)
    # Treat the literal string "None" as missing, drop rows with no location
    # information at all, then blank out the remaining missing values.
    df[df == "None"] = np.nan
    df = df.dropna(subset=["City", "State", "Country"], how='all')
    df = df.fillna("")

    # Find number of total tweets in a location and add it to the running total.
    group_sums = df.groupby(["City", "State", "Country"])["Message"].size()
    for location, count in group_sums.items():
        stats = data.setdefault(location, {})
        stats['num_total'] = stats.get('num_total', 0) + count

    # Filter tweets: skip retweets and keep a row as soon as one of its words
    # (lower-cased, non-letters stripped) passes check_sim_v2.
    matched_labels = []
    for label, tweet in df["Message"].items():
        if tweet[0:2] == "RT":
            continue
        for word in tweet.lower().split():
            isSimilar, sims = check_sim_v2(regex.sub('', word), sims)
            if isSimilar:
                matched_labels.append(label)
                break
    # Selecting all matching rows at once avoids growing a DataFrame row by
    # row, which copies the whole frame on every insertion.
    if matched_labels:
        matched_frames.append(df.loc[matched_labels])

# Store the filtered tweets in filtered_df with a fresh 0..n-1 index and the
# template's column layout.
if matched_frames:
    filtered_df = pd.concat(matched_frames, ignore_index=True).reindex(columns=temp_df.columns)
else:
    filtered_df = pd.DataFrame(columns=temp_df.columns)
index = len(filtered_df)  # number of filtered rows
                    

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Count the filtered tweets per location and add them to `data` under
# 'filtered_total'. GroupBy.size() is the built-in group count, and iterating
# the result once avoids a separate MultiIndex lookup for every location.
group_sums = filtered_df.groupby(["City", "State", "Country"])["Message"].size()

for (city, state, country), count in group_sums.items():
    location = (city, state, country)
    stats = data.setdefault(location, {})
    stats['filtered_total'] = stats.get('filtered_total', 0) + count
    

In [6]:
# Display the per-location counters accumulated in the previous cells.
data

{('', '', 'Afghanistan'): {'num_total': 202, 'filtered_total': 11},
 ('', '', 'Algeria'): {'num_total': 696, 'filtered_total': 9},
 ('', '', 'Argentina'): {'num_total': 51622, 'filtered_total': 2520},
 ('', '', 'Australia'): {'num_total': 17577, 'filtered_total': 959},
 ('', '', 'Bangladesh'): {'num_total': 3247, 'filtered_total': 209},
 ('', '', 'Barbados'): {'num_total': 1985, 'filtered_total': 107},
 ('', '', 'Belize'): {'num_total': 658, 'filtered_total': 33},
 ('', '', 'Bolivia'): {'num_total': 4147, 'filtered_total': 172},
 ('', '', 'Brazil'): {'num_total': 200379, 'filtered_total': 4964},
 ('', '', 'Brunei'): {'num_total': 351, 'filtered_total': 17},
 ('', '', 'Bulgaria'): {'num_total': 589, 'filtered_total': 17},
 ('', '', 'Cambodia'): {'num_total': 396, 'filtered_total': 22},
 ('', '', 'Canada'): {'num_total': 41986, 'filtered_total': 2834},
 ('', '', 'Cayman Islands'): {'num_total': 755, 'filtered_total': 61},
 ('', '', 'Chile'): {'num_total': 30648, 'filtered_total': 2062},


In [7]:
# Build one summary row per location: total tweets, filtered tweets and the
# filtered share in percent. Rows are collected in a list and the DataFrame is
# created once, which is faster than inserting rows one at a time and lets
# pandas infer numeric dtypes for the count columns.
records = []
for location, counts in data.items():
    # A location without an overall count cannot yield a percentage; skip it.
    if 'num_total' not in counts:
        continue
    total_tweets = counts['num_total']
    # Locations with no filtered tweets simply have no 'filtered_total' entry.
    filtered_tweets = counts.get('filtered_total', 0)
    city, state, country = location
    records.append({
        'city': city,
        'state': state,
        'country': country,
        'total_tweets': total_tweets,
        'filtered_tweets': filtered_tweets,
        'percentage': filtered_tweets / total_tweets * 100,
    })

final_df = pd.DataFrame(records, columns=['city', 'state', 'country', 'total_tweets',
                                          'filtered_tweets', 'percentage'])
    

In [8]:
# Write the per-location summary to final_df.csv in the working directory.
final_df.to_csv("final_df.csv")

In [9]:
# Report the summary's dimensions, then preview its first rows.
print(final_df.shape)
final_df.head()

(4762, 6)


Unnamed: 0,city,state,country,total_tweets,filtered_tweets,percentage
0,,,Afghanistan,202,11,5.445545
1,,,Algeria,696,9,1.293103
2,,,Argentina,51622,2520,4.88164
3,,,Australia,17577,959,5.455994
4,,,Bangladesh,3247,209,6.436711


In [27]:
# Locations with at least 1000 tweets of which at least 10% were filtered.
# Both conditions are combined into one mask: chaining two boolean selections
# forces pandas to reindex the second mask against the already-filtered frame
# and emits a UserWarning.
final_df[(final_df['total_tweets'] >= 1000) & (final_df['percentage'] >= 10)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,city,state,country,total_tweets,filtered_tweets,percentage
178,,Punjab,Pakistan,1378,172,12.481858
224,Alpharetta,Georgia,United States,1396,168,12.034384
235,Appleton,Wisconsin,United States,1356,217,16.00295
370,Charleston,South Carolina,United States,2918,354,12.131597
405,Compton,California,United States,1361,232,17.046289
410,Coral Gables,Florida,United States,1652,243,14.709443
495,Fort Lauderdale,Florida,United States,5301,531,10.016978
496,Fort Myers,Florida,United States,1138,125,10.984183
538,Greenville,South Carolina,United States,1055,110,10.42654
599,Jacksonville,Florida,United States,6268,677,10.800893


In [15]:
# Overall counts across all locations: filtered tweets first, then all tweets.
for column in ('filtered_tweets', 'total_tweets'):
    print(final_df[column].sum())

277758
5492487


In [16]:
# Share of all counted tweets that passed the keyword filter, in percent.
matched_total = final_df['filtered_tweets'].sum()
overall_total = final_df['total_tweets'].sum()
print("Total percentage: {:.2f}%".format(matched_total / overall_total * 100))

Total percentage: 5.06%
