# Parse results

## Imports

In [0]:
import boto3
import pandas as pd

## Utils

In [0]:
def endswith_backslash(str1):
  return str1.endswith("/")

def get_dirlist_s3(prefix: str,
                   client=boto3.client('s3'),
                   bucket='REDACTED'):
  """
  args:
      client: boto3 client (s3)
      bucket (str): name of bucket
      prefix (str): path without bucket (endswith backslash)
  """
  if not endswith_backslash(prefix):
    raise ValueError("prefix must end with backslash")
  return client.list_objects(Bucket=bucket, Prefix=prefix, Delimiter='/')

In [0]:
file_names = []
dates = ["2022-01-23", "2022-01-25", "2022-02-01", "2022-02-03", "2022-02-06", "2022-02-08", "2022-02-09", "2022-02-10", "2022-02-13", "2022-02-14", "2022-02-15", "2022-02-16", "2022-02-18"]
for date in dates:
  raw_data = get_dirlist_s3(f'orbl/twint/amazon/dt={date}/')
  temp_file_names = ["s3://REDACTED/" + dct["Key"] for dct in raw_data["Contents"]]
  file_names += temp_file_names
file_names[:5]

In [0]:
df = pd.read_csv(file_names[0])
for path in file_names[1:]:
  temp_df = pd.read_csv(path)
  df = pd.concat([df, temp_df], axis=0)
df = df.drop_duplicates()

# Filter out anomalies
* by modified_date
* by # tweets above threshold

In [0]:
TWEETS_THRESHOLD = 10
df["modified_date"] = df.date.apply(lambda date: date[:10])
count_df = df\
  .groupby(["type", "modified_date"]).size()\
  .reset_index()\
  .rename(columns={0: "count"})
count_df = count_df[(count_df["count"] > TWEETS_THRESHOLD) & 
                   (count_df["modified_date"] != "2022-01-13") & 
                   (count_df["modified_date"] != "2022-01-14") & 
                   (count_df["modified_date"] != "2022-02-18") & 
                   (count_df["modified_date"] != "2022-02-19")]

In [0]:
tweets_df = df\
    .merge(count_df, 
           on="modified_date", 
           how="inner")\
   .rename(columns={'type_x': "type"})[df.columns]
tweets_df.head()

Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,cashtags,user_id,user_id_str,username,name,day,hour,link,urls,photos,video,thumbnail,retweet,nlikes,nreplies,nretweets,quote_url,search,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,query,type,modified_date
0,1485344539074301957,1485342419159855109,1642969000000.0,2022-01-23 20:11:49,0,,@spikepoint The one thing I hate about Amazon ...,en,[],[],1269508676823519233,1269508676823519233,suburbancoyotee,theresa!,7,20,https://twitter.com/suburbancoyotee/status/148...,[],[],0,,False,0,1,0,,amazon app,,,,,,,"[{'screen_name': 'spikepoint', 'name': 'Capt. ...",,,,,amazon app,application,2022-01-23
1,1485344539074301957,1485342419159855109,1642969000000.0,2022-01-23 20:11:49,0,,@spikepoint The one thing I hate about Amazon ...,en,[],[],1269508676823519233,1269508676823519233,suburbancoyotee,theresa!,7,20,https://twitter.com/suburbancoyotee/status/148...,[],[],0,,False,0,1,0,,amazon app,,,,,,,"[{'screen_name': 'spikepoint', 'name': 'Capt. ...",,,,,amazon app,application,2022-01-23
2,1485343775241158662,1481322928662884364,1642969000000.0,2022-01-23 20:08:47,0,,@graanhay Para que podamos guiarte de la mejor...,es,[],[],85741735,85741735,AmazonHelp,Amazon Help,7,20,https://twitter.com/AmazonHelp/status/14853437...,[],[],0,,False,0,1,0,,amazon app,,,,,,,"[{'screen_name': 'graanhay', 'name': 'Gra 🇦🇷🇦🇷...",,,,,amazon app,application,2022-01-23
3,1485343775241158662,1481322928662884364,1642969000000.0,2022-01-23 20:08:47,0,,@graanhay Para que podamos guiarte de la mejor...,es,[],[],85741735,85741735,AmazonHelp,Amazon Help,7,20,https://twitter.com/AmazonHelp/status/14853437...,[],[],0,,False,0,1,0,,amazon app,,,,,,,"[{'screen_name': 'graanhay', 'name': 'Gra 🇦🇷🇦🇷...",,,,,amazon app,application,2022-01-23
4,1485343321883090949,1481322928662884364,1642968000000.0,2022-01-23 20:06:58,0,,@AmazonHelp La app está en el servicio de Tele...,es,[],[],1427811495048617989,1427811495048617989,graanhay,Gra 🇦🇷🇦🇷,7,20,https://twitter.com/graanhay/status/1485343321...,[],[],0,,False,0,1,0,,amazon app,,,,,,,"[{'screen_name': 'AmazonHelp', 'name': 'Amazon...",,,,,amazon app,application,2022-01-23


In [0]:
import datetime
today = datetime.datetime.today().strftime("%Y-%m-%d")
path = f"s3://REDACTED/orbl/twint/amazon/processed_not_labeled/dt={today}/tweets.csv"
tweets_df.to_csv(path, index=False)

In [0]:
tweets_df.tweet.iloc[0]