In [53]:
import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
import utils
import pandas as pd
import emoji
import glob
import ast
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [54]:
s3 = utils.S3_Manager()
# s3.retrieve_from_bucket()

#### Cleaning

In [55]:
def centralized_cleaning(text_li, stopwords, lemma):
    ### remove stopwords 
    return [lemma.lemmatize(x) for x in text_li if x not in set(stopwords)]

def cleaning_df(df_text_field):
    ### Demojize -- replace emojis with text 
    text = df_text_field.apply(lambda x: emoji.demojize(str(x)))
    ### Hashtags -- remove hashtags and underscores but keep their terminology (alternative is to replace the hashtag with 'HASHTAG' label, but you lose data with this alternative)
    ### Mentions -- replace mentions with "MENTION"; who they are mentioning does not matter for our purposes (we don't have the scope to make that work), but that they are mentioning someone does
    ### Removed punctuation and transformed text to lowercase
    text = text.str.replace('#', '').str.replace(r'_|-', ' ', regex = True).str.lower().str.replace(r'@[^\s]+', 'MENTION', regex = True).str.replace(r'[^a-zA-Z0-9\s]', '', regex = True)
    ### Convert from string into list
    tw = TweetTokenizer()
    text_li = text.apply(lambda x: tw.tokenize(x))
    ### Remove Stopwords and lemmatize
    stops = stopwords.words('english')
    lemmy = WordNetLemmatizer()
    cleaned_text_li = text_li.apply(lambda x: centralized_cleaning(x, stops, lemmy))
    return cleaned_text_li

#### Checking with one dataset

In [56]:
df = pd.read_csv('./inputs/avengers_subreddit.csv')
df

Unnamed: 0,submission_id,threadText
0,i9c6di7,The first Avengers might still be my favorite ...
1,i9bncpq,"The OG avengers was awesome, helping set the s..."
2,i9bmot8,NWH doesn’t hold a candle to Avengers
3,i9bnity,"I cried multiple times during NWH, only once d..."
4,i9cdhlk,Congrats on becoming a father!
...,...,...
28684,hklkfw7,“What would you do… if you only had… one more ...
28685,hk9dkvo,Platform exclusives are a cancer on the gaming...
28686,hk9txn7,Would have loved spider-man on PC. Don't under...
28687,hk82lyr,Isn’t July-September third quarter of the year...


In [57]:
cleaned_df = cleaning_df(df['threadText'])
cleaned_df

0        [first, avenger, might, still, favorite, mcu, ...
1        [og, avenger, awesome, helping, set, stage, ye...
2                     [nwh, doesnt, hold, candle, avenger]
3        [cried, multiple, time, nwh, avenger, granted,...
4                             [congrats, becoming, father]
                               ...                        
28684                        [would, one, frog, im, dying]
28685    [platform, exclusive, cancer, gaming, industry...
28686    [would, loved, spider, man, pc, dont, understa...
28687    [isnt, july, september, third, quarter, year, ...
28688    [give, good, idea, america, math, education, l...
Name: threadText, Length: 28689, dtype: object

In [58]:
df['threadText'].iloc[28687]

'Isn’t July-September third quarter of the year? Or had 2020 messed us up that much we have more months in a year x_x'

In [59]:
cleaned_df.iloc[28687]

['isnt',
 'july',
 'september',
 'third',
 'quarter',
 'year',
 '2020',
 'messed',
 'u',
 'much',
 'month',
 'year',
 'x',
 'x']

Will be looking into emoticon-text mapping. Aside from that, things look good.

#### Application of Cleaning Function

Need to align the fields and fieldnames used across datasets.

In [60]:
def ready_twitter(df):
    df['public_metrics'] = df['public_metrics'].apply(lambda x: ast.literal_eval(x))
    out = df[['tweetid','tweet','created_at']]
    out['likes'] = df['public_metrics'].apply(lambda x: x.get('like_count'))
    out['retweets'] = df['public_metrics'].apply(lambda x: x.get('retweet_count'))
    out['replies'] = df['public_metrics'].apply(lambda x: x.get('reply_count'))
    out['source'] = 'Twitter'
    out.columns = ['id','text','created_at','likes','retweets','replies','source']
    return out

def ready_reddit(df):
    df.columns = ['id','text']
    df['source'] = 'Reddit'
    return df

def ready_youtube(df):
    df = df[['id','text','published_time','likes','replies']]
    df.columns = ['id','text','created_at','likes','replies']
    df['source'] = 'YouTube'
    return df

In [61]:
files = glob.glob('./inputs/**')
output_df = pd.DataFrame()
for i, file in enumerate(files):
    print(f'File {i}, {os.path.basename(file)}')
    if 'reddit' in file:
        readied_df = ready_reddit(pd.read_csv(file))
    elif 'tweet' in file:
        readied_df = ready_twitter(pd.read_csv(file))
    elif 'youtube' in file:
        readied_df = ready_youtube(pd.read_csv(file))
    else:
        raise Exception("That's now supposed to happen")

    output_df = pd.concat((output_df, readied_df), axis = 0, ignore_index=True)

File 0, unique_tweet0509_0521.csv
File 1, youtube_2022-05-21 20:07:10.521156.csv
File 2, marveltheories_subreddit.csv
File 3, marvelstudios_subreddit.csv
File 4, avengers_subreddit.csv


In [62]:
output_df.shape

(844085, 7)

In [63]:
output_df.head()

Unnamed: 0,id,text,created_at,likes,retweets,replies,source
0,1525736805261262848,RT @driiftyfilm: This is a hot take but I kind...,2022-05-15 07:16:15+00:00,0,1210.0,0.0,Twitter
1,1525736784985849856,RT @NerdNews12345: Random Person: Marvel movie...,2022-05-15 07:16:11+00:00,0,1078.0,0.0,Twitter
2,1525736712089092096,👀 Was just thinking about this before goin' to...,2022-05-15 07:15:53+00:00,0,0.0,0.0,Twitter
3,1525736658976522241,Before I even heard of the #NFTCommunity &amp;...,2022-05-15 07:15:40+00:00,1,0.0,0.0,Twitter
4,1525736413706260480,I'm not even a die-hard Marvel anti but damn s...,2022-05-15 07:14:42+00:00,0,0.0,0.0,Twitter


In [64]:
output_df[output_df['source'] == 'Reddit']

Unnamed: 0,id,text,created_at,likes,retweets,replies,source
760277,i9c64y4,I like the theory but at this point i guess sh...,,,,,Reddit
760278,i9bzzok,hmmm intriguing! I like this theory!,,,,,Reddit
760279,i9ca9mx,There would have made a wink at the audience a...,,,,,Reddit
760280,i9d1ckw,I like time stone better. If they come back (I...,,,,,Reddit
760281,i9cfhz3,I think this works only if he uses an infinity...,,,,,Reddit
...,...,...,...,...,...,...,...
844080,hklkfw7,“What would you do… if you only had… one more ...,,,,,Reddit
844081,hk9dkvo,Platform exclusives are a cancer on the gaming...,,,,,Reddit
844082,hk9txn7,Would have loved spider-man on PC. Don't under...,,,,,Reddit
844083,hk82lyr,Isn’t July-September third quarter of the year...,,,,,Reddit


In [65]:
output_df[output_df['source'] == 'Twitter']

Unnamed: 0,id,text,created_at,likes,retweets,replies,source
0,1525736805261262848,RT @driiftyfilm: This is a hot take but I kind...,2022-05-15 07:16:15+00:00,0,1210.0,0.0,Twitter
1,1525736784985849856,RT @NerdNews12345: Random Person: Marvel movie...,2022-05-15 07:16:11+00:00,0,1078.0,0.0,Twitter
2,1525736712089092096,👀 Was just thinking about this before goin' to...,2022-05-15 07:15:53+00:00,0,0.0,0.0,Twitter
3,1525736658976522241,Before I even heard of the #NFTCommunity &amp;...,2022-05-15 07:15:40+00:00,1,0.0,0.0,Twitter
4,1525736413706260480,I'm not even a die-hard Marvel anti but damn s...,2022-05-15 07:14:42+00:00,0,0.0,0.0,Twitter
...,...,...,...,...,...,...,...
39915,1526271033807888384,If your favorite movie of all time is a marvel...,2022-05-16 18:39:05+00:00,5,0.0,3.0,Twitter
39916,1526270995547447297,"Listen, @MorbiusMovie wasn’t a bad movie but, ...",2022-05-16 18:38:56+00:00,0,0.0,0.0,Twitter
39917,1526270690382471170,The one flaw of the original marvel comic even...,2022-05-16 18:37:43+00:00,3,0.0,2.0,Twitter
39918,1526253297736290307,omg i’m bouta watch this marvel movie with chr...,2022-05-16 17:28:37+00:00,2,0.0,1.0,Twitter


In [66]:
output_df[output_df['source'] == 'YouTube']

Unnamed: 0,id,text,created_at,likes,retweets,replies,source
39920,116Vptosbg8,Marvel Snap - Official Cinematic Reveal Traile...,2 days ago,,,,YouTube
39921,UgxktLMG7OwYTr07GNx4AaABAg,They might as well have given her the elastici...,,19,,3.0,YouTube
39922,UgxXGzflxinrpVapTmV4AaABAg,Stunning and brave,,10,,,YouTube
39923,Ugw0lCUW_y3C5p0eIX54AaABAg,"HOHOHOHO...""If you save one life, you save the...",,5,,,YouTube
39924,UgysXnN5ZasAySKT_QB4AaABAg,I’m actually glad they did that too her powers...,,4,,5.0,YouTube
...,...,...,...,...,...,...,...
760272,Ugxxo6Jm6M9Tiuegr0R4AaABAg,Op avengers,,,,,YouTube
760273,Ugw1eDzisVJyd-MMpaV4AaABAg,T,,,,,YouTube
760274,UgyPEP4wTHbtRX6Cyhx4AaABAg,Mekk,,,,,YouTube
760275,Ugwr5Jg6TWP1CBvcP7F4AaABAg,Sir William and,,,,,YouTube


Jonathan?

In [67]:
output_df

Unnamed: 0,id,text,created_at,likes,retweets,replies,source
0,1525736805261262848,RT @driiftyfilm: This is a hot take but I kind...,2022-05-15 07:16:15+00:00,0,1210.0,0.0,Twitter
1,1525736784985849856,RT @NerdNews12345: Random Person: Marvel movie...,2022-05-15 07:16:11+00:00,0,1078.0,0.0,Twitter
2,1525736712089092096,👀 Was just thinking about this before goin' to...,2022-05-15 07:15:53+00:00,0,0.0,0.0,Twitter
3,1525736658976522241,Before I even heard of the #NFTCommunity &amp;...,2022-05-15 07:15:40+00:00,1,0.0,0.0,Twitter
4,1525736413706260480,I'm not even a die-hard Marvel anti but damn s...,2022-05-15 07:14:42+00:00,0,0.0,0.0,Twitter
...,...,...,...,...,...,...,...
844080,hklkfw7,“What would you do… if you only had… one more ...,,,,,Reddit
844081,hk9dkvo,Platform exclusives are a cancer on the gaming...,,,,,Reddit
844082,hk9txn7,Would have loved spider-man on PC. Don't under...,,,,,Reddit
844083,hk82lyr,Isn’t July-September third quarter of the year...,,,,,Reddit


In [68]:
output_df['cleaned_text'] = cleaning_df(output_df['text'])

In [69]:
output_df

Unnamed: 0,id,text,created_at,likes,retweets,replies,source,cleaned_text
0,1525736805261262848,RT @driiftyfilm: This is a hot take but I kind...,2022-05-15 07:16:15+00:00,0,1210.0,0.0,Twitter,"[rt, MENTION, hot, take, kinda, wish, marvel, ..."
1,1525736784985849856,RT @NerdNews12345: Random Person: Marvel movie...,2022-05-15 07:16:11+00:00,0,1078.0,0.0,Twitter,"[rt, MENTION, random, person, marvel, movie, g..."
2,1525736712089092096,👀 Was just thinking about this before goin' to...,2022-05-15 07:15:53+00:00,0,0.0,0.0,Twitter,"[eye, thinking, goin, bed, like, hmm, avatar, ..."
3,1525736658976522241,Before I even heard of the #NFTCommunity &amp;...,2022-05-15 07:15:40+00:00,1,0.0,0.0,Twitter,"[even, heard, nftcommunity, amp, nft, screenwr..."
4,1525736413706260480,I'm not even a die-hard Marvel anti but damn s...,2022-05-15 07:14:42+00:00,0,0.0,0.0,Twitter,"[im, even, die, hard, marvel, anti, damn, sequ..."
...,...,...,...,...,...,...,...,...
844080,hklkfw7,“What would you do… if you only had… one more ...,,,,,Reddit,"[would, one, frog, im, dying]"
844081,hk9dkvo,Platform exclusives are a cancer on the gaming...,,,,,Reddit,"[platform, exclusive, cancer, gaming, industry..."
844082,hk9txn7,Would have loved spider-man on PC. Don't under...,,,,,Reddit,"[would, loved, spider, man, pc, dont, understa..."
844083,hk82lyr,Isn’t July-September third quarter of the year...,,,,,Reddit,"[isnt, july, september, third, quarter, year, ..."


In [70]:
output_df[output_df['source'] == 'Twitter']

Unnamed: 0,id,text,created_at,likes,retweets,replies,source,cleaned_text
0,1525736805261262848,RT @driiftyfilm: This is a hot take but I kind...,2022-05-15 07:16:15+00:00,0,1210.0,0.0,Twitter,"[rt, MENTION, hot, take, kinda, wish, marvel, ..."
1,1525736784985849856,RT @NerdNews12345: Random Person: Marvel movie...,2022-05-15 07:16:11+00:00,0,1078.0,0.0,Twitter,"[rt, MENTION, random, person, marvel, movie, g..."
2,1525736712089092096,👀 Was just thinking about this before goin' to...,2022-05-15 07:15:53+00:00,0,0.0,0.0,Twitter,"[eye, thinking, goin, bed, like, hmm, avatar, ..."
3,1525736658976522241,Before I even heard of the #NFTCommunity &amp;...,2022-05-15 07:15:40+00:00,1,0.0,0.0,Twitter,"[even, heard, nftcommunity, amp, nft, screenwr..."
4,1525736413706260480,I'm not even a die-hard Marvel anti but damn s...,2022-05-15 07:14:42+00:00,0,0.0,0.0,Twitter,"[im, even, die, hard, marvel, anti, damn, sequ..."
...,...,...,...,...,...,...,...,...
39915,1526271033807888384,If your favorite movie of all time is a marvel...,2022-05-16 18:39:05+00:00,5,0.0,3.0,Twitter,"[favorite, movie, time, marvel, movie, hate]"
39916,1526270995547447297,"Listen, @MorbiusMovie wasn’t a bad movie but, ...",2022-05-16 18:38:56+00:00,0,0.0,0.0,Twitter,"[listen, MENTION, wasnt, bad, movie, cmon, eve..."
39917,1526270690382471170,The one flaw of the original marvel comic even...,2022-05-16 18:37:43+00:00,3,0.0,2.0,Twitter,"[one, flaw, original, marvel, comic, event, ci..."
39918,1526253297736290307,omg i’m bouta watch this marvel movie with chr...,2022-05-16 17:28:37+00:00,2,0.0,1.0,Twitter,"[omg, im, bouta, watch, marvel, movie, chris, ..."


In [71]:
output_df['cleaned_text'] = output_df['cleaned_text'].apply(lambda x: ' '.join(x))

In [72]:
output_df

Unnamed: 0,id,text,created_at,likes,retweets,replies,source,cleaned_text
0,1525736805261262848,RT @driiftyfilm: This is a hot take but I kind...,2022-05-15 07:16:15+00:00,0,1210.0,0.0,Twitter,rt MENTION hot take kinda wish marvel studio d...
1,1525736784985849856,RT @NerdNews12345: Random Person: Marvel movie...,2022-05-15 07:16:11+00:00,0,1078.0,0.0,Twitter,rt MENTION random person marvel movie gray dul...
2,1525736712089092096,👀 Was just thinking about this before goin' to...,2022-05-15 07:15:53+00:00,0,0.0,0.0,Twitter,eye thinking goin bed like hmm avatar 2 movie ...
3,1525736658976522241,Before I even heard of the #NFTCommunity &amp;...,2022-05-15 07:15:40+00:00,1,0.0,0.0,Twitter,even heard nftcommunity amp nft screenwriting ...
4,1525736413706260480,I'm not even a die-hard Marvel anti but damn s...,2022-05-15 07:14:42+00:00,0,0.0,0.0,Twitter,im even die hard marvel anti damn sequel espec...
...,...,...,...,...,...,...,...,...
844080,hklkfw7,“What would you do… if you only had… one more ...,,,,,Reddit,would one frog im dying
844081,hk9dkvo,Platform exclusives are a cancer on the gaming...,,,,,Reddit,platform exclusive cancer gaming industry need...
844082,hk9txn7,Would have loved spider-man on PC. Don't under...,,,,,Reddit,would loved spider man pc dont understand excl...
844083,hk82lyr,Isn’t July-September third quarter of the year...,,,,,Reddit,isnt july september third quarter year 2020 me...


In [73]:
now = datetime.now()
filename = f'cleaned_data_{now}.csv'
output_df.to_csv(f'outputs/{filename}', index = False)
s3.upload_to_bucket(filename=f'./outputs/{filename}', dirname='CleanedData')

Done!
