In [35]:
import jsonlines
import numpy as np
import pandas as pd
from process_data import DataProcessor
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [26]:
INPUT_PATH = "./example-data/comments"
OUTPUT_PATH = "./example-output"
MODE = "comments"
MAX_DOCS = 500
MIN_TOKENS = 50
MAX_TOKENS = 100
PHRASES = ["this community", "this subreddit", "this place", "you guys", "you all"]

In [27]:
p = DataProcessor(
        input_path=INPUT_PATH,
        output_directory=OUTPUT_PATH,
        mode=MODE,
    )
gen = p.get_generator()
count = 0
n = 0
docs = []

for line in gen:
    n += 1
    text = line.get("body")
    if text:
        if line.get("author") == "AutoModerator":
            pass
        else:
            for phrase in PHRASES:
                if phrase in text.lower():
                    l = len(wordpunct_tokenize(text))
                    if l >= MIN_TOKENS and l <= MAX_TOKENS:
                        docs.append(text)
                        count += 1
                if count >= MAX_DOCS:
                    raise StopIteration(f"Iterated over {n} lines.")

StopIteration: Iterated over 546578 lines.

In [29]:
# get array

vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, use_idf=True, norm=None)
docs_array = vectorizer.fit_transform(docs).toarray()

In [30]:
docs_array.shape

(500, 4223)

In [31]:
# get the list of tokens

columns = list(vectorizer.get_feature_names_out()) 

In [32]:
doc_dfs = []
for i, doc_array in enumerate(docs_array):
    # construct a dataframe
    tf_idf_tuples = list(zip(columns, doc_array))
    df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    doc_dfs.append(df)

In [33]:
# get top N keywords of every comment

TOP_N = 5
docs_dict = {}
for i, df in enumerate(doc_dfs):
    sdf = df.iloc[:TOP_N]
    terms = list(sdf["term"])
    docs_dict[i] = terms

top_df = pd.DataFrame(docs_dict)

In [36]:
top_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499
0,white,white,shit,media,europe,sorry,supress,mistakes,brutality,screenshot,arse,harry,choice,take,rent,choose,exhausted,into,happens,connection,private,photoshopped,aren,delivering,pleasure,about,pair,any,half,innocent,id,object,awkward,their,issue,black,rice,language,garbage,ll,tuition,may,men,onlyfans,family,disorders,delicately,dont,dying,restaurant,waste,men,paper,distance,hands,bad,hand,love,weak,lost,century,sex,happening,this,mortgage,houses,funny,her,water,them,them,who,him,anal,covid,vaccine,partner,your,carrier,territory,prob,sick,top,lie,eat,maybe,naturally,wrong,her,ya,peeping,which,800k,tomorrows,india,conversation,feel,yippy,yippy,child,sure,pics,her,say,am,number,each,lvl,speech,help,tall,true,ups,im,from,never,gender,he,once,countries,greater,broccoli,advice,her,news,muslim,repeated,same,longer,gfs,condom,difference,live,myself,side,her,their,gives,uncomfortable,guy,ignorant,50,sometimes,dad,girlfriends,infection,morally,learn,divorce,depression,prescription,child,would,different,respect,take,wash,last,show,25,humbly,look,look,someone,assumptions,family,menopause,arguing,conditions,people,even,had,he,he,they,parents,treat,blah,perimenopause,same,racism,extra,parents,gay,anyone,supporting,annoying,haters,affecting,arguing,circumcised,play,eachother,doing,gorgeous,drunk,10,kg,conscious,little,instinct,negative,prices,tax,only,very,exist,exist,relationsship,pads,creepy,bond,weight,stains,straight,becomes,more,we,body,universal,own,sex,fuck,sex,unvaccinated,he,best,were,rape,forward,conditioning,shirts,leave,we,problem,know,wouldn,model,white,smile,cool,by,trying,vaxx,deer,gf,unsolicited,homophobia,hate,road,flag,sweden,us,drivers,understand,dubai,harley,was,sound,leg,mindset,where,picnic,offends,weight,fat,guess,paid,overanalysing,sarcastic,likely,meltdown,pregnancy,brings,yields,ad,why,short,cheers,seen,dumb,people,extremely,but,plays,xd,wife,integrity,ask,activity,voted,know,pastry,she,ran,checking,move,refuse,area,woman,daughter,traumatise,resist,matter,question,added,why,raving,uyghur,size,voices,our,accustomed,yes,shoes,skin,covid,passport,gift,getting,exams,us,mailbox,framed,view,ones,expensive,federal,card,there,questions,vote,ready,pure,preferences,she,normal,residents,cum,men,ask,wrong,each,home,ask,legit,him,cover,floor,happened,internet,symptoms,vaccinated,does,think,want,effects,nice,freedom,pressured,buy,000,together,payments,stereotypes,flag,debt,on,they,ring,can,payout,financial,about,forgiven,salary,loan,engagement,ring,ring,ah,right,myth,an,about,will,phone,her,someone,problem,okay,voting,make,decide,bring,pants,off,milk,gun,she,there,removed,them,swinging,minute,so,so,weekend,agenda,asthma,treat,000,ghost,logically,personal,belong,mask,adult,smoked,vaccinated,curve,difference,neurotoxin,sweaty,pronouns,before,her,discharge,keep,house,comment,sounds,cramps,ugly,cleaning,her,we,standard,women,against,seeing,unattractive,her,female,shit,generation,work,christian,movies,nonstop,its,respectful,hear,happy,contact,he,he,him,talk,code,piece,emotions,cps,incapacity,happy,im,her,news,going,nose,his,he,news,students,symptoms,influence,europeanen,sponge,accurate,liberals,new,vote,forget,traction,america,sex,talk,jesus,supposed,give,win,shots,accepts,your,demiboy,way,shop,actually,we,himself
1,people,spectrum,ownership,are,congo,question,stock,country,slavery,sharing,communicating,reunion,else,gas,price,like,frustrated,fairly,fought,dna,kid,looking,blown,miscarriages,foreskin,usd,reuse,convenient,affluent,racists,license,passed,hr,onto,phallic,doesn,gravy,usa,far,in,outages,strategize,seal,she,before,adhd,fears,your,fear,imagining,shit,laundry,clean,cordial,touched,life,ll,happy,over,make,ago,sounds,paragraph,saying,left,san,busting,friends,gallons,struggle,struggle,partner,humouring,hotter,bad,likelihood,another,life,rural,tmi,walked,bought,best,don,your,geopolitically,meeting,need,friends,he,toms,life,speechless,problem,everywhere,its,patient,wow,wow,help,only,unsolicited,honesty,so,going,like,we,cells,freedom,trousers,got,fact,level,sorry,immaculate,ve,own,gay,horny,corruption,tolerate,had,dont,introduce,are,offended,our,circumcized,get,loofa,wearing,still,eexpensive,comfortable,let,menopause,walla,know,him,way,point,year,world,amp,important,slept,sometimes,baby,day,personal,buy,crying,community,her,what,one,everyday,doin,long,18,decline,people,americas,or,assume,lot,49,should,unfortunatley,subconsciously,develop,ever,has,wanna,persons,your,manner,cookies,36,do,people,gets,insaneparents,new,else,striving,things,sleepover,practices,even,needed,videogames,around,wolf,make,over,each,62,nudity,escalate,thanks,reading,money,trump,lately,ve,are,are,we,plastic,hair,share,her,plenty,most,farts,these,want,gt,pass,their,having,what,attracted,vaccines,significant,sexologist,intercourse,her,realised,us,low,try,poc,digital,unfair,intimacy,boast,enemy,older,kinda,wouldnt,gain,anti,hunt,ok,we,answer,where,understand,red,united,rider,harley,fwb,world,ride,explaining,their,left,assertions,together,festival,muffler,podcast,justifying,question,compassion,click,showed,him,deciding,freak,quite,approx,spoonfed,some,cemented,warning,ve,we,organization,oppressing,pity,sextape,much,heterosexuality,ids,approves,necessary,trump,dont,joke,friend,potential,works,folks,civilians,absolute,friend,maybe,toss,kept,guy,react,odour,complying,indicators,any,use,inner,family,shearing,turning,off,often,now,national,video,frustration,men,invade,ballot,campaign,id,facials,id,local,dmv,isn,40,must,18,strange,attracted,next,over,work,him,masculinity,ignorance,mechanics,comment,at,politics,get,women,every,poisoning,throats,random,ventilator,individuals,now,hitting,their,vaccine,literally,fascist,crushing,ring,additional,ring,loans,include,battle,she,engagement,should,imo,forgive,govt,given,ask,forgiveness,ring,forgiveness,willing,bug,more,gattaca,breastfeeding,college,lt,soooo,time,my,thinking,career,insecurities,consent,down,which,whether,they,says,very,today,joking,50,lgbtq,ask,tell,we,swinger,lives,lives,chill,questions,week,choose,saudi,daddy,seconds,liberties,common,cya,an,smoking,telling,jesse,talking,addictive,amp,people,should,right,now,obeying,doing,one,on,switch,them,she,mentality,say,trust,bothered,dirt,as,allowed,she,friends,tooafraidtoasktooafraidtoask,sustainable,on,marriage,said,poorly,mcdonalds,locking,sure,has,isnt,shit,shit,he,feel,level,codes,show,hit,opportunity,relate,love,probably,sides,complete,mask,relationship,gonna,wrong,black,your,under,ra,scrub,psychiatrist,didn,always,unscrupulous,falling,alexandra,years,depends,about,comes,water,fishy,groaning,slow,boomer,disgust,thissexual,gfys,tax,bar,cops,physically
2,could,cage,petro,nuts,belgium,sleepy,interfere,on,learn,problems,effectively,series,someone,lighting,half,surgery,libido,inject,ancestors,embrace,penis,re,fetish,episodes,shaft,foggiest,worn,endeavors,plans,there,drivers,pass,foot,while,injuries,huffington,light,understanding,own,trouble,differs,prominent,silenced,her,one,few,downstairs,many,should,leak,trees,bay,toilet,judgmental,wash,feel,get,am,qanon,in,white,destroy,specifically,feel,leaves,700k,leap,can,average,perceive,perceive,own,lingerie,excitement,its,effective,talk,only,years,positions,tool,biggest,orgasm,also,hairy,complicated,age,holiday,recovering,when,four,things,16,lad,nope,say,beside,fuck,fuck,him,realise,women,streets,something,let,her,look,lung,they,johnny,they,denying,bodyweight,hater,29,been,as,thought,day,argentina,stoked,great,solicit,prefers,over,infidels,where,commenting,comfortable,af,shouldn,feel,switzerland,place,while,will,spice,kinda,he,tits,feel,old,my,best,him,he,bad,did,families,experience,used,herself,black,he,uses,pill,since,showers,basis,in,internship,begin,at,democracies,prada,appreciated,complacency,argue,allready,suppress,ends,balding,once,can,dramatic,doing,me,yadda,flashes,brothers,speaks,that,reacted,lol,are,praying,store,beats,starts,near,need,boardgames,or,posing,cheesy,law,we,height,ought,hand,assumed,re,structure,400,recommend,downsides,don,don,openly,biodegradable,gaslight,deeper,together,worry,spot,skidmarks,competitive,forced,photo,health,reflection,tf,shake,porn,are,energy,anyhow,finish,didn,dead,tell,surprised,saying,african,pump,scenarios,presumably,set,irish,comes,stuff,toxins,judged,me,wild,anything,dick,subreddit,presence,setup,worst,states,annoyed,noise,stranger,permission,don,randomly,sentence,right,accidents,were,funnel,fucking,maintenance,being,ask,crevices,overthinking,interest,asked,monogamous,test,deal,stabbing,nauseum,very,role,conversating,rights,really,tentacles,bigots,lovely,rewatch,oppresses,orientation,united,validated,nervous,in,us,hand,saying,forefront,wouldn,canada,weapons,buffoons,van,give,embarrassing,condoms,dumbass,received,smiling,lasted,of,they,reeked,recognize,we,cooling,fundamental,us,bad,lessen,identification,small,mismatched,our,universally,proven,democrats,their,anal,what,voting,id,or,based,country,world,just,white,coffee,win,wasting,messy,toxic,homie,disturbing,other,cook,scared,563,as,citizen,food,psychotic,malone,contagious,boo,vaccinations,victim,harm,say,politicians,facilities,spouse,rediculous,grants,live,70k,list,confederate,ring,towards,roof,buying,program,compete,spending,me,loans,too,against,amount,princess,should,quickly,restroom,uneducated,falsely,bandwagon,friends,uproar,bright,perspective,godamn,weird,clarity,into,relationship,people,your,same,corner,tank,that,throwing,been,flowing,doing,marriages,their,their,hangout,political,last,same,299,seduce,driving,liberty,stop,chump,be,fuck,black,prepped,other,fucked,hot,sam,btw,hrs,nipples,freely,while,amazement,walls,sigh,shows,raise,bargain,fire,living,disagreeing,males,need,men,with,actually,murder,sooo,deadline,portion,trashed,tv,successful,trusting,too,presents,cut,wit,wit,bald,payoff,life,vision,by,called,hes,lonely,life,as,outlet,narcissistic,wear,both,ll,propagandists,brown,been,help,racial,use,therapist,happen,things,controls,caught,proposes,been,about,women,give,fix,tested,petting,mixed,newsmax,cock,demigirl,he,every,hoped,us,feels
3,do,persecute,dam,dealing,outlawed,waiting,jim,going,in,explode,wind,delete,venting,unacceptable,him,dd,her,diseases,wars,embraced,within,ve,force,then,slides,converting,bet,aspect,maslow,mena,non,pandemic,flip,secrecy,fixation,threads,behind,another,your,splitting,windstorms,obligation,prolific,bam,scummy,treatment,he,as,006,sewer,smear,jail,water,enabling,your,via,shit,over,woke,uncle,black,multiple,incompatibility,brag,1600,100k,murdered,about,daily,reality,reality,my,intentionally,sheet,had,19,one,inform,usps,switching,tricky,sold,clitoris,do,lunch,crimes,era,chihuahuas,procedure,fuck,king,weepers,rise,ignoring,indian,liked,inappropriate,ll,ll,too,doomer,dick,slytherin,pretty,somnenly,saw,in,blood,ll,muscles,ve,believing,explanations,hace,frustrating,holding,been,also,or,security,salt,place,net,kink,spewing,apostate,thus,describe,he,outta,dull,notice,beat,reassurance,him,body,return,she,makes,karma,blasting,jennifer,him,buddy,time,claiming,do,ask,stigma,cocktail,bc,screaming,never,loved,prosper,longer,attend,toes,communication,22,good,intermediate,system,russia,blindly,brightening,warn,constitutes,spcific,consciously,ego,permanent,stealing,feel,quiet,among,in,environment,zero,30s,richest,they,pleased,lingo,by,build,loaded,bestie,cut,sir,born,implying,lovey,clothed,arms,cocktails,didn,underweight,stripping,can,cool,surprising,cause,saez,dynamic,bum,guys,guys,fiancé,cotton,harmless,bus,workout,shit,feminine,untrue,irrelevant,talked,creative,think,subby,alarming,ur,could,stats,ironic,arranged,victimize,now,ahead,biologically,cut,had,this,influencers,disappoint,closer,ability,suddenly,honey,preconsent,basicly,ridiculed,question,also,external,being,question,whiney,hobby,questioning,approximately,hear,hit,buddies,escaped,dual,hahaha,acknowledging,week,studies,they,cost,minor,stunningly,slightest,mission,toilets,encounter,lead,thinking,figured,unlikely,relationship,securities,parrot,also,compare,australian,news,belgian,terrorist,jack,sorta,good,gov,nor,voter,boil,breaks,remain,star,we,workers,innovation,men,statement,why,of,neanderthals,mimic,lesson,pregnant,downvote,responses,teachers,whiny,some,wmd,bo,telling,own,velour,baffles,dc,can,unvaxxed,confirm,mom,lifetime,prostate,frpm,advantage,remotely,tooth,young,besides,locally,solved,transportation,sex,nerves,teenager,curiosity,men,her,sense,11,besides,women,feigned,cashier,any,restaurants,answer,math,abusers,politocian,must,establishments,dr,covidiot,signed,results,classic,assistance,perfectly,osha,access,six,later,saved,cohabit,owe,italy,rebel,budge,paying,realistic,concerned,paid,auto,she,off,after,materialistic,souls,she,diamond,slightly,problematic,urinate,base,selves,numerous,recommend,bothers,necessarily,5k,expressing,verbal,fucks,in,toward,their,somthing,better,institution,civilian,both,its,mods,rubbers,flattered,lifestyle,much,much,aint,option,mask,way,venezuela,mommy,think,enforced,stupidest,awareness,swinger,everyone,mean,chuck,replying,its,chase,refer,holes,team,process,conservative,obligations,sincerely,sun,invisible,matter,mouse,1940,forgetting,us,aye,ck,harboring,offensive,rethink,about,losing,figuring,professional,bonds,filmmakers,readanotherbook,wildly,consume,captain,memory,hang,equally,equally,identifying,zone,laziness,prom,intimidated,parents,other,bday,nah,lifeline,personality,underestimate,literally,cunts,stopping,conflating,asian,ivermectin,legally,everything,soaped,continues,or,honeymoon,want,apart,cortez,here,highly,locker,where,hot,std,creeped,reference,scientists,sucking,pronounces,helped,single,incapable,them,he
4,should,border,ruling,organizations,oddly,that,crow,genocides,our,deleting,will,potter,propagating,bells,used,dangers,stressed,frenzy,wiped,explore,puberty,outcome,compatible,classic,owner,aussie,paper,impacted,payday,skirting,isn,knew,overconfident,imprisons,birth,cnn,plumbing,adding,infiltrated,attempt,wide,that,teen,controversial,cheated,fortunate,biggest,wipes,hysteria,smells,particles,milpitas,with,minded,filth,adulthood,on,listening,adorable,strain,less,friendship,from,cuddling,monthly,regularly,accents,oops,started,recently,recently,independent,payment,ftw,deadliest,robots,should,risks,assistant,likes,no,transfer,orgasms,about,christening,research,finished,northern,nose,going,hearing,preserve,85,wee,responses,ended,helpful,spray,spray,reconditions,shitting,stands,huffelpuff,fore,burbage,gave,drama,told,remove,desperate,all,because,recommended,dilemma,luckily,arm,feel,need,factors,third,hospitals,boiled,offered,mat,consuming,deflection,immensely,loss,sex,want,bags,erected,bucks,calm,bastard,relearn,discuss,her,challenging,mashy,photos,celebrity,envision,advise,when,father,hence,deprived,cases,just,drugstore,mine,fail,competing,entertained,chance,bucket,peel,daily,developmentally,changed,acceptable,involving,africa,polo,faith,quarantine,spouting,specific,replies,habits,adjustments,jacking,him,are,schedule,misogynistic,website,no,interests,pretending,lint,dogging,suggesting,consumed,kiddo,early,wholesome,religious,evening,functional,weed,goof,middle,opens,relax,had,60,organically,clothes,expect,scotsman,dont,economists,ultimately,teased,point,point,other,are,picture,worthy,she,rot,bros,quietly,titles,worth,nude,care,mirror,secondly,envy,be,transmit,mental,appointment,participant,previously,till,nurture,staring,him,multi,nft,interact,more,have,europeans,bruh,implied,clears,who,any,species,recent,never,is,idiots,performance,patriarchy,km,alternative,almost,trade,islamic,mid,digging,douche,angle,obnoxiousness,beach,yarn,norms,diet,justify,nowadays,cleaned,creepy,missing,out,research,why,which,investment,nuanced,landscapes,selfless,heli,police,wonder,cia,misleading,odd,explain,facts,obnoxious,voting,accepted,explanation,brexit,abput,about,brodcasting,enormous,out,boyfriend,indian,intelligence,zoned,products,overly,giving,haha,main,pre,tyranny,mentioned,flee,lord,self,together,minnesotan,cough,rolls,showering,misinformed,finding,them,judgmental,probed,ahat,identification,wing,courthouse,old,minorities,communism,she,hick,north,foreigners,assed,milk,haired,immediate,top,badly,cumming,by,dunk,lmfao,validating,gyms,subreddit,christ,order,digestable,been,republican,ponder,clue,ters,efficacy,bet,views,severe,ignore,experimental,esteem,7000,grades,upsets,50k,italians,folk,logical,budget,colleges,easily,mortgage,rent,situation,tell,broad,communicate,bankruptcy,same,mixing,max,hadn,bodily,scare,convincing,neutral,loose,typing,rooting,moved,cruel,verbally,vulnerable,nursing,road,trigger,asks,now,cheap,bomb,very,show,looks,that,roll,words,deli,deli,swinging,they,smothered,steal,44,bigger,street,want,appropriation,ain,style,hose,do,chucked,anymore,incredibly,perve,uncomfy,slippers,70,bleach,ol,none,gona,improve,belittle,being,smh,she,canadians,alot,harassment,available,disdain,irrational,magically,faces,negligent,admitting,picking,immoral,fit,watching,fast,homes,gong,planned,future,eminen,eminen,doctor,initiate,three,suit,shtt,your,struggling,belated,reach,crash,belonging,funding,glasses,poster,calls,malicious,word,hasn,make,guy,crevices,professional,delusional,surprises,advantage,responsibility,block,englishman,toys,grab,him,withhold,cheat,heavy,stick,facebook,skill,this,section,mean,reflecting,tickets,great
