In [19]:
import pandas as pd
import numpy as np
from nltk import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [30]:
df = pd.read_csv('./data/BothCom.csv')

In [4]:
df.head()

Unnamed: 0,body,created_utc,subreddit,id
0,Possibly be might have been arrested for somet...,1616898627,LegalAdviceUK,gsjhof6
1,It’s might be worth you contacting [victim’s s...,1616898600,LegalAdviceUK,gsjhmm1
2,"NAL, but in my experience charities require vo...",1616897619,LegalAdviceUK,gsjfvek
3,This is unanswerably broad. Please can you edi...,1616897023,LegalAdviceUK,gsjeu5h
4,\n---\n###Welcome to /r/LegalAdviceUK\n---\n\n...,1616895949,LegalAdviceUK,gsjcyza


In [5]:
df.shape

(10000, 4)

In [31]:
# replaces '\n' with a space
def replace_line_break(x):
    x = x.replace('\n',' ')
    return x

In [38]:
# replaced all \n with a space in the body column
df['body'] = df['body'].map(replace_line_break)

In [41]:
# this looks like a auto reply moderator comment
df['body'][4]



In [46]:
# Shows how many auto reply moderator comments are in the data frame
df[df['body'] ==df['body'][4]]

Unnamed: 0,body,created_utc,subreddit,id
4,--- ###Welcome to /r/LegalAdviceUK --- **To...,1616895949,LegalAdviceUK,gsjcyza
9,--- ###Welcome to /r/LegalAdviceUK --- **To...,1616893969,LegalAdviceUK,gsj7zx1
13,--- ###Welcome to /r/LegalAdviceUK --- **To...,1616893305,LegalAdviceUK,gsj6bzz
19,--- ###Welcome to /r/LegalAdviceUK --- **To...,1616891826,LegalAdviceUK,gsj2uif
25,--- ###Welcome to /r/LegalAdviceUK --- **To...,1616890947,LegalAdviceUK,gsj19zr
...,...,...,...,...
4966,--- ###Welcome to /r/LegalAdviceUK --- **To...,1615579844,LegalAdviceUK,gqq6xf2
4970,--- ###Welcome to /r/LegalAdviceUK --- **To...,1615579508,LegalAdviceUK,gqq67jc
4972,--- ###Welcome to /r/LegalAdviceUK --- **To...,1615579155,LegalAdviceUK,gqq5g6j
4975,--- ###Welcome to /r/LegalAdviceUK --- **To...,1615578753,LegalAdviceUK,gqq4l75


In [70]:
#Took out the auto reply moderator comments
df.drop(df[df['body'] ==df['body'][4]].index, inplace=True)

In [71]:
df[df["body"].str.find('legaladvice') > 0]

Unnamed: 0,body,created_utc,subreddit,id
707,"**Unfortunately, your submission has been remo...",1615973723,LegalAdviceUK,gr7ydf9
957,Aren't they a dodgy Russian developer with the...,1615930848,LegalAdviceUK,gr64m99
1177,Feel bad for just repeating the same thing but...,1615917582,LegalAdviceUK,gr5bkaw
1208,"**Unfortunately, your submission has been remo...",1615915630,LegalAdviceUK,gr573wt
1346,"**Unfortunately, your submission has been remo...",1615904483,LegalAdviceUK,gr4i9yw
...,...,...,...,...
9848,*Your post may have been removed for the follo...,1615881604,legaladvice,gr3o57y
9862,Try r/legaladviceaus,1615880786,legaladvice,gr3ncc3
9943,**Unanswerable Questions** Your post does not...,1615876410,legaladvice,gr3iwon
9948,**Unanswerable Questions** Your post does not...,1615876374,legaladvice,gr3ivad


In [73]:
df['body'][9948]

'**Unanswerable Questions**  Your post does not appear to contain an answerable question. Please see [our wiki](https://www.reddit.com/r/legaladvice/wiki/toosimple) for examples of questions that we cannot answer.  *Please [read our subreddit rules](https://www.reddit.com/r/legaladvice/wiki/index#wiki_general_rules).  If after doing so, you believe this was in error, or you’ve edited your post to comply with the rules, [message the moderators](https://www.reddit.com/message/compose?to=%2Fr%2FLegalAdvice).*  *Do not reach out to a moderator personally, and do not reply to this message as a comment.*'

In [74]:
df.drop(df[df['body'] ==df['body'][9948]].index, inplace=True)

In [75]:
df['body'][1346]

"**Unfortunately, your submission has been removed for the following reason(s):**  Your question is answered in our [Frequently Asked Questions page on Housing](https://www.reddit.com/r/legaladviceuk/wiki/faq_housing), which contains full answers on the following common topics, amongst others:  * [Can I end my tenancy early?](https://www.reddit.com/r/legaladviceuk/wiki/faq_housing#wiki_can_i_end_my_tenancy_early.3F) * [My rented accommodation has mould or is in disrepair, what can I do?](https://www.reddit.com/r/legaladviceuk/wiki/faq_housing#wiki_my_rented_accommodation_has_mould_or_is_in_disrepair.2C_what_can_i_do.3F) * [My landlord won't return my deposit, how can I get it back?](https://www.reddit.com/r/legaladviceuk/wiki/faq_housing#wiki_my_landlord_won.27t_return_my_deposit.2C_how_do_i_get_it_back.3F) (including: what can I do if I don't agree with what my landlord wants to take from my deposit?) * [Can my landlord/their agent come into my home whenever they want?](https://www.re

In [76]:
df.drop(df[df['body'] ==df['body'][1346]].index, inplace=True)

In [77]:
df[df["body"].str.find('*') > 0]

Unnamed: 0,body,created_utc,subreddit,id
7,It looks like you or OP may want to find a So...,1616894439,LegalAdviceUK,gsj98q1
8,**What to do if you have questions about COVI...,1616893970,LegalAdviceUK,gsj7zyi
66,A hostile witness is one who is called to give...,1616885204,LegalAdviceUK,gsiqt73
76,&gt; It confers on the parties the status of h...,1616883419,LegalAdviceUK,gsinged
100,&gt;Everyone has a chequered past. This isn...,1616881379,LegalAdviceUK,gsii3dx
...,...,...,...,...
9964,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615875754,legaladvice,gr3i7oa
9968,You don't ever *have* to sign anything. Read t...,1615875522,legaladvice,gr3hyrt
9973,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615875276,legaladvice,gr3hp8n
9980,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615874915,legaladvice,gr3hbcf


In [78]:
df['body'][9973]

'---\r &gt; http://imgur.com/a/myIAb\r \r ---\r *I am a bot whose sole purpose is to improve the timeliness and accuracy of responses in this subreddit.*\r \r ---\r **It appears you forgot to include your location in the title or body of your post. Please update the body of your original post to include this information.**\r \r ---\r ***Do NOT delete this post - Instead, simply edit the post with the requested information.***\r \r ---\r \r Author: /u/throwingaway1864\r \r Title: **Being threatened revenge porn over on discord**\r \r Original Post: \r \r &gt; I(19) from SEA met a guy(23) from USA on discord, shared photos over at snap but soon ended the relationship and not on speaking terms. He blocked me on discord but before he did he threatened to share my private photos. (He texted me this so I have proof)  &gt;  &gt; He screenshots the images from snap and posted it on our PMs. So the images are on discord. What should I do? &gt;  &gt; I have not contacted Discord yet.\r \r \r \r 

In [79]:
df.drop(df[df['body'] ==df['body'][9973]].index, inplace=True)

In [81]:
df.shape

(9474, 4)

In [82]:
df[df["body"].str.find('legaladvice') > 0]

Unnamed: 0,body,created_utc,subreddit,id
707,"**Unfortunately, your submission has been remo...",1615973723,LegalAdviceUK,gr7ydf9
957,Aren't they a dodgy Russian developer with the...,1615930848,LegalAdviceUK,gr64m99
1177,Feel bad for just repeating the same thing but...,1615917582,LegalAdviceUK,gr5bkaw
1208,"**Unfortunately, your submission has been remo...",1615915630,LegalAdviceUK,gr573wt
3830,&gt; Can I ask too since you seem to know alot...,1615680857,LegalAdviceUK,gqumm8r
...,...,...,...,...
9813,*Your post may have been removed for the follo...,1615882741,legaladvice,gr3p8nb
9822,*Your post may have been removed for the follo...,1615882393,legaladvice,gr3owkg
9825,*Your post may have been removed for the follo...,1615882378,legaladvice,gr3ow1c
9848,*Your post may have been removed for the follo...,1615881604,legaladvice,gr3o57y


In [83]:
df['body'][1208]

'**Unfortunately, your submission has been removed for the following reason(s):**  Your submission may be asking or giving immigration or visa advice which are not allowed in this subreddit. It is *illegal* for anybody to give this advice unless they are a [registered OIC advisor](https://www.gov.uk/government/organisations/office-of-the-immigration-services-commissioner). If you are in need of immigration advice, [please seek a registered adviser through this website](https://www.gov.uk/find-an-immigration-adviser).  For more information about this, and what falls in scope of this rule, please see [our FAQ](https://www.reddit.com/r/legaladviceuk/wiki/faq_subreddit#wiki_why_can.27t_i_ask_questions_about_immigration.2C_citizenship.2C_or_other_related_topics.3F).    [Please familiarise yourself with our subreddit rules](https://www.reddit.com/r/LegalAdviceUK/about/rules/) before contributing further, and [message the mods](https://www.reddit.com/message/compose/?to=/r/LegalAdviceUK) if y

In [84]:
df.drop(df[df['body'] ==df['body'][1208]].index, inplace=True)

In [85]:
df['body'][9848]

'*Your post may have been removed for the following reason(s):*  **Advertising and Recommendations**   This is a forum for legal answers.  We do not allow any advice on specific lawyers, legal services or legal products.  Non-legal advice on products or services may be allowed at moderator discretion.  Please review the following rules before commenting further:  * [General Rules 3](https://www.reddit.com/r/legaladvice/wiki/index#wiki_3.__recommendations_for_specific_attorneys_or_attorney_services.2C_or_requests_for_same.2C_are_barred.), [4](https://www.reddit.com/r/legaladvice/wiki/index#wiki_4.__links_to_for-profit_attorneys.2C_attorney_sites.2C_blogs_and_the_like_are_barred.__links_to_not-for-profit_or_governmental_organizations_are_acceptable.), and [5](https://www.reddit.com/r/legaladvice/wiki/index#wiki_5.__recommendations_of_other_products_or_services_in_comments_will_be_reviewed_for_topicality_and_general_appropriateness.__the_mods_have_the_discretion_to_remove_any_such_recomme

In [86]:
df.drop(df[df['body'] ==df['body'][9848]].index, inplace=True)

In [87]:
df[df["body"].str.find('legaladvice') > 0]

Unnamed: 0,body,created_utc,subreddit,id
957,Aren't they a dodgy Russian developer with the...,1615930848,LegalAdviceUK,gr64m99
1177,Feel bad for just repeating the same thing but...,1615917582,LegalAdviceUK,gr5bkaw
3830,&gt; Can I ask too since you seem to know alot...,1615680857,LegalAdviceUK,gqumm8r
5021,/r/legaladviceofftopic,1616898391,legaladvice,gsjh95l
5214,*Your post may have been removed for the follo...,1616893808,legaladvice,gsj7nn7
...,...,...,...,...
9812,*Your post may have been removed for the follo...,1615882759,legaladvice,gr3p9bj
9813,*Your post may have been removed for the follo...,1615882741,legaladvice,gr3p8nb
9822,*Your post may have been removed for the follo...,1615882393,legaladvice,gr3owkg
9825,*Your post may have been removed for the follo...,1615882378,legaladvice,gr3ow1c


In [88]:
df['body'][9812]

'*Your post may have been removed for the following reason(s):*  **Must be 13 to have a Reddit account**  Your comment has been removed because [Reddit requires posters to be at least 13 years old](https://www.redditinc.com/policies/user-agreement). Based on your grammar, spelling, or the content of your comment/post it is clear that you are not yet 13.     *Please [read our subreddit rules](https://www.reddit.com/r/legaladvice/wiki/index#wiki_general_rules).  If after doing so, you believe this was in error, or you’ve edited your post to comply with the rules, [message the moderators](https://www.reddit.com/message/compose?to=%2Fr%2FLegalAdvice).* **Do not make a second post or comment.**  *Do not reach out to a moderator personally, and do not reply to this message as a comment.*'

In [89]:
df.drop(df[df['body'] ==df['body'][9812]].index, inplace=True)

In [90]:
df[df["body"].str.find('legaladvice') > 0]

Unnamed: 0,body,created_utc,subreddit,id
957,Aren't they a dodgy Russian developer with the...,1615930848,LegalAdviceUK,gr64m99
1177,Feel bad for just repeating the same thing but...,1615917582,LegalAdviceUK,gr5bkaw
3830,&gt; Can I ask too since you seem to know alot...,1615680857,LegalAdviceUK,gqumm8r
5021,/r/legaladviceofftopic,1616898391,legaladvice,gsjh95l
5214,*Your post may have been removed for the follo...,1616893808,legaladvice,gsj7nn7
...,...,...,...,...
9736,"First, this is r/legaladvice so you should try...",1615891978,legaladvice,gr3y9j7
9813,*Your post may have been removed for the follo...,1615882741,legaladvice,gr3p8nb
9822,*Your post may have been removed for the follo...,1615882393,legaladvice,gr3owkg
9825,*Your post may have been removed for the follo...,1615882378,legaladvice,gr3ow1c


In [99]:
df[df["body"].str.find('Your post may have been remove') > 0]

Unnamed: 0,body,created_utc,subreddit,id
5214,*Your post may have been removed for the follo...,1616893808,legaladvice,gsj7nn7
5215,*Your post may have been removed for the follo...,1616893797,legaladvice,gsj7mv4
5264,*Your post may have been removed for the follo...,1616892475,legaladvice,gsj458j
5266,*Your post may have been removed for the follo...,1616892421,legaladvice,gsj402g
5268,*Your post may have been removed for the follo...,1616892374,legaladvice,gsj3vi3
...,...,...,...,...
9629,*Your post may have been removed for the follo...,1615898423,legaladvice,gr46wui
9630,*Your post may have been removed for the follo...,1615898412,legaladvice,gr46w7a
9813,*Your post may have been removed for the follo...,1615882741,legaladvice,gr3p8nb
9822,*Your post may have been removed for the follo...,1615882393,legaladvice,gr3owkg


In [102]:
df.drop(df[df["body"].str.find('Your post may have been remove') > 0].index,inplace=True)

In [103]:
df[df["body"].str.find('Your post may have been remove') > 0]

Unnamed: 0,body,created_utc,subreddit,id


In [106]:
df[df["body"].str.find('*') > 0]

Unnamed: 0,body,created_utc,subreddit,id
7,It looks like you or OP may want to find a So...,1616894439,LegalAdviceUK,gsj98q1
8,**What to do if you have questions about COVI...,1616893970,LegalAdviceUK,gsj7zyi
66,A hostile witness is one who is called to give...,1616885204,LegalAdviceUK,gsiqt73
76,&gt; It confers on the parties the status of h...,1616883419,LegalAdviceUK,gsinged
100,&gt;Everyone has a chequered past. This isn...,1616881379,LegalAdviceUK,gsii3dx
...,...,...,...,...
9956,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615876176,legaladvice,gr3inrs
9964,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615875754,legaladvice,gr3i7oa
9968,You don't ever *have* to sign anything. Read t...,1615875522,legaladvice,gr3hyrt
9980,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615874915,legaladvice,gr3hbcf


In [107]:
df['body'][9956]

"---\r &gt; http://imgur.com/a/myIAb\r \r ---\r *I am a bot whose sole purpose is to improve the timeliness and accuracy of responses in this subreddit.*\r \r ---\r **It appears you forgot to include your location in the title or body of your post. Please update the body of your original post to include this information.**\r \r ---\r ***Do NOT delete this post - Instead, simply edit the post with the requested information.***\r \r ---\r \r Author: /u/No_Journalist5009\r \r Title: **Need help with a divorce/a pro bono lawyer in RSA**\r \r Original Post: \r \r &gt; Hi everyone, I posted this on assistance and was told to post here for legal help. Of there are Lawyers from my country that provide Pro bono services, please reach out. &gt;  &gt; I really have no one else to ask or no where to go because apart from my daughter and soon to be ex, I don't really have family. I am a 30 year old who is about to go through a divorce and my spouse has not only taken my daughter and refuses to brin

In [108]:
df.drop(df[df['body'] ==df['body'][9956]].index, inplace=True)

In [109]:
df[df["body"].str.find('*') > 0]

Unnamed: 0,body,created_utc,subreddit,id
7,It looks like you or OP may want to find a So...,1616894439,LegalAdviceUK,gsj98q1
8,**What to do if you have questions about COVI...,1616893970,LegalAdviceUK,gsj7zyi
66,A hostile witness is one who is called to give...,1616885204,LegalAdviceUK,gsiqt73
76,&gt; It confers on the parties the status of h...,1616883419,LegalAdviceUK,gsinged
100,&gt;Everyone has a chequered past. This isn...,1616881379,LegalAdviceUK,gsii3dx
...,...,...,...,...
9953,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615876202,legaladvice,gr3iotb
9964,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615875754,legaladvice,gr3i7oa
9968,You don't ever *have* to sign anything. Read t...,1615875522,legaladvice,gr3hyrt
9980,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615874915,legaladvice,gr3hbcf


In [112]:
df[df["body"].str.find('\r &gt;') > 0]

Unnamed: 0,body,created_utc,subreddit,id
5015,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1616898518,legaladvice,gsjhhac
5065,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1616897136,legaladvice,gsjf17u
5124,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1616895695,legaladvice,gsjchiz
5131,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1616895575,legaladvice,gsjc98l
5136,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1616895458,legaladvice,gsjc02s
...,...,...,...,...
9920,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615877856,legaladvice,gr3ke22
9942,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615876415,legaladvice,gr3iwve
9953,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615876202,legaladvice,gr3iotb
9964,---\r &gt; http://imgur.com/a/myIAb\r \r ---\r...,1615875754,legaladvice,gr3i7oa


In [113]:
df.drop(df[df["body"].str.find('\r &gt;') > 0].index,inplace=True)

In [114]:
df.shape

(9015, 4)

In [116]:
df[df["body"].str.find('**') > 0]

Unnamed: 0,body,created_utc,subreddit,id
8,**What to do if you have questions about COVI...,1616893970,LegalAdviceUK,gsj7zyi
76,&gt; It confers on the parties the status of h...,1616883419,LegalAdviceUK,gsinged
131,"In that case, if signed off properly by the re...",1616878632,LegalAdviceUK,gsi9w2j
144,Your contract is with the heating company. Who...,1616877771,LegalAdviceUK,gsi7az3
254,**Important Notice** Your thread has be...,1615996491,LegalAdviceUK,gr8yngx
...,...,...,...,...
9316,"&gt; $15k in debt, besides my student loans O...",1615909341,legaladvice,gr4su9f
9583,IANAL but I've got a lot of experience with la...,1615900725,legaladvice,gr4av32
9610,Is this the UK consumer law that says: ***Whe...,1615899074,legaladvice,gr47zkp
9892,"Agreed. OP, either in addition to or potential...",1615879241,legaladvice,gr3lsyu


In [118]:
df['subreddit'].value_counts(normalize=True)

legaladvice      0.502829
LegalAdviceUK    0.497171
Name: subreddit, dtype: float64

In [119]:
df[df["body"].str.find('**Important Notice**') > 0]

Unnamed: 0,body,created_utc,subreddit,id
254,**Important Notice** Your thread has be...,1615996491,LegalAdviceUK,gr8yngx
290,**Important Notice** Your thread has be...,1615993884,LegalAdviceUK,gr8svqq
294,**Important Notice** Your thread has be...,1615993562,LegalAdviceUK,gr8s7g0
638,**Important Notice** Your thread has be...,1615977691,LegalAdviceUK,gr82b3t
822,**Important Notice** Your thread has be...,1615943170,LegalAdviceUK,gr6swn4
1866,**Important Notice** Your thread has be...,1615888115,LegalAdviceUK,gr3u9ww
2326,**Important Notice** Your thread has be...,1615834824,LegalAdviceUK,gr1khyt
2327,**Important Notice** Your thread has be...,1615834823,LegalAdviceUK,gr1khwa
2328,**Important Notice** Your thread has be...,1615834822,LegalAdviceUK,gr1khsy
2448,**Important Notice** Your thread has be...,1615824864,LegalAdviceUK,gr0y023


In [120]:
df.drop(df[df["body"].str.find('**Important Notice**') > 0].index,inplace=True)

In [121]:
df['subreddit'].value_counts(normalize=True)

legaladvice      0.503947
LegalAdviceUK    0.496053
Name: subreddit, dtype: float64

In [130]:
df["body"]

0       Possibly be might have been arrested for somet...
1       It’s might be worth you contacting [victim’s s...
2       NAL, but in my experience charities require vo...
3       This is unanswerably broad. Please can you edi...
5       That may be on the high side just for initial ...
                              ...                        
9995    No. This is wage theft and illegal. If your bo...
9996            You'd owe them for their time regardless.
9997    I imagine their issue is that if it's being us...
9998    Hi there! I’m the LegalFAQ bot. It looks like ...
9999    You need to apply to change your SSN. https://...
Name: body, Length: 8995, dtype: object

In [132]:
df

Unnamed: 0,body,created_utc,subreddit,id
0,Possibly be might have been arrested for somet...,1616898627,LegalAdviceUK,gsjhof6
1,It’s might be worth you contacting [victim’s s...,1616898600,LegalAdviceUK,gsjhmm1
2,"NAL, but in my experience charities require vo...",1616897619,LegalAdviceUK,gsjfvek
3,This is unanswerably broad. Please can you edi...,1616897023,LegalAdviceUK,gsjeu5h
5,That may be on the high side just for initial ...,1616895517,LegalAdviceUK,gsjc50e
...,...,...,...,...
9995,No. This is wage theft and illegal. If your bo...,1615874133,legaladvice,gr3gg61
9996,You'd owe them for their time regardless.,1615874127,legaladvice,gr3gfy0
9997,I imagine their issue is that if it's being us...,1615874089,legaladvice,gr3geez
9998,Hi there! I’m the LegalFAQ bot. It looks like ...,1615874032,legaladvice,gr3gc44


In [141]:
df[df["body"].str.find('removed') > 0]

Unnamed: 0,body,created_utc,subreddit,id
15,[removed],1616892472,LegalAdviceUK,gsj44vy
37,"**Unfortunately, your comment has been removed...",1616889505,LegalAdviceUK,gsiyo9e
40,[removed],1616889439,LegalAdviceUK,gsiyjvw
52,[removed],1616886252,LegalAdviceUK,gsispd3
55,[removed],1616886114,LegalAdviceUK,gsisgh9
...,...,...,...,...
9896,[removed],1615879023,legaladvice,gr3ll0d
9899,[removed],1615878902,legaladvice,gr3lgm5
9909,[removed],1615878525,legaladvice,gr3l2yw
9921,[removed],1615877807,legaladvice,gr3kccm


In [142]:
df.drop(df[df["body"].str.find('removed') > 0].index,inplace=True)

In [143]:
df[df["body"].str.find('deleted') > 0]

Unnamed: 0,body,created_utc,subreddit,id
87,[deleted],1616882370,LegalAdviceUK,gsil7fo
99,[deleted],1616881494,LegalAdviceUK,gsiie9p
119,[deleted],1616879609,LegalAdviceUK,gsicuqt
947,[deleted],1615931297,LegalAdviceUK,gr65kg3
949,[deleted],1615931087,LegalAdviceUK,gr654ga
...,...,...,...,...
9063,[deleted],1615914223,legaladvice,gr53wmc
9070,[deleted],1615914188,legaladvice,gr53tpc
9098,[deleted],1615913685,legaladvice,gr52noo
9150,[deleted],1615912650,legaladvice,gr50avg


In [144]:
df.drop(df[df["body"].str.find('deleted') > 0].index,inplace=True)

In [146]:
df.to_csv('./data/CleanBothCom.csv')

In [151]:
df['subreddit'].value_counts(normalize=True)

legaladvice      0.515961
LegalAdviceUK    0.484039
Name: subreddit, dtype: float64

In [26]:
df.shape

(8270, 5)

In [25]:
df= pd.read_csv('./data/CleanBothCom.csv')

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,body,created_utc,subreddit,id
0,0,Possibly be might have been arrested for somet...,1616898627,LegalAdviceUK,gsjhof6
1,1,It’s might be worth you contacting [victim’s s...,1616898600,LegalAdviceUK,gsjhmm1
2,2,"NAL, but in my experience charities require vo...",1616897619,LegalAdviceUK,gsjfvek
3,3,This is unanswerably broad. Please can you edi...,1616897023,LegalAdviceUK,gsjeu5h
4,5,That may be on the high side just for initial ...,1616895517,LegalAdviceUK,gsjc50e


In [14]:
lemmatizer = WordNetLemmatizer()

In [26]:
df['body'].map(lemmatizer.lemmatize)

0       Possibly be might have been arrested for somet...
1       It’s might be worth you contacting [victim’s s...
2       NAL, but in my experience charities require vo...
3       This is unanswerably broad. Please can you edi...
4       That may be on the high side just for initial ...
                              ...                        
8265    You’re right. I redownloaded Zillow *sigh* I’d...
8266    No. This is wage theft and illegal. If your bo...
8267            You'd owe them for their time regardless.
8268    Hi there! I’m the LegalFAQ bot. It looks like ...
8269    You need to apply to change your SSN. https://...
Name: body, Length: 8270, dtype: object

In [27]:
df['body'] = df['body'].map(lemmatizer.lemmatize)

In [28]:
df.to_csv('./data/LemCleanBothCom.csv')

In [17]:
df= pd.read_csv('./data/CleanBothCom.csv')

In [20]:
p_stemmer = PorterStemmer()

In [22]:
df['body'] = df['body'].map(p_stemmer.stem)

In [23]:
df.to_csv('./data/stemCleanBothCom.csv')

In [24]:
df['body']

0       possibly be might have been arrested for somet...
1       it’s might be worth you contacting [victim’s s...
2       nal, but in my experience charities require vo...
3       this is unanswerably broad. please can you edi...
4       that may be on the high side just for initial ...
                              ...                        
8265    you’re right. i redownloaded zillow *sigh* i’d...
8266    no. this is wage theft and illegal. if your bo...
8267            you'd owe them for their time regardless.
8268    hi there! i’m the legalfaq bot. it looks like ...
8269    you need to apply to change your ssn. https://...
Name: body, Length: 8270, dtype: object