In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

from imblearn.under_sampling import RandomUnderSampler

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.porter import PorterStemmer
pd.options.display.max_colwidth = 200

In [6]:
def subreddit_to_df(subreddit):
    # Get file names in data folder for subreddit
    subreddit_files = []
    for file in os.listdir("./data"):
        split_filename = file.split("-")
        if split_filename[0] == subreddit and split_filename[1] == "full":
            subreddit_files.append(file)

    #    subreddit_files = [file for file in os.listdir("./data") if file.split("-")[0] == subreddit]
    
    # Read csvs into dataframes then concatenate each of them 
    df = pd.concat([pd.read_csv(f'./data/{file}') for file in subreddit_files]).drop(columns="Unnamed: 0")
    
    # Convert UTC time stamp into a datetime column
    df['datetime'] = pd.to_datetime(df['created_utc'], unit='s')
    
    # Create column with all text
    df['alltext'] = df['title'].str.cat(df['selftext'], sep = " ")
    
    return df

In [7]:
dating = subreddit_to_df("dating")

In [16]:
over_forty = subreddit_to_df("datingoverforty")

In [17]:
full_data = pd.concat([dating, over_forty])

In [134]:
full_data

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_text,author_flair_type,author_premium,created_utc,domain,id,is_crosspostable,...,thumbnail_height,thumbnail_width,url_overridden_by_dest,crosspost_parent,crosspost_parent_list,media,media_embed,secure_media,secure_media_embed,media_metadata
0,[],False,aRandomSet,,text,0,2021-06-01 23:48:59,self.dating,nq83ep,True,...,,,,,,,,,,
1,[],False,undeniablydigging,,text,0,2021-06-01 23:49:21,self.dating,nq83nn,True,...,,,,,,,,,,
2,[],False,PenguinBluebird,,text,0,2021-06-01 23:53:48,self.dating,nq86wv,True,...,,,,,,,,,,
3,[],False,West_Librarian3074,,text,0,2021-06-02 00:05:32,self.dating,nq8ffg,True,...,,,,,,,,,,
6,[],False,sadcherryicecream,,text,0,2021-06-02 00:10:44,self.dating,nq8j4k,True,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,labtech89,,text,0,2021-06-25 19:24:43,self.datingoverforty,o7ulrp,True,...,,,,,,,,,,
96,[],False,Loud-Pomegranate,,text,0,2021-06-25 20:32:54,self.datingoverforty,o7vxsd,True,...,,,,,,,,,,
97,[],False,Fo_Sho_77,,text,0,2021-06-25 20:39:53,self.datingoverforty,o7w2k6,False,...,,,,,,,,,,
98,[],False,MasterfulBJJ,,text,0,2021-06-25 20:48:42,i.redd.it,o7w8jr,True,...,70.0,140.0,https://i.redd.it/sojaw3645h771.png,,,,,,,


In [129]:
print(list(full_data.columns))

['all_awardings', 'allow_live_comments', 'author', 'author_flair_text', 'author_flair_type', 'author_premium', 'created_utc', 'domain', 'id', 'is_crosspostable', 'is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'link_flair_background_color', 'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id', 'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title', 'total_awards_received', 'treatment_tags', 'upvote_ratio', 'url', 'whitelist_status', 'wls', 'author_flair_background_color', 'author_flair_text_color', 'removed_by_category', 'post_hint', 'preview', 'author_is_blocked', 'author_flair_template_id', 'edited

In [133]:
cols = sorted(['is_meta', 'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video', 'link_flair_background_color', 'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id', 'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title', 'total_awards_received', 'treatment_tags', 'upvote_ratio', 'url', 'whitelist_status', 'wls', 'author_flair_background_color', 'author_flair_text_color', 'removed_by_category', 'post_hint', 'preview', 'author_is_blocked', 'author_flair_template_id', 'edited', 'author_cakeday', 'banned_by', 'call_to_action', 'category', 'poll_data', 'distinguished', 'gilded', 'datetime', 'alltext', 'thumbnail_height', 'thumbnail_width', 'url_overridden_by_dest', 'crosspost_parent', 'crosspost_parent_list', 'media', 'media_embed', 'secure_media', 'secure_media_embed', 'media_metadata'])

for col in cols:
    print(f"********{col}********)")
    print("** DATING **")
    display(full_data[full_data["subreddit"]=="dating"][col].value_counts(normalize=True, dropna=False))
    print("")
    print("** DATING OVER 40 **")
    display(full_data[full_data["subreddit"]=="datingoverforty"][col].value_counts(normalize=True, dropna=False))
    print("")
    print("")
    print("")

********alltext********)
** DATING **


Dating [removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        


** DATING OVER 40 **


NaN                                                                                                                                                                                                                        0.160427
Dating [removed]                                                                                                                                                                                                           0.000659
What sites are people over 40 using? I keep dating men in their early thirties and all of the emotional immaturity that comes with that. Maybe I should branch out to more dating sites? (Bonus if there's no swiping.)    0.000659
DO YOU NEED LEGIT HACKING SERVICES? CONTACT VIA THE EMAIL ADDRESS BELOW [removed]                                                                                                                                          0.000527
Hello [removed]                                                                         




********author_cakeday********)
** DATING **


NaN     0.997675
True    0.002325
Name: author_cakeday, dtype: float64


** DATING OVER 40 **


NaN     0.998023
True    0.001977
Name: author_cakeday, dtype: float64




********author_flair_background_color********)
** DATING **


NaN        0.999989
#1a1a1b    0.000011
Name: author_flair_background_color, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: author_flair_background_color, dtype: float64




********author_flair_template_id********)
** DATING **


NaN                                     0.995178
49223cde-abc9-11e9-b518-0ee41f1cd37e    0.003608
5b92b088-abc9-11e9-a424-0e7443ab399e    0.000504
bd7c8dee-ac0c-11e9-a4fb-0e471e117d68    0.000321
6f9179b6-abc9-11e9-93e4-0e333220d0f0    0.000218
621ccfd8-abc9-11e9-8f91-0e5707ebc2ec    0.000103
4dc35a98-abc9-11e9-acc9-0e3adec6ab16    0.000023
510f5454-abc9-11e9-b4d1-0e0bbc704af6    0.000023
41e58aca-e2d9-11e3-bb49-12313b0d5a0a    0.000011
ac01cafc-501d-11e9-b857-0ea010f7c9a2    0.000011
Name: author_flair_template_id, dtype: float64


** DATING OVER 40 **


NaN                                     0.937516
4e44fdd2-276c-11e9-b1fb-0e88ed313b8c    0.007646
2594b2f8-26a2-11e9-991c-0e06dd3c7562    0.006987
454190e4-26b1-11e9-93d9-0e688abd8430    0.003955
de42d95c-26b0-11e9-8b8e-0eb1c6a44238    0.003823
504540fc-26a3-11e9-b938-0e88f38fcd68    0.003164
5679db6a-26b0-11e9-810f-0e55842fbebe    0.003164
ee5ea834-26b0-11e9-aff7-0ec851a27302    0.002636
4e9677be-26b0-11e9-906a-0e545a99bb52    0.002505
54c97442-26b0-11e9-a03f-0ef52665792a    0.002505
2eb700e4-26b0-11e9-9383-0e5da7387fd6    0.002241
436898c2-26b0-11e9-bd84-0e9b3104229e    0.001846
1fcb779e-26b1-11e9-9ef9-0e86147bb114    0.001846
0fbd3a04-26b1-11e9-8d4f-0e0410a403b6    0.001714
223fa3e8-26b0-11e9-b919-0efba316e530    0.001714
81f7d2e2-26b0-11e9-9c91-0ede4dba9cfe    0.001582
72e1c43e-26b0-11e9-82b5-0e8034929474    0.001582
278467f8-26b0-11e9-a41e-0ed02cff7b5c    0.001582
da136e32-26b0-11e9-a329-0e99aa7286c2    0.001318
ee93632e-26a3-11e9-96cb-0e1f42d59cb4    0.001318
e06641e0-26a3-11e9-a




********author_flair_text_color********)
** DATING **


NaN      0.995120
dark     0.004868
light    0.000011
Name: author_flair_text_color, dtype: float64


** DATING OVER 40 **


NaN     0.936066
dark    0.063934
Name: author_flair_text_color, dtype: float64




********author_is_blocked********)
** DATING **


False    0.730757
NaN      0.269243
Name: author_is_blocked, dtype: float64


** DATING OVER 40 **


False    0.671236
NaN      0.328764
Name: author_is_blocked, dtype: float64




********banned_by********)
** DATING **


NaN    1.0
Name: banned_by, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: banned_by, dtype: float64




********call_to_action********)
** DATING **


NaN    1.0
Name: call_to_action, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: call_to_action, dtype: float64




********category********)
** DATING **


NaN    1.0
Name: category, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: category, dtype: float64




********crosspost_parent********)
** DATING **


NaN    1.0
Name: crosspost_parent, dtype: float64


** DATING OVER 40 **


NaN          0.994727
t3_nza9ft    0.000132
t3_ljb26g    0.000132
t3_l3xv9c    0.000132
t3_lk112f    0.000132
t3_mmbsns    0.000132
t3_mq6sxz    0.000132
t3_nscwid    0.000132
t3_pcmos8    0.000132
t3_neo6fo    0.000132
t3_omupmx    0.000132
t3_nbm5oi    0.000132
t3_ogjhn5    0.000132
t3_mkjhhf    0.000132
t3_mm8ini    0.000132
t3_mu4ua1    0.000132
t3_of4eiq    0.000132
t3_ofio1e    0.000132
t3_ogad9l    0.000132
t3_o08esd    0.000132
t3_nnbcez    0.000132
t3_n2h169    0.000132
t3_oueru9    0.000132
t3_n2x0ip    0.000132
t3_n3ccvy    0.000132
t3_ob19y1    0.000132
t3_nm05gv    0.000132
t3_ojr3qn    0.000132
t3_oh7i8o    0.000132
t3_nv4icc    0.000132
t3_ntv36o    0.000132
t3_nvrkjd    0.000132
t3_nvxu34    0.000132
t3_nvmgxo    0.000132
t3_nxo1zh    0.000132
t3_nxs8wh    0.000132
t3_nxuigt    0.000132
t3_mwor12    0.000132
t3_o9dcm5    0.000132
t3_p5p08l    0.000132
t3_p87nc7    0.000132
Name: crosspost_parent, dtype: float64




********crosspost_parent_list********)
** DATING **


NaN    1.0
Name: crosspost_parent_list, dtype: float64


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     




********datetime********)
** DATING **


2021-08-12 23:40:09    0.000046
2021-05-27 14:22:12    0.000034
2021-08-12 23:40:10    0.000034
2022-02-24 20:36:38    0.000034
2021-11-22 22:27:33    0.000023
                         ...   
2021-10-25 00:14:45    0.000011
2021-10-25 00:04:26    0.000011
2021-10-24 23:56:34    0.000011
2021-10-24 23:48:48    0.000011
2021-10-28 00:19:06    0.000011
Name: datetime, Length: 87134, dtype: float64


** DATING OVER 40 **


2021-07-27 14:19:01    0.000264
2021-10-21 12:46:12    0.000264
2021-04-03 18:25:30    0.000264
2021-11-20 05:18:44    0.000132
2021-12-02 15:46:59    0.000132
                         ...   
2021-09-21 07:02:34    0.000132
2021-09-21 06:35:18    0.000132
2021-09-21 06:29:46    0.000132
2021-09-21 05:13:14    0.000132
2021-06-25 21:03:12    0.000132
Name: datetime, Length: 7583, dtype: float64




********distinguished********)
** DATING **


NaN          0.999989
moderator    0.000011
Name: distinguished, dtype: float64


** DATING OVER 40 **


NaN          0.999868
moderator    0.000132
Name: distinguished, dtype: float64




********edited********)
** DATING **


NaN             0.996323
1.623690e+09    0.000011
1.629487e+09    0.000011
1.629488e+09    0.000011
1.629476e+09    0.000011
                  ...   
1.629578e+09    0.000011
1.619645e+09    0.000011
1.619641e+09    0.000011
1.619640e+09    0.000011
1.620154e+09    0.000011
Name: edited, Length: 322, dtype: float64


** DATING OVER 40 **


NaN             0.995914
1.623868e+09    0.000132
1.629573e+09    0.000132
1.629549e+09    0.000132
1.635793e+09    0.000132
1.635790e+09    0.000132
1.639024e+09    0.000132
1.639001e+09    0.000132
1.638924e+09    0.000132
1.638938e+09    0.000132
1.638943e+09    0.000132
1.633581e+09    0.000132
1.633576e+09    0.000132
1.633458e+09    0.000132
1.622757e+09    0.000132
1.622440e+09    0.000132
1.632367e+09    0.000132
1.632352e+09    0.000132
1.623775e+09    0.000132
1.623735e+09    0.000132
1.623661e+09    0.000132
1.623631e+09    0.000132
1.623273e+09    0.000132
1.624155e+09    0.000132
1.624061e+09    0.000132
1.624054e+09    0.000132
1.623995e+09    0.000132
1.623953e+09    0.000132
1.623954e+09    0.000132
1.623889e+09    0.000132
1.623899e+09    0.000132
1.629596e+09    0.000132
Name: edited, dtype: float64




********gilded********)
** DATING **


NaN    0.999989
1.0    0.000011
Name: gilded, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: gilded, dtype: float64




********is_meta********)
** DATING **


False    1.0
Name: is_meta, dtype: float64


** DATING OVER 40 **


False    1.0
Name: is_meta, dtype: float64




********is_original_content********)
** DATING **


False    1.0
Name: is_original_content, dtype: float64


** DATING OVER 40 **


False    1.0
Name: is_original_content, dtype: float64




********is_reddit_media_domain********)
** DATING **


False    1.0
Name: is_reddit_media_domain, dtype: float64


** DATING OVER 40 **


False    0.931848
True     0.068152
Name: is_reddit_media_domain, dtype: float64




********is_robot_indexable********)
** DATING **


True     0.818473
False    0.181527
Name: is_robot_indexable, dtype: float64


** DATING OVER 40 **


True     0.709728
False    0.290272
Name: is_robot_indexable, dtype: float64




********is_self********)
** DATING **


True    1.0
Name: is_self, dtype: float64


** DATING OVER 40 **


True     0.886765
False    0.113235
Name: is_self, dtype: float64




********is_video********)
** DATING **


False    1.0
Name: is_video, dtype: float64


** DATING OVER 40 **


False    0.996309
True     0.003691
Name: is_video, dtype: float64




********link_flair_background_color********)
** DATING **


#ea0027    0.387084
#ff4500    0.227447
NaN        0.178491
#00a6a5    0.065289
#ffb000    0.063926
#ffd635    0.033756
#878a8c    0.019312
#46d160    0.015750
#24a0ed    0.008854
#94e044    0.000092
Name: link_flair_background_color, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: link_flair_background_color, dtype: float64




********link_flair_css_class********)
** DATING **


dating    0.896923
NaN       0.103077
Name: link_flair_css_class, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: link_flair_css_class, dtype: float64




********link_flair_richtext********)
** DATING **


[{'e': 'text', 't': 'I Need Advice'}]           0.458295
[{'e': 'text', 't': 'Question'}]                0.271019
[{'e': 'text', 't': 'Tinder/Online Dating'}]    0.078221
[{'e': 'text', 't': 'Support Needed'}]          0.065289
[{'e': 'text', 't': 'Venting'}]                 0.047432
[{'e': 'text', 't': 'Giving Advice'}]           0.040823
[{'e': 'text', 't': 'Success!'}]                0.015750
[{'e': 'text', 't': 'Other'}]                   0.014066
[{'e': 'text', 't': 'Long Distance'}]           0.008854
[]                                              0.000241
[{'e': 'text', 't': 'Vent/Rant'}]               0.000011
Name: link_flair_richtext, dtype: float64


** DATING OVER 40 **


[]    1.0
Name: link_flair_richtext, dtype: float64




********link_flair_template_id********)
** DATING **


19ec58c2-9f74-11e7-81a1-0e2bf15991f0    0.458295
a9531dce-194e-11e5-b177-0eff099c3d1f    0.271019
fd9c03b8-a1ce-11e7-9172-0e8c9f9ecf4c    0.079355
5214d2ac-61f1-11e8-8bb1-0ed15e4a39c0    0.078221
a616c322-194e-11e5-ab91-0ed1c06bdad7    0.047432
7bca2c58-4211-11e5-9ecb-0e60d8112001    0.040823
6b44dc18-cc88-11eb-95f2-0ed00be16b15    0.015750
5dc9d60a-cc75-11eb-86e9-0ea78081bf43    0.008854
NaN                                     0.000252
Name: link_flair_template_id, dtype: float64


** DATING OVER 40 **


NaN                                     0.605589
b50cbf0c-3935-11e9-a609-0e217b8f2048    0.143686
aa73f354-275f-11e9-9763-0ed84ca6f39e    0.080148
e06d24a4-275e-11e9-9b74-0e6a6c05416c    0.074875
628375fa-4425-11e9-9805-0e7c3aad8200    0.064724
08ec5ff2-2760-11e9-aac2-0ea14477eb18    0.019114
c21c5486-4467-11e9-b9aa-0e41ef1f1cc4    0.005009
20dfc2f2-74ad-11ea-899a-0e8269fe5db5    0.004746
fe6f5a4e-cab5-11e9-92ab-0e3fb8caf9f2    0.001055
931c8382-7d5f-11e9-9326-0e473f1428b0    0.000659
914f5cc2-438a-11ea-af78-0e0cb0297771    0.000395
Name: link_flair_template_id, dtype: float64




********link_flair_text********)
** DATING **


I Need Advice           0.458295
Question                0.271019
Tinder/Online Dating    0.078221
Support Needed          0.065289
Venting                 0.047432
Giving Advice           0.040823
Success!                0.015750
Other                   0.014066
Long Distance           0.008854
NaN                     0.000241
Vent/Rant               0.000011
Name: link_flair_text, dtype: float64


** DATING OVER 40 **


NaN                    0.605589
Seeking Advice         0.143686
Question               0.080148
Casual Conversation    0.074875
Discussion             0.064724
Giving Advice          0.019114
Venting                0.005009
Sharing                0.004746
Confession             0.001055
Meme                   0.000659
Article                0.000395
Name: link_flair_text, dtype: float64




********link_flair_text_color********)
** DATING **


light    0.633843
dark     0.366157
Name: link_flair_text_color, dtype: float64


** DATING OVER 40 **


dark    1.0
Name: link_flair_text_color, dtype: float64




********link_flair_type********)
** DATING **


richtext    0.999759
text        0.000241
Name: link_flair_type, dtype: float64


** DATING OVER 40 **


text    1.0
Name: link_flair_type, dtype: float64




********locked********)
** DATING **


False    0.998763
True     0.001237
Name: locked, dtype: float64


** DATING OVER 40 **


False    0.99855
True     0.00145
Name: locked, dtype: float64




********media********)
** DATING **


NaN    1.0
Name: media, dtype: float64


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     




********media_embed********)
** DATING **


NaN    1.0
Name: media_embed, dtype: float64


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               0.988004
{'content': '&lt;iframe width="356" height="200" src="https://www.youtube.com/embed/V9nEOilLbLg?feature=oembed&amp;enablejsapi=1" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen&gt;&lt;/iframe&gt;', 'height': 200, 'scrolling': False, 'width': 356}                     




********media_metadata********)
** DATING **


NaN    1.0
Name: media_metadata, dtype: float64


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     




********media_only********)
** DATING **


False    1.0
Name: media_only, dtype: float64


** DATING OVER 40 **


False    1.0
Name: media_only, dtype: float64




********no_follow********)
** DATING **


True     0.847372
False    0.152628
Name: no_follow, dtype: float64


** DATING OVER 40 **


True     0.865674
False    0.134326
Name: no_follow, dtype: float64




********num_comments********)
** DATING **


1       0.279346
2       0.098713
0       0.090683
3       0.066583
4       0.056114
          ...   
397     0.000011
607     0.000011
1021    0.000011
453     0.000011
916     0.000011
Name: num_comments, Length: 630, dtype: float64


** DATING OVER 40 **


0      0.397706
1      0.064197
2      0.063670
3      0.013841
4      0.009491
         ...   
697    0.000132
293    0.000132
366    0.000132
191    0.000132
347    0.000132
Name: num_comments, Length: 342, dtype: float64




********num_crossposts********)
** DATING **


0    0.999908
1    0.000080
3    0.000011
Name: num_crossposts, dtype: float64


** DATING OVER 40 **


0    1.0
Name: num_crossposts, dtype: float64




********over_18********)
** DATING **


False    0.969509
True     0.030491
Name: over_18, dtype: float64


** DATING OVER 40 **


False    0.97469
True     0.02531
Name: over_18, dtype: float64




********parent_whitelist_status********)
** DATING **


all_ads    0.558634
no_ads     0.436979
NaN        0.004387
Name: parent_whitelist_status, dtype: float64


** DATING OVER 40 **


some_ads    1.0
Name: parent_whitelist_status, dtype: float64




********permalink********)
** DATING **


/r/dating/comments/nq83ep/finally_went_on_my_first_date_in_forever_meaning/     0.000011
/r/dating/comments/q7r50o/updating_pictures_on_dating_apps/                     0.000011
/r/dating/comments/q7ruhr/40_year_old_giving_mixed_signals/                     0.000011
/r/dating/comments/q7ruh9/been_left_on_read_delivered/                          0.000011
/r/dating/comments/q7rqnt/do_you_care_about_weight/                             0.000011
                                                                                  ...   
/r/dating/comments/qf2uwp/girl_i_like/                                          0.000011
/r/dating/comments/qf2tid/girl_agreed_to_go_out_with_me_but_later_changed/      0.000011
/r/dating/comments/qf2r5p/someone_told_me_that_guys_that_facetime_you_a_lot/    0.000011
/r/dating/comments/qf2r2g/strted_dating_a_girl_a_week_ago_and_we_already/       0.000011
/r/dating/comments/qhag0b/dating_with_married_woman/                            0.000011
Name: permalink, Leng


** DATING OVER 40 **


/r/datingoverforty/comments/qxyk74/hairy_or_shaved/                                     0.000132
/r/datingoverforty/comments/r79w5u/what_changed/                                        0.000132
/r/datingoverforty/comments/r7o3kt/message_me_if_you_want_a_snapchat_account_hacked/    0.000132
/r/datingoverforty/comments/r7l864/male_needing_a_hug_gwinnett_county/                  0.000132
/r/datingoverforty/comments/r7kze2/casual_or_serious/                                   0.000132
                                                                                          ...   
/r/datingoverforty/comments/psdkod/are_there_any_men_under_6/                           0.000132
/r/datingoverforty/comments/psd7wf/dad_41_with_full_custody_of_my_three_young/          0.000132
/r/datingoverforty/comments/psd59m/marathon_1st_date/                                   0.000132
/r/datingoverforty/comments/psc3do/46m_just_want_to_share/                              0.000132
/r/datingoverforty/comments/o7




********pinned********)
** DATING **


False    1.0
Name: pinned, dtype: float64


** DATING OVER 40 **


False    1.0
Name: pinned, dtype: float64




********poll_data********)
** DATING **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                 0.999989
{'is_prediction': False, 'options': [{'id': '8678795', 'text': 'Yes, ban entirely'}, {'id': '8678796', 'text': 'Yes, limit to the weekends only'}, {'id': '8678797', 'text': 'Yes, limit to one day per week'}, {'id': '8678798', 'text': 'No, no restrictions'}], 'resolved_option_id': None, 'total_stake_amount': None, 'total_vote_count': 1, 'tournament_id': None, 'user_selection': None, 'user_won_amount': None, 'voting_end_timestamp': 1624833015349}    0.000011
Name: poll_data, dtype: float64


** DATING OVER 40 **


NaN    1.0
Name: poll_data, dtype: float64




********post_hint********)
** DATING **


NaN     0.982315
self    0.017685
Name: post_hint, dtype: float64


** DATING OVER 40 **


NaN             0.879779
image           0.066306
self            0.020169
link            0.019510
rich:video      0.010809
hosted:video    0.003427
Name: post_hint, dtype: float64




********preview********)
** DATING **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     




********pwls********)
** DATING **


6.0    0.558634
0.0    0.436979
NaN    0.004387
Name: pwls, dtype: float64


** DATING OVER 40 **


7.0    1.0
Name: pwls, dtype: float64




********removed_by_category********)
** DATING **


NaN                 0.818473
moderator           0.131197
reddit              0.027227
automod_filtered    0.023103
Name: removed_by_category, dtype: float64


** DATING OVER 40 **


NaN          0.709728
reddit       0.206697
moderator    0.083575
Name: removed_by_category, dtype: float64




********retrieved_on********)
** DATING **


1628811621    0.000080
1648259646    0.000034
1621554479    0.000023
1617645573    0.000023
1635300901    0.000023
                ...   
1635119339    0.000011
1635119110    0.000011
1635118889    0.000011
1635118585    0.000011
1635380357    0.000011
Name: retrieved_on, Length: 87120, dtype: float64


** DATING OVER 40 **


1622902348    0.000264
1637385534    0.000132
1638459583    0.000132
1638498923    0.000132
1638498837    0.000132
                ...   
1632206129    0.000132
1632205797    0.000132
1632201205    0.000132
1632200938    0.000132
1624655003    0.000132
Name: retrieved_on, Length: 7585, dtype: float64




********score********)
** DATING **


1      0.976507
2      0.009072
0      0.006220
3      0.002715
4      0.001317
         ...   
151    0.000011
89     0.000011
242    0.000011
340    0.000011
109    0.000011
Name: score, Length: 80, dtype: float64


** DATING OVER 40 **


1      0.970208
0      0.007382
2      0.003955
3      0.002636
6      0.001714
5      0.001318
8      0.001186
4      0.001055
7      0.001055
10     0.000923
9      0.000659
12     0.000659
13     0.000527
11     0.000527
23     0.000395
15     0.000395
18     0.000395
26     0.000395
25     0.000264
17     0.000264
44     0.000264
30     0.000264
96     0.000264
35     0.000264
14     0.000264
37     0.000132
41     0.000132
27     0.000132
141    0.000132
77     0.000132
28     0.000132
20     0.000132
150    0.000132
73     0.000132
16     0.000132
22     0.000132
67     0.000132
103    0.000132
36     0.000132
347    0.000132
47     0.000132
38     0.000132
46     0.000132
206    0.000132
101    0.000132
178    0.000132
Name: score, dtype: float64




********secure_media********)
** DATING **


NaN    1.0
Name: secure_media, dtype: float64


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     




********secure_media_embed********)
** DATING **


NaN    1.0
Name: secure_media_embed, dtype: float64


** DATING OVER 40 **


NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    0.988004
{'content': '&lt;iframe width="356" height="200" src="https://www.youtube.com/embed/V9nEOilLbLg?feature=oembed&amp;enablejsapi=1" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen&gt;&lt;/iframe&gt;', 




********selftext********)
** DATING **


[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               


** DATING OVER 40 **


[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               




********send_replies********)
** DATING **


True     0.991913
False    0.008087
Name: send_replies, dtype: float64


** DATING OVER 40 **


True     0.975218
False    0.024782
Name: send_replies, dtype: float64




********spoiler********)
** DATING **


False    0.993345
True     0.006655
Name: spoiler, dtype: float64


** DATING OVER 40 **


False    0.993145
True     0.006855
Name: spoiler, dtype: float64




********stickied********)
** DATING **


False    1.0
Name: stickied, dtype: float64


** DATING OVER 40 **


False    0.995386
True     0.004614
Name: stickied, dtype: float64




********subreddit********)
** DATING **


dating    1.0
Name: subreddit, dtype: float64


** DATING OVER 40 **


datingoverforty    1.0
Name: subreddit, dtype: float64




********subreddit_id********)
** DATING **


t5_2qhb1    1.0
Name: subreddit_id, dtype: float64


** DATING OVER 40 **


t5_su6ij    1.0
Name: subreddit_id, dtype: float64




********subreddit_subscribers********)
** DATING **


1164919    0.000458
1164918    0.000401
1164445    0.000344
1164131    0.000332
1164129    0.000298
             ...   
942820     0.000011
942825     0.000011
942855     0.000011
942856     0.000011
1102565    0.000011
Name: subreddit_subscribers, Length: 61291, dtype: float64


** DATING OVER 40 **


63154    0.001318
56294    0.001186
60385    0.000923
63576    0.000923
69490    0.000791
           ...   
65862    0.000132
65845    0.000132
65840    0.000132
65838    0.000132
51530    0.000132
Name: subreddit_subscribers, Length: 5797, dtype: float64




********subreddit_type********)
** DATING **


public    1.0
Name: subreddit_type, dtype: float64


** DATING OVER 40 **


public    1.0
Name: subreddit_type, dtype: float64




********thumbnail********)
** DATING **


self       0.964801
nsfw       0.030216
default    0.002829
spoiler    0.002153
Name: thumbnail, dtype: float64


** DATING OVER 40 **


self                                                                                0.859610
nsfw                                                                                0.025310
default                                                                             0.017796
spoiler                                                                             0.001714
https://b.thumbs.redditmedia.com/Uf7YiehOIJa61irOkhbX5bA54WlSqAT38n-HYLzn42Q.jpg    0.000264
                                                                                      ...   
https://b.thumbs.redditmedia.com/ae7B8dE00hZ1bXSGFvJGCNahPhduvLWFsvvZWYSOZ1I.jpg    0.000132
https://b.thumbs.redditmedia.com/HxhVpXDEdiyxQ5R-uClcn6WKPRgZbYiUncSf3N1O3ps.jpg    0.000132
https://a.thumbs.redditmedia.com/oDMiajbbMDfZZjAzrntarME7cRjtUl6asIJQI5hJtP4.jpg    0.000132
https://b.thumbs.redditmedia.com/q2Pp4OsEXn-zZbhwwdAXbcImC1v-6ugQ8rCk5fwJeos.jpg    0.000132
https://b.thumbs.redditmedia.com/6NuddOuC2uckhyu8yRu856bEpWA9rjJAc1Yd8




********thumbnail_height********)
** DATING **


NaN    1.0
Name: thumbnail_height, dtype: float64


** DATING OVER 40 **


NaN      0.896784
140.0    0.061165
105.0    0.013578
93.0     0.004086
78.0     0.003427
           ...   
79.0     0.000132
111.0    0.000132
112.0    0.000132
86.0     0.000132
106.0    0.000132
Name: thumbnail_height, Length: 63, dtype: float64




********thumbnail_width********)
** DATING **


NaN    1.0
Name: thumbnail_width, dtype: float64


** DATING OVER 40 **


NaN      0.896784
140.0    0.102689
70.0     0.000527
Name: thumbnail_width, dtype: float64




********title********)
** DATING **


Dating                                                                                                                                                                               0.001180
What should I do?                                                                                                                                                                    0.001065
Help                                                                                                                                                                                 0.001042
Advice                                                                                                                                                                               0.000951
Need advice                                                                                                                                                                          0.000859
                                                  


** DATING OVER 40 **


Dating                                                                                      0.001846
Confused                                                                                    0.000791
OnlyFans                                                                                    0.000791
Hi                                                                                          0.000659
What sites are people over 40 using?                                                        0.000659
                                                                                              ...   
1 1/2 year update to "I won Bumble!"                                                        0.000132
A dvice on how 2 date after 40                                                              0.000132
I'm 48(F) and I don't know how to end a relationship. Honestly, I'm completely clueless.    0.000132
What happens next                                                                          




********total_awards_received********)
** DATING **


0     0.999439
1     0.000435
2     0.000046
4     0.000023
3     0.000023
5     0.000011
9     0.000011
34    0.000011
Name: total_awards_received, dtype: float64


** DATING OVER 40 **


0    0.999209
1    0.000659
2    0.000132
Name: total_awards_received, dtype: float64




********treatment_tags********)
** DATING **


[]    1.0
Name: treatment_tags, dtype: float64


** DATING OVER 40 **


[]    1.0
Name: treatment_tags, dtype: float64




********upvote_ratio********)
** DATING **


1.00    0.970162
0.99    0.012920
0.50    0.003872
0.67    0.003150
0.75    0.001672
          ...   
0.19    0.000011
0.48    0.000011
0.68    0.000011
0.52    0.000011
0.32    0.000011
Name: upvote_ratio, Length: 73, dtype: float64


** DATING OVER 40 **


1.00    0.971263
0.99    0.005668
0.50    0.003691
0.67    0.001714
0.80    0.001186
0.33    0.001055
0.88    0.001055
0.83    0.000923
0.90    0.000923
0.56    0.000659
0.86    0.000659
0.91    0.000659
0.87    0.000527
0.97    0.000527
0.84    0.000527
0.93    0.000395
0.89    0.000395
0.73    0.000395
0.60    0.000395
0.29    0.000395
0.38    0.000395
0.75    0.000395
0.70    0.000395
0.71    0.000264
0.81    0.000264
0.98    0.000264
0.92    0.000264
0.40    0.000264
0.43    0.000264
0.58    0.000264
0.78    0.000264
0.76    0.000264
0.82    0.000264
0.94    0.000264
0.57    0.000264
0.25    0.000264
0.79    0.000264
0.66    0.000132
0.63    0.000132
0.44    0.000132
0.45    0.000132
0.17    0.000132
0.19    0.000132
0.18    0.000132
0.77    0.000132
0.20    0.000132
0.64    0.000132
0.72    0.000132
0.85    0.000132
0.36    0.000132
0.08    0.000132
0.53    0.000132
0.74    0.000132
Name: upvote_ratio, dtype: float64




********url********)
** DATING **


https://www.reddit.com/r/dating/comments/nq83ep/finally_went_on_my_first_date_in_forever_meaning/     0.000011
https://www.reddit.com/r/dating/comments/q7r50o/updating_pictures_on_dating_apps/                     0.000011
https://www.reddit.com/r/dating/comments/q7ruhr/40_year_old_giving_mixed_signals/                     0.000011
https://www.reddit.com/r/dating/comments/q7ruh9/been_left_on_read_delivered/                          0.000011
https://www.reddit.com/r/dating/comments/q7rqnt/do_you_care_about_weight/                             0.000011
                                                                                                        ...   
https://www.reddit.com/r/dating/comments/qf2uwp/girl_i_like/                                          0.000011
https://www.reddit.com/r/dating/comments/qf2tid/girl_agreed_to_go_out_with_me_but_later_changed/      0.000011
https://www.reddit.com/r/dating/comments/qf2r5p/someone_told_me_that_guys_that_facetime_you_a_lot/    0.000011
h


** DATING OVER 40 **


https://sweetladies-now.life/?u=elbkd0x&amp;o=7gapkbc&amp;m=1&amp;t=CPA@13                              0.000264
https://cutt.ly/4b4n9eK                                                                                 0.000264
https://www.rentacyberfriend.com/                                                                       0.000264
https://hapiwoman.blogspot.com/2021/08/how-to-start-dating-after-divorce.html                           0.000264
https://i.redd.it/b1dfatd340r61.jpg                                                                     0.000264
                                                                                                          ...   
https://www.reddit.com/r/datingoverforty/comments/pshxha/varying_availability/                          0.000132
https://www.reddit.com/r/datingoverforty/comments/psdkod/are_there_any_men_under_6/                     0.000132
https://www.reddit.com/r/datingoverforty/comments/psd7wf/dad_41_with_full_custody_of_my_three_yo




********url_overridden_by_dest********)
** DATING **


NaN    1.0
Name: url_overridden_by_dest, dtype: float64


** DATING OVER 40 **


NaN                                                                              0.886765
https://hapiwoman.blogspot.com/2021/08/how-to-start-dating-after-divorce.html    0.000264
http://www.xaxnv.com/SHmHh                                                       0.000264
https://www.rentacyberfriend.com/                                                0.000264
https://sweetladies-now.life/?u=elbkd0x&amp;o=7gapkbc&amp;m=1&amp;t=CPA@13       0.000264
                                                                                   ...   
https://i.redd.it/34r48twycy171.jpg                                              0.000132
https://youtube.com/watch?v=eK1drF33ayE&amp;feature=share                        0.000132
https://i.imgur.com/eLNU86E.jpg                                                  0.000132
https://i.redd.it/3lx965ds52571.jpg                                              0.000132
https://i.redd.it/sojaw3645h771.png                                              0.000132
Name: url_




********whitelist_status********)
** DATING **


all_ads             0.543102
no_ads              0.436990
promo_adult_nsfw    0.015532
NaN                 0.004376
Name: whitelist_status, dtype: float64


** DATING OVER 40 **


some_ads            0.97469
promo_adult_nsfw    0.02531
Name: whitelist_status, dtype: float64




********wls********)
** DATING **


6.0    0.543102
0.0    0.436990
3.0    0.015532
NaN    0.004376
Name: wls, dtype: float64


** DATING OVER 40 **


7.0    0.97469
3.0    0.02531
Name: wls, dtype: float64






In [127]:
full_data[full_data["subreddit"]=="dating"]['is_original_content'].value_counts(normalize=True, dropna=False)

False    1.0
Name: is_original_content, dtype: float64

In [126]:
full_data[full_data["subreddit"]=="datingoverforty"]['is_original_content'].value_counts(normalize=True, dropna=False)

False    1.0
Name: is_original_content, dtype: float64

In [78]:
full_data['author_fullname'].value_counts()

t2_ezu4hwif    88
t2_ddtlt3g5    86
t2_cc54y       75
t2_c9vq38y9    70
t2_138ijx      70
               ..
t2_hsqktbln     1
t2_a2e1tl99     1
t2_hz3g43fs     1
t2_ogcey        1
t2_cxqqf2xs     1
Name: author_fullname, Length: 54811, dtype: int64

In [88]:
for i,g in full_data[['author', 'author_fullname']].groupby("author_fullname"):
    a = g['author'].to_numpy() # s.values (pandas<0.24)
    if not (a[0] == a).all():
        display(g)
#    if g['author'].value_counts(dropna=False, normalize=True)[0] < 1.0:
#        display(g)

In [38]:
full_data = full_data[full_data['author'] != "[deleted]"]

In [44]:
full_data.drop(columns=["author_flair_css_class"], inplace=True)

In [45]:
full_data

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,awarders,...,thumbnail_height,thumbnail_width,url_overridden_by_dest,crosspost_parent,crosspost_parent_list,media,media_embed,secure_media,secure_media_embed,media_metadata
0,[],False,aRandomSet,[],,text,t2_bnb9iikj,False,False,[],...,,,,,,,,,,
1,[],False,undeniablydigging,[],,text,t2_5fp97462,False,False,[],...,,,,,,,,,,
2,[],False,PenguinBluebird,[],,text,t2_5l69ma0d,False,False,[],...,,,,,,,,,,
3,[],False,West_Librarian3074,[],,text,t2_av5z15m4,False,False,[],...,,,,,,,,,,
6,[],False,sadcherryicecream,[],,text,t2_2kg67gur,False,False,[],...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,labtech89,[],,text,t2_73j83,False,False,[],...,,,,,,,,,,
96,[],False,Loud-Pomegranate,[],,text,t2_53sghr1f,False,False,[],...,,,,,,,,,,
97,[],False,Fo_Sho_77,[],,text,t2_cxqqf2xs,False,False,[],...,,,,,,,,,,
98,[],False,MasterfulBJJ,[],,text,t2_3l87qt76,False,False,[],...,70.0,140.0,https://i.redd.it/sojaw3645h771.png,,,,,,,
