## Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


import regex as re
import nltk

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

import time

## These cells were run multiple times to collect the posts I needed

In [2]:
# Subreddit base URLs; a listing suffix such as '/hot.json' is appended to
# these when a request is made.
url_nsq, url_tifu = (
    'https://www.reddit.com/r/NoStupidQuestions',
    'https://www.reddit.com/r/tifu',
)
# Custom User-agent header sent along with every request.
header = dict([('User-agent', 'This is only a test 1.3')])

In [14]:
# Reload the posts saved by earlier scraping runs so that newly scraped posts
# are appended to them. On the very first run the CSV files do not exist yet,
# so fall back to empty frames (with the columns get_posts produces) instead
# of raising FileNotFoundError.
post_columns = ['title', 'selftext', 'subreddit', 'author', 'id']

try:
    nsq_df = pd.read_csv('nsq.csv')
except FileNotFoundError:
    nsq_df = pd.DataFrame(columns=post_columns)

try:
    tifu_df = pd.read_csv('tifu.csv')
except FileNotFoundError:
    tifu_df = pd.DataFrame(columns=post_columns)


In [15]:
# Row/column counts of the two reloaded frames, NoStupidQuestions first.
for loaded in (nsq_df, tifu_df):
    print(loaded.shape)

(1951, 5)
(846, 5)


In [6]:
def get_posts(red):
    """Flatten a Reddit listing response into a DataFrame of posts.

    Parameters
    ----------
    red : dict
        Decoded JSON of a listing. Posts are read from
        ``red['data']['children']``; each child keeps its fields under its
        own ``'data'`` key.

    Returns
    -------
    pandas.DataFrame
        One row per child with the columns ``title``, ``selftext``,
        ``subreddit``, ``author`` and ``id``. An empty listing yields a
        zero-row frame that still carries those five columns.

    Raises
    ------
    KeyError
        If ``red`` lacks ``'data'``/``'children'`` or a child lacks one of
        the five fields.
    """
    fields = ['title', 'selftext', 'subreddit', 'author', 'id']
    posts = [
        {field: child['data'][field] for field in fields}
        for child in red['data']['children']
    ]
    # Passing the columns explicitly keeps the schema stable when the listing
    # has no children.
    return pd.DataFrame(posts, columns=fields)

In [1]:
# Listing endpoints that get appended to each subreddit URL when scraping.
places = [
    '/hot.json', '/new.json', '/controversial.json', '/top.json', '/rising.json',
]

In [8]:
# Pull every listing / time-window combination for both subreddits, 50 passes
# in total. Fetched pages are collected in lists and concatenated once at the
# end rather than re-copying the growing DataFrames with pd.concat on every
# single request.
nsq_pages = [nsq_df]
tifu_pages = [tifu_df]
try:
    for pass_number in range(1, 51):
        for place in places:
            for t in ['hour', 'day', 'week', 'month', 'year', 'all']:
                for base_url, pages in ((url_nsq, nsq_pages), (url_tifu, tifu_pages)):
                    res = requests.get(base_url + place,
                                       params={'limit': 100, 't': t},
                                       headers=header,
                                       timeout=30)  # do not hang forever on a stalled connection
                    if res.status_code == 200:
                        pages.append(get_posts(res.json()))
                    else:
                        # A non-200 body (e.g. when rate limited) is not a
                        # listing, so report it and move on instead of crashing.
                        print('skipped', res.url, 'status', res.status_code)
                # Pause between request pairs.
                time.sleep(1)
        # Progress marker: number of completed passes.
        print(pass_number)
finally:
    # Runs even if the loop is interrupted or a request raises, so the pages
    # fetched so far still end up in nsq_df / tifu_df.
    nsq_df = pd.concat(nsq_pages, axis='index')
    tifu_df = pd.concat(tifu_pages, axis='index')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


In [9]:
# Shapes after removing exact duplicate rows; the frames themselves are not
# modified by this cell.
for scraped in (nsq_df, tifu_df):
    print(scraped.drop_duplicates().shape)

(1951, 5)
(846, 5)


In [10]:
# Keep only the first occurrence of each fully identical row.
tifu_df = tifu_df.drop_duplicates()

In [11]:
# Keep only the first occurrence of each fully identical row.
nsq_df = nsq_df.drop_duplicates()

In [12]:
# Combined row count of both frames.
nsq_df.shape[0] + tifu_df.shape[0]

2797

In [13]:
# Write both frames to disk (without the index) so a later run can reload them.
for frame, csv_path in ((nsq_df, 'nsq.csv'), (tifu_df, "tifu.csv")):
    frame.to_csv(csv_path, index=False)

In [16]:
# Eyeball 25 randomly chosen tifu rows (unseeded, so the rows differ per run).
tifu_df.sample(n=25)

Unnamed: 0,author,id,selftext,subreddit,title
645,OdinsBHole,bvu086,My son was born with a condition called Pectus...,tifu,TIFU by giving my son permission to beat his b...
188,thefriedeggro11,c8hqd3,Small FU but I'm sitting here in the restauran...,tifu,TIFU by paying for nothing just to avoid a cre...
505,chadowws,c89otk,So this happened 2 years ago and i'm still not...,tifu,TIFU by accidentally starting a group call whi...
115,IamcatMeowMeow,caeejc,"So yes, this wasn't today, but was about 3 wee...",tifu,TIFU by not having sex with my favourite porn ...
729,zjkelsey,7eueeg,"As usual, this did not happen today...but rath...",tifu,TIFU by not going to the hospital when I knew ...
743,not_homestuck,c92srv,This isn't a dramatic story but I'm cleaning o...,tifu,TIFU by accidentally causing a bomb scare
494,slimmyboy007,c96oa9,Just got back from a midweek trip for my Gfs b...,tifu,Tifu by taking holiday pictures with a dildo i...
20,Constant_Ponderer,ca9bi0,Note: English is not my first language.\n\n&am...,tifu,"TIFU by going to work while on acid, thinking ..."
377,xCuri0,bgctrh,So I have this cracked Mi A1 that I haven't u...,tifu,TIFU by sticking my old phone up my ass
100,Bloodgrimx,caheyq,TLDR at bottom\n\nSo earlier while my gf was a...,tifu,TIFU by trying to jack off Nsfw


In [17]:
# Eyeball 25 randomly chosen NoStupidQuestions rows (unseeded, so the rows
# differ per run).
nsq_df.sample(n=25)

Unnamed: 0,author,id,selftext,subreddit,title
1369,TheVeryWorstLuck,catf4z,,NoStupidQuestions,To what percentage would you say chewing gum i...
884,MrAntiAnti,c9168i,,NoStupidQuestions,How do they translate Yoda into other language...
637,UnlikelyWonder,bpzqli,What is even happening? I feel helpless. I don...,NoStupidQuestions,What can we do to stop these crazy abortion laws?
796,afmuente,2sy0jz,(Country) No googling please. \nI'd like to kn...,NoStupidQuestions,Have you guys ever heard of Peru?
1345,lenny_hernandez,cat6hp,"Trash cans are in all the bathrooms, but i've ...",NoStupidQuestions,"For the enviroment, is better to throw the TP ..."
1434,trialrun1,cb13n2,"For a super simplified example, Lets say that ...",NoStupidQuestions,What do servers do when the bill and tip are a...
1067,bokk6,b7y04k,If I were to yell at a bug through a paper tow...,NoStupidQuestions,How loud would I have to yell at a bug to kill...
1354,ISuckAtGaemz,catadc,I heard a friend say he got a cold from having...,NoStupidQuestions,Can Diseases Other Than STIs Spread Via Sex?
666,XerxesGWX,bdl506,"Okay, before I get hate... I'm black and I'm a...",NoStupidQuestions,Does anyone know where I can buy a KKK hood an...
423,gridzbispudvetch,caeeyo,"sex-crazed is not really the right word, i kno...",NoStupidQuestions,"Are straight guys really as ""sex-crazed"" as i ..."


## My classes were unbalanced, so I upsampled one and downsampled the other until I had an even split

In [20]:
# Draw 1200 rows per class with a fixed seed. tifu_df is sampled with
# replacement, nsq_df without.
# NOTE(review): sampling with replacement repeats rows; if this frame is later
# split into train/test sets the same post could land on both sides - confirm.
sample_kwargs = {'n': 1200, 'random_state': 42}
df_tifu_sam = tifu_df.sample(replace=True, **sample_kwargs)
df_nsq_sam = nsq_df.sample(**sample_kwargs)
print(df_tifu_sam.shape, df_nsq_sam.shape)

(1200, 5) (1200, 5)


In [35]:
# Stack the two class samples (tifu rows first) under a fresh 0..n-1 index.
df = pd.concat((df_tifu_sam, df_nsq_sam), axis='index', ignore_index=True)
df.head()

Unnamed: 0,author,id,selftext,subreddit,title
0,Funguy0623,cafv59,This LITERALLY just happened. I was on the pho...,tifu,TIFU by accidentally causing a cat to get run ...
1,Throwaway678495,82vog8,"So unlike many of the stories here, this one d...",tifu,TIFU by getting a girls phone number
2,capj23,c3lqoh,"Obligatory, this happened 14 years ago and I w...",tifu,TIFU by climbing too far on my uncle
3,dognass,cacjwi,"Right not sure how to go about this, but here ...",tifu,TIFU Dog and Ass
4,SourBitchKids,caj1fz,I recently came into the possession of a new k...,tifu,TIFU by smoking too much weed and going into a...


In [36]:
# Row and column counts of the combined frame.
df.shape

(2400, 5)

In [37]:
# Number of rows per subreddit label in the combined frame.
df.subreddit.value_counts()

tifu                 1200
NoStupidQuestions    1200
Name: subreddit, dtype: int64

In [38]:
# Count of missing values in each column.
df.isna().sum()

author         0
id             0
selftext     519
subreddit      0
title          0
dtype: int64

In [39]:
# Replace every missing value with an empty string, then re-check the
# per-column null counts.
df = df.fillna('')
df.isna().sum()

author       0
id           0
selftext     0
subreddit    0
title        0
dtype: int64

## Getting my target class set up

In [40]:
# Encode the subreddit label as a single 0/1 column. drop_first removes the
# first category in sorted order ('NoStupidQuestions'), leaving
# `subreddit_tifu` (1 = tifu, 0 = NoStupidQuestions).
# dtype=int pins integer output: without it the dummy dtype depends on the
# pandas version (bool in pandas >= 2.0), which would write True/False rather
# than 1/0 into the saved CSV.
df = pd.get_dummies(df, columns=['subreddit'], drop_first=True, dtype=int)
df.head()

Unnamed: 0,author,id,selftext,title,subreddit_tifu
0,Funguy0623,cafv59,This LITERALLY just happened. I was on the pho...,TIFU by accidentally causing a cat to get run ...,1
1,Throwaway678495,82vog8,"So unlike many of the stories here, this one d...",TIFU by getting a girls phone number,1
2,capj23,c3lqoh,"Obligatory, this happened 14 years ago and I w...",TIFU by climbing too far on my uncle,1
3,dognass,cacjwi,"Right not sure how to go about this, but here ...",TIFU Dog and Ass,1
4,SourBitchKids,caj1fz,I recently came into the possession of a new k...,TIFU by smoking too much weed and going into a...,1


In [41]:
# Join title and body (separated by one space) into a single text field, then
# look at the combined text of the row labelled 44.
df['text'] = df['title'].str.cat(df['selftext'], sep=" ")
df.loc[44, 'text']

'TIFU by accidentally humiliating an old friend in social media after he announced he and his wife are having a child. TL;DR at the bottom.\n\nSo I have a friend from high-school that I haven\'t seen in forever. He is an absolute master of kindness. Driving for two hours to get you out of trouble after not seeing you for over three years type of kindness. Top bloke. Best dude you\'ll ever meet. Wouldn\'t hurt a fly. You get the picture.\n\nToday he posted a super ecstatic picture of him and his wife with the caption "About to be a family of three! Finally!"\n\nHis face is big. Double chin kind of big and very unlike him. He was always slim and fit growing up and this just looked fake and not at all like him. It looked so unlike him I automatically assumed this was on of those ridiculous Snapchat filters all the cool kids are using and that he had jokingly used it for a fun twist, so I commented something along the lines of "Congratulations man! Let\'s just hope the kid doesn\'t get you

## One feature I put in because it seemed noteworthy

In [43]:
# line_count is the number of newline characters found in each post's text.
newline_totals = df['text'].str.count('\n')
df['line_count'] = newline_totals
df.head()

Unnamed: 0,author,id,selftext,title,subreddit_tifu,text,line_count
0,Funguy0623,cafv59,This LITERALLY just happened. I was on the pho...,TIFU by accidentally causing a cat to get run ...,1,TIFU by accidentally causing a cat to get run ...,2
1,Throwaway678495,82vog8,"So unlike many of the stories here, this one d...",TIFU by getting a girls phone number,1,TIFU by getting a girls phone number So unlike...,2
2,capj23,c3lqoh,"Obligatory, this happened 14 years ago and I w...",TIFU by climbing too far on my uncle,1,TIFU by climbing too far on my uncle Obligator...,8
3,dognass,cacjwi,"Right not sure how to go about this, but here ...",TIFU Dog and Ass,1,TIFU Dog and Ass Right not sure how to go abou...,16
4,SourBitchKids,caj1fz,I recently came into the possession of a new k...,TIFU by smoking too much weed and going into a...,1,TIFU by smoking too much weed and going into a...,4


In [44]:
# Summary statistics of the newline-count feature.
df['line_count'].describe()

count    2400.000000
mean        9.667083
std        15.020023
min         0.000000
25%         0.000000
50%         4.000000
75%        14.000000
max       139.000000
Name: line_count, dtype: float64

In [48]:
# Save the prepared frame (index omitted).
df.to_csv(path_or_buf='final.csv', index=False)