# Project 3:  Reddit Posts

### Muhammad Tahir @ Data Science Immersive at General Assembly

## Part-1:  Imports and Data Extraction

## Imports

In [2]:
# Required Imports 
import pandas as pd
import numpy as np
import requests
import time
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
import regex as re
from sklearn.feature_extraction import text
import matplotlib.pyplot as plt
import seaborn as sns

## Data Extraction

In [3]:
# https://www.reddit.com/r/conspiracy/ & https://www.reddit.com/r/TheOnion/

In [4]:
# JSON listing endpoints of the two subreddits used in this project
url_onion = 'https://www.reddit.com/r/TheOnion/.json'
url_conspiracy = 'https://www.reddit.com/r/conspiracy/.json'
# Custom User-agent header passed along with every request
user_header = {'User-agent': 'JaneJacobs'} # header to prevent error 429

In [5]:
def get_subreddit(url, n_pulls, header):
    """Download up to ``n_pulls`` consecutive pages of a subreddit JSON listing.

    Every successful request appends that page's ``data.children`` entries to
    the result and stores ``data.after`` as the cursor for the next request.

    Parameters
    ----------
    url : str
        Listing endpoint, e.g. ``'https://www.reddit.com/r/TheOnion/.json'``.
    n_pulls : int
        Maximum number of requests to make.
    header : dict
        HTTP headers sent with every request.

    Returns
    -------
    list
        The raw post entries collected so far. Pulling stops early on a
        non-200 response or once the listing reports no further page.
    """
    posts = []
    after = None  # pagination cursor; None means "start from the first page"

    # One request per iteration; each request returns one page of posts.
    for pull_num in range(n_pulls):
        print(f'Pulling data attempted {pull_num+1} times')

        # Let requests build the query string so the cursor is URL-encoded
        # instead of being concatenated onto the URL by hand.
        params = None if after is None else {'after': after}
        res = requests.get(url, headers=header, params=params)

        if res.status_code != 200:
            print(f'It would appear there has been an error. Status Code: {res.status_code}')
            break

        listing = res.json()['data']
        posts.extend(listing['children'])  # posts of this page
        after = listing['after']           # cursor for the following page

        # No cursor means the listing is exhausted. Requesting again would
        # start over at the first page and re-download posts already collected.
        if not after:
            break

        time.sleep(1)  # pause one second between consecutive requests

    return posts

In [8]:
# Scrape r/TheOnion: at most 40 requests, one listing page per request
onion_posts = get_subreddit(url_onion, n_pulls = 40, header=user_header)

Pulling data attempted 1 times
Pulling data attempted 2 times
Pulling data attempted 3 times
Pulling data attempted 4 times
Pulling data attempted 5 times
Pulling data attempted 6 times
Pulling data attempted 7 times
Pulling data attempted 8 times
Pulling data attempted 9 times
Pulling data attempted 10 times
Pulling data attempted 11 times
Pulling data attempted 12 times
Pulling data attempted 13 times
Pulling data attempted 14 times
Pulling data attempted 15 times
Pulling data attempted 16 times
Pulling data attempted 17 times
Pulling data attempted 18 times
Pulling data attempted 19 times
Pulling data attempted 20 times
Pulling data attempted 21 times
Pulling data attempted 22 times
Pulling data attempted 23 times
Pulling data attempted 24 times
Pulling data attempted 25 times
Pulling data attempted 26 times
Pulling data attempted 27 times
Pulling data attempted 28 times
Pulling data attempted 29 times
Pulling data attempted 30 times
Pulling data attempted 31 times
Pulling data atte

In [9]:
# Number of raw post entries collected (before any de-duplication)
len(onion_posts)

978

In [10]:
def pull_posts(posts):
    """Build a DataFrame with the title, subreddit and id of every post.

    Parameters
    ----------
    posts : list of dict
        Post entries; each one is read through ``entry['data']['title']``,
        ``entry['data']['subreddit']`` and ``entry['data']['id']``.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``title``, ``subreddit`` and ``id``.
    """
    # Collect one flat record per post, then build the frame in a single call.
    records = [
        {
            'title': entry['data']['title'],
            'subreddit': entry['data']['subreddit'],
            'id': entry['data']['id'],
        }
        for entry in posts
    ]
    return pd.DataFrame(records)

In [11]:
# Keep only title, subreddit and id of each r/TheOnion post, then preview the first rows
df_onion = pull_posts(onion_posts)
df_onion.head()

Unnamed: 0,title,subreddit,id
0,Parliamentarian Cuts Minimum Wage From Stimulu...,TheOnion,lt7cvj
1,Democrat Reassures Friend This One Of The Good...,TheOnion,ltfhgo
2,Goals Of Biden Administration Reviewing U.S. S...,TheOnion,lszpl2
3,Biden Comforts Families Of Syrian Airstrike Vi...,TheOnion,lt6h9d
4,Florida GOP Introduces Ballotless Voting In Di...,TheOnion,lsekit


In [12]:
# Scrape r/conspiracy: at most 41 requests, one listing page per request
conspiracy_posts = get_subreddit(url_conspiracy, n_pulls = 41, header=user_header)

Pulling data attempted 1 times
Pulling data attempted 2 times
Pulling data attempted 3 times
Pulling data attempted 4 times
Pulling data attempted 5 times
Pulling data attempted 6 times
Pulling data attempted 7 times
Pulling data attempted 8 times
Pulling data attempted 9 times
Pulling data attempted 10 times
Pulling data attempted 11 times
Pulling data attempted 12 times
Pulling data attempted 13 times
Pulling data attempted 14 times
Pulling data attempted 15 times
Pulling data attempted 16 times
Pulling data attempted 17 times
Pulling data attempted 18 times
Pulling data attempted 19 times
Pulling data attempted 20 times
Pulling data attempted 21 times
Pulling data attempted 22 times
Pulling data attempted 23 times
Pulling data attempted 24 times
Pulling data attempted 25 times
Pulling data attempted 26 times
Pulling data attempted 27 times
Pulling data attempted 28 times
Pulling data attempted 29 times
Pulling data attempted 30 times
Pulling data attempted 31 times
Pulling data atte

In [14]:
# Keep only title, subreddit and id of each r/conspiracy post, then preview the first rows
df_conspiracy = pull_posts(conspiracy_posts)
df_conspiracy.head()

Unnamed: 0,title,subreddit,id
0,Never forget Gary Webb; The reporter who sacri...,conspiracy,lap3gy
1,Police are required to wear body cameras how d...,conspiracy,lucwwi
2,Stating The Obvious,conspiracy,luiepa
3,Name one time in history the people burning bo...,conspiracy,ltu54k
4,Germans scientists say that Vitamin D increase...,conspiracy,lu88lj


In [15]:
# (rows, columns) of the r/TheOnion frame before duplicate ids are removed
df_onion.shape

(978, 3)

In [16]:
# The scrape can contain the same post more than once; keep the first row per post id.
# Rebinding the name (instead of inplace=True) makes the data lineage explicit.
df_onion = df_onion.drop_duplicates(subset='id')

In [17]:
# (rows, columns) of the r/TheOnion frame after duplicate ids were removed
df_onion.shape

(928, 3)

In [18]:
# (rows, columns) of the r/conspiracy frame before duplicate ids are removed
df_conspiracy.shape

(1025, 3)

In [19]:
# The scrape can contain the same post more than once; keep the first row per post id.
# Rebinding the name (instead of inplace=True) makes the data lineage explicit.
df_conspiracy = df_conspiracy.drop_duplicates(subset='id')

In [20]:
# (rows, columns) of the r/conspiracy frame after duplicate ids were removed
df_conspiracy.shape

(924, 3)

# Part-2:  Exploratory Data Analysis

### Tokenizing and Lemmatizing Data

In [21]:
def tokenize_lemmatize(text_col):
    """Lower-case, tokenize and lemmatize every document in ``text_col``.

    Parameters
    ----------
    text_col : iterable of str
        Documents to process, e.g. a DataFrame column of post titles.

    Returns
    -------
    list of str
        One string per input document: its lemmatized alphabetic tokens
        joined by single spaces.
    """
    # The text is lower-cased before tokenizing, so [a-z] covers every letter.
    # The previous [A-z] range also matched the ASCII characters located
    # between 'Z' and 'a' ([ \ ] ^ _ `), which let punctuation leak into tokens.
    tokenizer = RegexpTokenizer(r'[a-z]+')
    lemmatizer = WordNetLemmatizer()

    lemmatized_posts = []
    for post in text_col:
        words = tokenizer.tokenize(post.lower())
        # Re-assemble the lemmatized tokens into one space-separated string.
        lemmatized_posts.append(' '.join(lemmatizer.lemmatize(word) for word in words))

    return lemmatized_posts

In [22]:
# Tokenize and lemmatize the r/TheOnion titles (one processed string per title)
lemma_token_list = tokenize_lemmatize(df_onion['title'])

In [23]:
# Spot-check one processed title
lemma_token_list[2]

'goal of biden administration reviewing u s supply chain'

In [24]:
# Fit a CountVectorizer with default settings on the lemmatized titles
# and turn them into a document-term matrix
cvec = CountVectorizer()
lemma_features = cvec.fit_transform(lemma_token_list)


In [26]:
# Show the summary of the resulting document-term matrix
lemma_features

<928x3897 sparse matrix of type '<class 'numpy.int64'>'
	with 11458 stored elements in Compressed Sparse Row format>