# Project 3: Web APIs & Classification

## Problem Statement

Given posts from two Using Reddit's API, you'll collect posts from two subreddits, r/worldnews and r/todayilearned, we will use NLP to train a classifier on which subreddit a given post came from.

## Executive Summary

### Contents:
- [2017 Data Import & Cleaning](#Data-Import-and-Cleaning)
- [2018 Data Import and Cleaning](#2018-Data-Import-and-Cleaning)
- [Exploratory Data Analysis](#Exploratory-Data-Analysis)
- [Data Visualization](#Visualize-the-data)
- [Descriptive and Inferential Statistics](#Descriptive-and-Inferential-Statistics)
- [Outside Research](#Outside-Research)
- [Conclusions and Recommendations](#Conclusions-and-Recommendations)

In [215]:
import pandas as pd
import requests
import time
import random

## Data Import and Cleaning

In [216]:
url = 'https://www.reddit.com/r/worldnews.json'
url2 = 'https://www.reddit.com/r/todayilearned.json'

In [217]:
headers = {'User-agent': 'panzer 2.01'}

In [218]:
posts = []
after = None

# Number of times the API is scraped.
for i in range(int(1000/25)):
    
    # Load indicator.
    if i%5 == 0:
        print(f'scrape {i} in progress...')
    
    # Use original url for first scrape.
    if after is None:
        params = {}
    else:
        params = {'after': after}

    r = requests.get(url, params=params, headers=headers)
    
    if r.status_code == 200:
        js = r.json()
        posts.extend(js['data']['children'])
        after = js['data']['after']
    else:
        print(r.status_code)
        break
    
    # Randomize sleep timing.
    time.sleep(random.random()*10)   
    

scrape 0 in progress...
scrape 5 in progress...
scrape 10 in progress...
scrape 15 in progress...
scrape 20 in progress...
scrape 25 in progress...
scrape 30 in progress...
scrape 35 in progress...


In [220]:
len(set([p['data']['name'] for p in posts]))

779

In [221]:
len(posts)

979

In [8]:
r = requests.get(url, headers=headers)
if r.status_code == 200:
    js = r.json()
else:
    print('unable to retrieve link.')

In [23]:
df = pd.DataFrame(js)

In [26]:
df

Unnamed: 0,kind,data
after,Listing,t3_e4w0yz
before,Listing,
children,Listing,"[{'kind': 't3', 'data': {'approved_at_utc': No..."
dist,Listing,25
modhash,Listing,


In [22]:
js['data']['after']

't3_e4w0yz'

In [27]:
url = url + '?after=' + df['data']['after']

In [28]:
url

'https://www.reddit.com/r/worldnews.json?after=t3_e4w0yz'

In [29]:
r = requests.get(url, headers=headers)
if r.status_code == 200:
    js = r.json()
else:
    print('unable to retrieve link.')

In [141]:
len(js['data']['children'])

25

In [171]:
js['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'worldnews',
   'selftext': '',
   'author_fullname': 't2_4h1tivss',
   'saved': False,
   'mod_reason_title': None,
   'gilded': 0,
   'clicked': False,
   'title': 'Politician who opposes mandatory chickenpox vaccinations contracts chickenpox',
   'link_flair_richtext': [],
   'subreddit_name_prefixed': 'r/worldnews',
   'hidden': False,
   'pwls': 6,
   'link_flair_css_class': None,
   'downs': 0,
   'hide_score': True,
   'name': 't3_e5c07x',
   'quarantine': False,
   'link_flair_text_color': 'dark',
   'author_flair_background_color': None,
   'subreddit_type': 'public',
   'ups': 80,
   'total_awards_received': 0,
   'media_embed': {},
   'author_flair_template_id': None,
   'is_original_content': False,
   'user_reports': [],
   'secure_media': None,
   'is_reddit_media_domain': False,
   'is_meta': False,
   'category': None,
   'secure_media_embed': {},
   'link_flair_text': None,
   'can_mod_post': False,
  

In [169]:
df2 = pd.DataFrame(js['data']['children'])

In [168]:
df2

Unnamed: 0,kind,data
0,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
1,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
2,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
3,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
4,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
5,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
6,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
7,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
8,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."
9,t3,"{'approved_at_utc': None, 'subreddit': 'worldn..."


In [181]:
df2['data']

0     {'approved_at_utc': None, 'subreddit': 'worldn...
1     {'approved_at_utc': None, 'subreddit': 'worldn...
2     {'approved_at_utc': None, 'subreddit': 'worldn...
3     {'approved_at_utc': None, 'subreddit': 'worldn...
4     {'approved_at_utc': None, 'subreddit': 'worldn...
5     {'approved_at_utc': None, 'subreddit': 'worldn...
6     {'approved_at_utc': None, 'subreddit': 'worldn...
7     {'approved_at_utc': None, 'subreddit': 'worldn...
8     {'approved_at_utc': None, 'subreddit': 'worldn...
9     {'approved_at_utc': None, 'subreddit': 'worldn...
10    {'approved_at_utc': None, 'subreddit': 'worldn...
11    {'approved_at_utc': None, 'subreddit': 'worldn...
12    {'approved_at_utc': None, 'subreddit': 'worldn...
13    {'approved_at_utc': None, 'subreddit': 'worldn...
14    {'approved_at_utc': None, 'subreddit': 'worldn...
15    {'approved_at_utc': None, 'subreddit': 'worldn...
16    {'approved_at_utc': None, 'subreddit': 'worldn...
17    {'approved_at_utc': None, 'subreddit': 'wo

In [178]:
dft = pd.DataFrame(df2['data'].apply(pd.Series))

In [179]:
dft

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,num_comments,is_video
0,,worldnews,,t2_4h1tivss,False,,0,False,Politician who opposes mandatory chickenpox va...,[],...,,/r/worldnews/comments/e5c07x/politician_who_op...,all_ads,False,https://www.independent.co.uk/news/world/europ...,22594648,1575349000.0,0,12,False
1,,worldnews,,t2_osv4,False,,0,False,Ukrainian politician’s three-year-old son kill...,[],...,,/r/worldnews/comments/e53mx4/ukrainian_politic...,all_ads,False,https://www.independent.co.uk/news/world/europ...,22594648,1575313000.0,0,31,False
2,,worldnews,,t2_sejl69r,False,,0,False,Hong Kong protests: over 380K people rally in ...,[],...,,/r/worldnews/comments/e4vg0s/hong_kong_protest...,all_ads,False,https://www.aljazeera.com/news/2019/12/hong-ko...,22594648,1575271000.0,0,262,False
3,,worldnews,,t2_4h1tivss,False,,0,False,'We Don't Know a Planet Like This': CO2 Levels...,[],...,,/r/worldnews/comments/e5bymf/we_dont_know_a_pl...,all_ads,False,https://www.commondreams.org/news/2019/05/13/w...,22594648,1575349000.0,0,9,False
4,,worldnews,,t2_612zd,False,,0,False,Millionaires pay little inheritance tax in Ger...,[],...,,/r/worldnews/comments/e548jn/millionaires_pay_...,all_ads,False,https://www.dw.com/en/millionaires-pay-little-...,22594648,1575316000.0,0,94,False
5,,worldnews,,t2_4vvkkjr,False,,0,False,Hong Kong Democracy Slogans Heard at Mainland ...,[],...,,/r/worldnews/comments/e4x4n0/hong_kong_democra...,all_ads,False,https://time.com/5742165/maoming-protest-hong-...,22594648,1575282000.0,1,149,False
6,,worldnews,,t2_4h1tivss,False,,0,False,Ships owned by cruise giant - Carnival Corpora...,[],...,,/r/worldnews/comments/e5cbu6/ships_owned_by_cr...,all_ads,False,https://ecohustler.com/article/one-corporation...,22594648,1575351000.0,0,13,False
7,,worldnews,,t2_2yqt,False,,0,False,Trump lifted a mysterious hold on military aid...,[],...,,/r/worldnews/comments/e59yaa/trump_lifted_a_my...,all_ads,False,https://www.businessinsider.com/trump-lifts-my...,22594648,1575340000.0,0,20,False
8,,worldnews,,t2_chlm7,False,,0,False,Half of all homeless people may have had traum...,[],...,,/r/worldnews/comments/e59ncq/half_of_all_homel...,all_ads,False,https://www.theguardian.com/society/2019/dec/0...,22594648,1575338000.0,0,26,False
9,,worldnews,,t2_4wcibogo,False,,0,False,Lawyer for Giuliani associate petitions court ...,[],...,,/r/worldnews/comments/e56axj/lawyer_for_giulia...,all_ads,False,http://abcnews.go.com/Politics/lawyer-giuliani...,22594648,1575324000.0,0,9,False


In [44]:
df_posts = pd.DataFrame(df['data']['children'])

TypeError: list indices must be integers or slices, not str

In [73]:
len(df_posts['data'][0].keys())

98

In [62]:
df_test = pd.DataFrame(list(df_posts['data']))

In [72]:
len(df_test.columns)

101

In [76]:
diff = []
for k in list(df_test.columns):
    if k not in df_posts['data'][0].keys():
        diff.append(k)

In [77]:
diff

['crosspost_parent_list', 'crosspost_parent', 'link_flair_template_id']

In [96]:
js['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'worldnews',
   'selftext': '',
   'author_fullname': 't2_4h1tivss',
   'saved': False,
   'mod_reason_title': None,
   'gilded': 0,
   'clicked': False,
   'title': 'Politician who opposes mandatory chickenpox vaccinations contracts chickenpox',
   'link_flair_richtext': [],
   'subreddit_name_prefixed': 'r/worldnews',
   'hidden': False,
   'pwls': 6,
   'link_flair_css_class': None,
   'downs': 0,
   'hide_score': True,
   'name': 't3_e5c07x',
   'quarantine': False,
   'link_flair_text_color': 'dark',
   'author_flair_background_color': None,
   'subreddit_type': 'public',
   'ups': 80,
   'total_awards_received': 0,
   'media_embed': {},
   'author_flair_template_id': None,
   'is_original_content': False,
   'user_reports': [],
   'secure_media': None,
   'is_reddit_media_domain': False,
   'is_meta': False,
   'category': None,
   'secure_media_embed': {},
   'link_flair_text': None,
   'can_mod_post': False,
  