# This script :
## loads all data into one dataframe,
## it gets rid of empty invalid articles,
## creates new json with only valid data and more fields
## calculates popularity for all candidates

In [2]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import string
import re

## Load all the data into a dataframe

In [3]:
def return_dataframes(cnn, fn, nyt, state_id):
    CNN_df = pd.read_json(cnn)
    FN_df = pd.read_json(fn)
    NYT_df = pd.read_json(nyt)

    FN_df["candidate_name"]="NULL"
    CNN_df["candidate_name"]="NULL"

    FN_df=FN_df[["articles_date","article_text", "articles_title", "newspaper_name", "candidate_name"]]
    CNN_df.rename(columns={'articles_text': 'article_text'}, inplace=True)
    
    CNN_df["state_fk"] = state_id
    FN_df["state_fk"] = state_id
    NYT_df["state_fk"] = state_id

    return [CNN_df, FN_df, NYT_df]

frames = []
frames.extend(return_dataframes("RAW_DATA/cnn_westvirginia.json","RAW_DATA/jsfoxnews_westvirginia.json","RAW_DATA/nyt_westvirginia.json",1 ))
frames.extend(return_dataframes("RAW_DATA/cnn_virginia.json","RAW_DATA/jsfoxnews_virginia.json","RAW_DATA/nyt_westvirginia.json",2 ))
frames.extend(return_dataframes("RAW_DATA/cnn_texas.json","RAW_DATA/jsfoxnews_texas.json","RAW_DATA/nyt_texas.json",3 ))

all_data = pd.concat(frames, ignore_index=True)

In [4]:
all_data

Unnamed: 0,article_text,articles_date,articles_link,articles_title,candidate_name,first_name,last_name,newspaper_name,state_fk
0,"[Huntington, West Virginia (CNN), It was as if...","[Updated 6:14 AM ET, Thu April 27, 2017 ]",https://www.cnn.com/2017/04/27/politics/joe-ma...,[Manchin charts own path in Trump-era West Vir...,,Patrick,Morrisey,CNN,1
1,[],[],https://www.cnn.com/videos/bestoftv/2014/01/15...,[],,Patrick,Morrisey,CNN,1
2,[The legacy of the Hatfields vs. the McCoys lo...,"[Updated 10:15 PM ET, Fri May 16, 2014 ]",https://www.cnn.com/2014/05/16/us/west-virgini...,[W. Virginia AG: Middle school concealed alleg...,,Patrick,Morrisey,CNN,1
3,[West Virginia's attorney general and state le...,"[Updated 9:30 PM ET, Tue January 14, 2014 ]",https://www.cnn.com/2014/01/14/us/west-virgini...,"[More investigations launched as 180,000 West ...",,Patrick,Morrisey,CNN,1
4,"[Subscribe to these events on your , Google, ...","[Updated 8:50 AM ET, Wed March 21, 2018]",https://www.cnn.com/2018/03/06/politics/midter...,[What you need to know right now about the 201...,,Patrick,Morrisey,CNN,1
5,"[Washington (CNN), Democratic West Virginia Se...","[Updated 5:32 PM ET, Mon August 7, 2017 ]",https://www.cnn.com/2017/08/07/politics/joe-ma...,[Joe Manchin: 'Don't give a s--t' about attacks],,Patrick,Morrisey,CNN,1
6,"[Washington (CNN), President Donald Trump's Fr...","[Updated 12:54 PM ET, Fri March 23, 2018 ]",https://www.cnn.com/2018/03/23/politics/trump-...,[Trump's veto threat on spending bill gives am...,,Patrick,Morrisey,CNN,1
7,"[Washington (CNN), Several Republican candidat...","[Updated 8:32 PM ET, Wed January 3, 2018 ]",https://www.cnn.com/2018/01/03/politics/steve-...,[Some Bannon-backed Senate hopefuls begin dist...,,Patrick,Morrisey,CNN,1
8,"[Washington (CNN), Democratic incumbents, chal...","[Updated 6:06 AM ET, Mon February 5, 2018 ]",https://www.cnn.com/2018/02/05/politics/senate...,[The 10 Senate seats most likely to flip in 20...,,Patrick,Morrisey,CNN,1
9,"[ (CNN), The West Virginia Supreme Court of Ap...","[Updated 3:26 PM ET, Fri May 12, 2017 ]",https://www.cnn.com/2017/05/12/us/west-virgini...,[Court: West Virginia hate crime law doesn't i...,,Patrick,Morrisey,CNN,1


In [5]:
all_data.is_copy = False
pd.options.mode.chained_assignment = None  ## to allow references to original objects and not copies

In [6]:
for i in range(len(all_data)):
    all_data["article_text"][i] = ' '.join(all_data["article_text"][i])
    all_data["articles_title"][i] = ' '.join(all_data["articles_title"][i])

all_data=all_data[all_data.article_text != ""]
all_data = all_data.reset_index(drop=True) ## had to set it over otherwise change didn't apply
last_id = all_data.shape[0] + 1
all_data['id'] = list(range(1,last_id))

In [7]:
all_data

Unnamed: 0,article_text,articles_date,articles_link,articles_title,candidate_name,first_name,last_name,newspaper_name,state_fk,id
0,"Huntington, West Virginia (CNN) It was as if a...","[Updated 6:14 AM ET, Thu April 27, 2017 ]",https://www.cnn.com/2017/04/27/politics/joe-ma...,Manchin charts own path in Trump-era West Virg...,,Patrick,Morrisey,CNN,1,1
1,The legacy of the Hatfields vs. the McCoys loo...,"[Updated 10:15 PM ET, Fri May 16, 2014 ]",https://www.cnn.com/2014/05/16/us/west-virgini...,W. Virginia AG: Middle school concealed allege...,,Patrick,Morrisey,CNN,1,2
2,West Virginia's attorney general and state leg...,"[Updated 9:30 PM ET, Tue January 14, 2014 ]",https://www.cnn.com/2014/01/14/us/west-virgini...,"More investigations launched as 180,000 West V...",,Patrick,Morrisey,CNN,1,3
3,Subscribe to these events on your Google or ...,"[Updated 8:50 AM ET, Wed March 21, 2018]",https://www.cnn.com/2018/03/06/politics/midter...,What you need to know right now about the 2018...,,Patrick,Morrisey,CNN,1,4
4,Washington (CNN) Democratic West Virginia Sen....,"[Updated 5:32 PM ET, Mon August 7, 2017 ]",https://www.cnn.com/2017/08/07/politics/joe-ma...,Joe Manchin: 'Don't give a s--t' about attacks,,Patrick,Morrisey,CNN,1,5
5,Washington (CNN) President Donald Trump's Frid...,"[Updated 12:54 PM ET, Fri March 23, 2018 ]",https://www.cnn.com/2018/03/23/politics/trump-...,Trump's veto threat on spending bill gives amm...,,Patrick,Morrisey,CNN,1,6
6,Washington (CNN) Several Republican candidates...,"[Updated 8:32 PM ET, Wed January 3, 2018 ]",https://www.cnn.com/2018/01/03/politics/steve-...,Some Bannon-backed Senate hopefuls begin dista...,,Patrick,Morrisey,CNN,1,7
7,"Washington (CNN) Democratic incumbents, challe...","[Updated 6:06 AM ET, Mon February 5, 2018 ]",https://www.cnn.com/2018/02/05/politics/senate...,The 10 Senate seats most likely to flip in 201...,,Patrick,Morrisey,CNN,1,8
8,(CNN) The West Virginia Supreme Court of Appe...,"[Updated 3:26 PM ET, Fri May 12, 2017 ]",https://www.cnn.com/2017/05/12/us/west-virgini...,Court: West Virginia hate crime law doesn't in...,,Patrick,Morrisey,CNN,1,9
9,"White Sulphur Springs, West Virginia (CNN) The...","[Updated 4:45 PM ET, Thu April 5, 2018 ]",https://www.cnn.com/2018/04/05/politics/donald...,Trump targets Democratic senator in free-wheel...,,Patrick,Morrisey,CNN,1,10


In [8]:
len(all_data)

544