In [99]:
import os
import pandas as pd
import numpy as np

In [100]:
# Resolve the input and output folders relative to the current directory
current_dir = os.curdir
data_dir, output_dir = (
    os.path.join(current_dir, folder) for folder in ("data", "data_clean")
)

# Build the path of each network's CSV file inside the data folder
abc_data_dir, bbc_data_dir, cbs_data_dir, cnn_data_dir = (
    os.path.join(data_dir, filename)
    for filename in ("abc-news.csv", "bbc-news.csv", "cbs-news.csv", "cnn-news.csv")
)

# Load every CSV file; all four are read with the utf-16 encoding
abc_data_csv, bbc_data_csv, cbs_data_csv, cnn_data_csv = (
    pd.read_csv(csv_path, encoding="utf-16")
    for csv_path in (abc_data_dir, bbc_data_dir, cbs_data_dir, cnn_data_dir)
)

In [101]:
# Work on independent copies so the frames read from disk stay untouched
abc_data_df, bbc_data_df, cbs_data_df, cnn_data_df = (
    pd.DataFrame(raw_frame.copy())
    for raw_frame in (abc_data_csv, bbc_data_csv, cbs_data_csv, cnn_data_csv)
)

In [102]:
# Report the number of rows in each network's frame, one line per network
print(
    *(
        f"{prefix}{len(frame)}"
        for prefix, frame in (
            ("abc: ", abc_data_df),
            ("bbc: ", bbc_data_df),
            ("cbs: ", cbs_data_df),
            ("cnn: ", cnn_data_df),
        )
    ),
    sep="\n",
)

abc: 43280
bbc: 21119
cbs: 35085
cnn: 31696


In [103]:
# Columns removed from every network's frame
col_ls = ['id', 'page_id', 'caption', 'link', 'picture']

# The frames are still modified in place, so any other reference to them sees the change.
# errors="ignore" makes the cell safe to re-run: without it a second execution raises
# KeyError because the columns were already removed by the first one.
# Note that a column name missing from a frame is skipped silently as a consequence.
for news_df in (abc_data_df, bbc_data_df, cbs_data_df, cnn_data_df):
    news_df.drop(columns=col_ls, inplace=True, errors="ignore")


In [104]:
# Collect the distinct post_type / status_type labels and give each an integer code.
# Missing values are removed with dropna() for both columns, so a NaN can never become
# a dictionary key and a genuine label is never discarded by a string comparison.
post_types_unique = pd.unique(abc_data_df['post_type'].dropna())
post_types_factorize = pd.factorize(post_types_unique, sort=True)[0]

# status_types_unique stays a plain list; pd.factorize is handed an ndarray because
# passing a list to it is deprecated in recent pandas releases.
status_types_unique = pd.unique(abc_data_df['status_type'].dropna()).tolist()
status_types_factorize = pd.factorize(
    np.asarray(status_types_unique, dtype=object), sort=True
)[0]

# Create dictionary for post_types and status_types (label -> code).
# With sort=True each code is the label's position among the sorted labels.
post_types_dict = dict(zip(post_types_unique, post_types_factorize))
status_types_dict = dict(zip(status_types_unique, status_types_factorize))

# Define function to assign kv pairs to post_types and status_types
def post_cat(x):
    """Return the integer category assigned to a post type label.

    The label is looked up in ``post_types_dict`` by exact match. The previous
    implementation used ``x in key``, which is a substring test and returned the
    category of the first key that merely contained ``x``.

    Parameters
    ----------
    x : object
        A value from a ``post_type`` column.

    Returns
    -------
    int or None
        The category stored in ``post_types_dict`` for ``x``; ``-1`` when ``x``
        is not a string (for example a missing value, which previously raised
        ``TypeError``); ``None`` when ``x`` is a string with no entry in the
        dictionary.
    """
    # Non-string values (NaN for missing cells) get the same -1 sentinel as status_cat
    if not isinstance(x, str):
        return -1
    return post_types_dict.get(x)

def status_cat(x):
    """Return the integer category assigned to a status type label.

    The label is looked up in ``status_types_dict`` by exact match. The previous
    implementation used ``x in key``, which is a substring test and returned the
    category of the first key that merely contained ``x``.

    Parameters
    ----------
    x : object
        A value from a ``status_type`` column.

    Returns
    -------
    int or None
        The category stored in ``status_types_dict`` for ``x``; ``-1`` when ``x``
        is not a string (for example a missing value); ``None`` when ``x`` is a
        string with no entry in the dictionary.
    """
    # Non-string values (NaN for missing cells) keep the -1 sentinel
    if not isinstance(x, str):
        return -1
    return status_types_dict.get(x)

# Encode both type columns of the ABC frame and store the codes as new columns.
# Both encodings are computed first, then assigned together.
abc_data_df['post_category'], abc_data_df['status_category'] = (
    abc_data_df['post_type'].apply(post_cat),
    abc_data_df['status_type'].apply(status_cat),
)


['shared_story', 'published_story', 'added_photos', 'mobile_status_update', 'added_video', 'created_event']


In [105]:
# Render the ABC frame to inspect the post_category and status_category columns added above
abc_data_df

Unnamed: 0,name,message,description,post_type,status_type,likes_count,comments_count,shares_count,love_count,wow_count,haha_count,sad_count,thankful_count,angry_count,posted_at,post_category,status_category
0,Chief Justice Roberts Responds to Judicial Eth...,Roberts took the unusual step of devoting the ...,PAUL J. RICHARDS/AFP/Getty Images Chief Justic...,link,shared_story,61,27,12,0,0,0,0,0,0,2012-01-01 00:30:26,1,5
1,"With Reservations, Obama Signs Act to Allow De...",Do you agree with the new law?,"In his last official act of business in 2011, ...",link,shared_story,120,523,171,0,0,0,0,0,0,2012-01-01 01:08:58,1,5
2,Wishes For 2012 to Fall on Times Square,Some pretty cool confetti will rain down on Ne...,The wishes of thousands of people will flutter...,link,published_story,271,31,0,0,0,0,0,0,0,2012-01-01 02:00:37,1,4
3,Mitt Romney Vows to Veto Dream Act if President,,"Eric Gay/AP Photo SIOUX CITY, Iowa – Mitt Romn...",link,shared_story,140,188,23,0,0,0,0,0,0,2012-01-01 02:35:20,1,5
4,"NY Pharmacy Shootout Leaves Suspect, ATF Agent...",The pharmacy was held up by a man seeking pres...,A shootout at a suburban New York family pharm...,link,shared_story,59,51,34,0,0,0,0,0,0,2012-01-01 03:36:01,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43275,New Jersey Family Protects Their Donald Trump ...,This New Jersey family went to extraordinary m...,,video,added_video,1703,642,485,207,66,718,12,0,46,2016-11-07 23:03:33,4,1
43276,ABC News Politics,,Hours before the pivotal state of Pennsylvania...,video,mobile_status_update,1102,500,0,246,10,64,11,0,457,2016-11-07 23:19:27,4,3
43277,Trump's Tax Returns Remain Election Mystery,"As Election Day arrives, one of the campaign's...",,link,shared_story,4772,4011,3470,91,244,1232,65,0,2592,2016-11-07 23:27:30,1,5
43278,Donald Trump's Possible Paths to Victory,Donald J. Trump's path to victory isn't clear ...,,link,shared_story,1953,645,99,444,11,161,22,0,434,2016-11-07 23:45:07,1,5
