# Data Consolidation Script

In [174]:
from newsapi import NewsApiClient
import pandas as pd
import json
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer

## Gathering API Data

In [102]:
# Query NewsAPI for every (sort order, search term, page) combination and
# accumulate the returned article records in `new_lst`.
import os

# NOTE(review): the API key is committed in plain text. Prefer supplying it
# through the NEWSAPI_KEY environment variable (and rotate the exposed key);
# the literal is kept only as a fallback so existing runs behave the same.
newsapi = NewsApiClient(
    api_key=os.environ.get('NEWSAPI_KEY', 'fcf462084cb645dfa3c112afadf3f50d'))
new_lst = []
for j in ['relevancy', 'popularity', 'publishedAt']:
    # "Democrat" and 'President' must be separate items: without the comma
    # Python concatenates adjacent string literals into "DemocratPresident",
    # so neither term would actually be searched.
    for i in ['Biden', 'Trump', "Ballot",
              "Vote", "GOP", "Democrat", 'President']:
        # pages 1 through 5 of each query
        for page in range(1, 6):
            articles = newsapi.get_everything(q=i,
                                              language='en',
                                              page=page, sort_by=j)
            new_lst += articles['articles']

In [116]:
# create a copy of the list and remove all duplicates, keeping the first
# occurrence of each article in its original order
org = new_lst.copy()
no_duplicates = []
# The records are indexed by string key further down (dict-like, hence not
# hashable), so duplicates are detected by equality against the list built so
# far rather than with a set.
for article in org:
    if article not in no_duplicates:
        no_duplicates.append(article)

In [122]:
# filter out articles with null/incompatible data: no description, authored
# by "CBS News", or whose title is the "[Removed]" placeholder
filtered = [
    i for i in no_duplicates
    if i["description"] is not None
    and i["author"] != "CBS News"
    and i["title"] != "[Removed]"
]

## Data Cleaning

In [None]:
# create function to match keywords to articles using stemming capabilities
def match(content, categories):
    """Return the first category that has a keyword present in *content*.

    Content words and keywords are both reduced with the module-level
    ``stemmer`` before being compared, so inflected forms of a keyword can
    match.

    Args:
        content: Iterable of words to inspect, or ``None``.
        categories: Mapping of category name to an iterable of keyword
            strings. Categories are tried in the mapping's iteration order
            and the first one with a matching keyword wins.

    Returns:
        The matching category name, or ``"Other U.S. News"`` when *content*
        is ``None``/empty or no keyword matches.
    """
    default = "Other U.S. News"
    if not content:
        return default

    # Stem the content once up front (it does not depend on the keyword being
    # tested); a set makes each keyword lookup a constant-time membership test.
    stemmed_content = {stemmer.stem(word) for word in content}

    for category, keywords in categories.items():
        if any(stemmer.stem(keyword.lower()) in stemmed_content
               for keyword in keywords):
            return category

    return default

In [138]:
# initializations
final = []  # receives the articles appended by the cleaning loop
sia = SentimentIntensityAnalyzer()
stemmer = PorterStemmer()  # used by match() to stem words and keywords

# category name -> keywords that assign an article to it; match() walks the
# categories in this order and returns the first one that has a hit
categories = {
    "Trump": [
        "trump", "donald", "republican", "conservative", "abortion",
    ],
    "Biden": [
        "biden", "joe", "democrat", "liberal", "democrats", "clinton",
    ],
    "Election": [
        "election", "vote", "2024", "primaries", "u.s.", "ballot",
        "campaign", "candidate", "voting", "presidential",
    ],
    "Policy": [
        "policy", "government", "congress", "senate",
        "president", "court", "state", "supreme", "house",
        "legislation", "law", "regulation",
    ],
}

# loop through results: clean each article in place and collect the ones that
# could be fully processed in `final`
for article in filtered:

    # remove unnecessary data columns
    # NOTE(review): "source.id" looks like a flattened column name; if the
    # articles carry a nested "source" dict instead, this pop is a no-op —
    # confirm which shape the data has.
    for key in ["urlToImage", "source.id"]:
        article.pop(key, None)

    # only append data that can be cleaned/has good results; everything is
    # parsed into locals first so a failing article is skipped without being
    # left half-modified
    try:
        # clean titles
        title = article["title"].strip()

        # calculate total character counts: the second-to-last token of the
        # content is parsed as a "[+<n>" counter and 200 is added to it
        # (presumably the length of the visible snippet — TODO confirm)
        raw_content = article["content"]
        character_count = int(raw_content.split()[-2].strip("[+")) + 200

        # split and clean content/description (the trailing tokens dropped
        # here mirror the original slicing)
        content = raw_content.lower().split()[:-3]
        description = article["description"].lower().split()[:-1]
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # missing/None field, or content without a parsable trailing counter
        continue

    article["title"] = title
    article["character_count"] = character_count
    article["content"] = content
    article["description"] = description

    # calculate sentiment scores based on each description
    article["sentiment"] = sia.polarity_scores(" ".join(description))["compound"]

    # categorize data
    article["category"] = match(description, categories)

    final.append(article)
        


In [179]:
# output json file
# NOTE(review): this writes `filtered`, not the `final` list that the cleaning
# loop appends to — confirm which of the two is meant to be exported.
# The context manager closes the file even if serialization raises.
with open("final_data.json", "w") as out_file:
    json.dump(filtered, out_file, indent=1)