In [20]:
import os
from pathlib import Path
import pandas as pd
from newsapi import NewsApiClient
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dotenv import load_dotenv
load_dotenv()
analyzer = SentimentIntensityAnalyzer()

In [21]:
# news api for collecting data
api_key = os.getenv("news_api")
newsapi = NewsApiClient(api_key=api_key)

In [26]:
# import our initial batch of 20 companies (10 bankrupt, 10 healthy)
filepath = Path("../Project_2/Stocks_MasterList_Test.csv")
stocks_df = pd.read_csv(filepath, parse_dates=True, infer_datetime_format=True)
stocks_df.head()

Unnamed: 0,Symbol,Company Name,Price Performance (4 Weeks),Sector,Market Capitalization,Binary
0,Test,Test,Test,Test,Test,Test


In [23]:
company_info_df = stocks_df[['Symbol', 'Company Name']].copy()
company_info_df.head()

Unnamed: 0,Symbol,Company Name
0,Test,Test


In [24]:
search_list = []

for index, row in company_info_df.iterrows():
    ticker = row['Symbol']
    company_name = row['Company Name']
    search_string = ticker + ' ' + "AND" + ' ' + company_name
    search_list.append(search_string)

print(search_list)

['Test AND Test']


In [16]:
# using this list to test stocks, given stocks only work with newsapi if they are common

search_list = ['facebook', 
               'apple', 
               'tesla', 
               'google', 
               'palantir', 
               'amc', 
               'amazon', 
               'netflix', 
               'at&t', 
               'nvidia', 
               'pfizer', 
               'moderna', 
               'nikola', 
               'beyond meat', 
               'gamestop', 
               'starbucks', 
               'wells fargo', 
               'salesforce', 
               'microsoft',
               'verizon',    
               'carnival cruise',
               'macys',
               'nordstrom',
               'teladoc',
               'norwegian cruise',
               'peloton',
               'lululemon',
               'wayfair',
               'chevron',
               'royal caribbean cruise',
               'softbank',
               'astraZeneca',
               'johnson & johnson',
               'twitter',
               'costco',
               'zillow',
               'gilead',
               'boeing',
               'okta',
               'nike',
               'home depot',
               'shopify',
               'uber',
               'lyft',
               'doordash',
               'airbnb',
               'paypal',
               'snowflake',
               'quantumscape',
               'activision',
               'draftKings',
               'lordstown',
               'walgreens',
               'spotify',
               'fubotv',
               'northrop grumman',
               'berkshire hathaway',
               'cisco',
               'ebay',
               'qualcomm',
               'corning',
               'oracle',
               'intel',
               'seagate',
               'nextera',
               'zscaler',
               'crowdstrike',
               'redfin',
               'roblox',
               'citigroup',
               'docusign',
               'amgen',
               'regeneron',
               'medtronic',
               'microsoft',
               'arcelormittal',
               'mattel',
               'schlumberger',
               'ford',
               'intuit',
               'pepsi',
               'stryker',
               'walmart',
               'hasbro',
               'novartis',
               'toyota',
               'mastercard',
               'fortinet',
               'groupon',
               'autodesk',
               'serviceNow',
               'fireeye',
               'proofpoint',
               'qualys',
               'fastly']

# begin the actual function for NLP
final_sentiment_df = pd.DataFrame(columns=['name', 'compound', 'positive', 'negative', 'neutral'])
company_sentiments = []

# after having run the our NLP function on the list below, I am applying the "#" so it does not run everytime we test the code

#for search_string in search_list:
    #company_headlines = newsapi.get_everything(
        q = search_string, 
        language = "en", 
        page_size = 100,
        sort_by = "relevancy")

    for article in company_headlines["articles"]:
        try: 
            text = article["content"]
            date = article["publishedAt"][:10] 
            sentiment = analyzer.polarity_scores(text)
            compound = sentiment["compound"]
            pos = sentiment["pos"]
            neu = sentiment["neu"]
            neg = sentiment["neg"]
        
            company_sentiments.append({
                "text": text,
                "date": date,
                "compound": compound,
                "positive": pos,
                "negative": neg,
                "neutral": neu})
        except AttributeError:
            pass
            
    sentiment_df = pd.DataFrame(company_sentiments) 
    cols = ["date", "text", "compound", "positive", "negative", "neutral"]
    sentiment_df = sentiment_df[cols]
    sentiment_stats_df = sentiment_df.describe()
    sentiment_stats_df.reset_index(drop=True, inplace=True)
    
    not_final_sentiment_df = pd.DataFrame()
    not_final_sentiment_df["name"] = [search_string]
    not_final_sentiment_df["compound"] = [sentiment_stats_df.loc[1]['compound']]
    not_final_sentiment_df["positive"] = [sentiment_stats_df.loc[1]['positive']]
    not_final_sentiment_df["negative"] = [sentiment_stats_df.loc[1]['negative']]
    not_final_sentiment_df["neutral"] = [sentiment_stats_df.loc[1]['neutral']]
    
    final_sentiment_df = final_sentiment_df.append(not_final_sentiment_df, ignore_index=True)
    
print(final_sentiment_df)

          name  compound  positive  negative   neutral
0     facebook  0.079503  0.054388  0.034112  0.911541
1        apple  0.113186  0.057615  0.030918  0.911482
2        tesla  0.118182  0.057990  0.030068  0.908539
3       google  0.143941  0.062779  0.029588  0.905092
4     palantir  0.153429  0.064429  0.029014  0.904505
..         ...       ...       ...       ...       ...
90  serviceNow  0.183240  0.075081  0.030655  0.894028
91     fireeye  0.180912  0.075055  0.031120  0.893592
92  proofpoint  0.178847  0.075045  0.031633  0.893090
93      qualys  0.177082  0.075070  0.032103  0.892485
94      fastly  0.177527  0.075252  0.032078  0.892332

[95 rows x 5 columns]


In [25]:
# export to csv so we can build a concrete data set to work off of for our model building
# final_sentiment_df.to_csv(r'../Project_2/final_sentiment_df.csv')

In [29]:
# now importing our final data set on which we are going to build the model
filepath = Path("../Project_2/monthly_modeling_df.csv")
stocks_df = pd.read_csv(filepath, parse_dates=True, infer_datetime_format=True)
stocks_df.head()

Unnamed: 0,name,compound,positive,negative,neutral,sector,market cap,target
0,facebook,0.079503,0.054388,0.034112,0.911541,Communication Services,827,1
1,apple,0.113186,0.057615,0.030918,0.911482,Information Technology,2019,0
2,tesla,0.118182,0.05799,0.030068,0.908539,Consumer Discretionary,629,0
3,google,0.143941,0.062779,0.029588,0.905092,Communication Services,1380,1
4,palantir,0.153429,0.064429,0.029014,0.904505,Information Technology,44,1


In [27]:
# BEGIN MODEL BUILD
# Note: our sentiments were pulled as of 3/20/21