# DEMO solution
This notebook serves as first step in using NLP and machine learning to predict stock movements
This first iteration will focus on scraping headlines using newsapi.org API and create a dataset for sentiment analysis

## Getting data
Using newsapi.org to get headlines

In [1]:
# Getting API-key

import sys    

sys.path.append("C:/Users/peter/Documents/NLP-stock-project")

from keys import APIkey

# some more imports

import pprint
import requests 

In [2]:
# define endpoint
url = 'https://newsapi.org/v2/everything?'

# parameters
parameters = {
    'q': 'microsoft', # query phrase
    'from': '2023-10-08',  # Start date (YYYY-MM-DD) - specify your desired date here
    'to': '2023-10-10',    # End date (YYYY-MM-DD) - specify your desired end date
    'sortBy': 'publishedAt', # sort by publishingdate # number of articles (100 max)
    'apiKey': APIkey # your own API key
}

In [3]:
# try:
#     response = requests.get(url, params=parameters)
#     response.raise_for_status()  # Raise HTTPError for bad requests
#     response_json = response.json()
#     print(response_json)
# except requests.exceptions.HTTPError as errh:
#     print("HTTP Error:", errh)
# except requests.exceptions.ConnectionError as errc:
#     print("Error Connecting:", errc)
# except requests.exceptions.Timeout as errt:
#     print("Timeout Error:", errt)
# except requests.exceptions.RequestException as err:
#     print("OOps: Something Else", err)

In [4]:
# Make the request
response = requests.get(url, params=parameters)

# Convert the response to JSON format
response_json = response.json()

# Check out the dictionaries keys
#print(response_json.keys())

# get articles
articles = response_json["articles"]

In [5]:
len(articles)

100

In [6]:
# Get the total number of results
total_results = response_json["totalResults"]

# Print the total number of results
print(f"Total number of results: {total_results}")

Total number of results: 2038


In [7]:
titles = list(map(lambda x: x["title"], articles))

In [8]:
pprint.pprint(response_json['articles'])

[{'author': 'Damien Fisher',
  'content': 'In a major move, EU antitrust regulators seek the opinion of '
             'Microsofts users and rival companies if Bing should fall under '
             'the new tech rules it rolled out. The rules, known as the '
             'Digital Markets A… [+3081 chars]',
  'description': 'In a major move, EU antitrust regulators seek the opinion of '
                 'Microsoft’s users and rival companies if Bing should fall '
                 'under the new tech rules it rolled out. The rules,...\n'
                 'The post EU Antitrust Regulators Seek Input on Tech Giants’ '
                 'Compliance with Digital Ma…',
  'publishedAt': '2023-10-10T23:59:39Z',
  'source': {'id': None, 'name': 'Techreport.com'},
  'title': 'EU Antitrust Regulators Seek Input on Tech Giants’ Compliance with '
           'Digital Markets Act',
  'url': 'https://techreport.com/news/eu-antitrust-regulators-seek-input-on-tech-giants-compliance-with-digital-markets-ac

#### pagination loop

In [9]:
# importing datetime
from datetime import datetime, timedelta, date
import time

# bool
flag = True

# api endpoint
url = 'https://newsapi.org/v2/everything?'

# query params
query = 'microsoft'
start_date = date.today() -timedelta(days=14)
end_date = date.today()
sortBy = 'publishedAt'
api = APIkey
pagesize = 100

# empty list for titles
ms_titles = []

# creating date var for loop
current_date = start_date

while current_date < end_date:
    # to not crash API
    time.sleep(1)
    # api params
    parameters = {
        'q': query, # query phrase
        'from': current_date,  # Start date (YYYY-MM-DD) - specify your desired date here
        'to': current_date,    # End date (YYYY-MM-DD) - specify your desired end date
        'sortBy': sortBy, # sort by publishingdate # number of articles (100 max)
        'pageSize': pagesize,
        'apiKey': api, # your own API key
    }
    
    # calling the api and getting json
    response = requests.get(url, params=parameters)
    response_json = response.json()
    # get articles
    articles = response_json["articles"]
    
    # getting titles
    titles_per_page = map(lambda x: x["title"], articles)
    # extending list
    ms_titles.extend(titles_per_page)
    
    # update while variable
    current_date = current_date + timedelta(days=1)



In [10]:
len(ms_titles)

1400

In [14]:
print(ms_titles[:100])

['Microsoft LinkedIn cuts 668 jobs across engineering, talent and finance', 'Security Must Empower AI Developers Now', 'Xbox, Special Olympics Executives Talk Ongoing Partnership, Inclusivity In Gaming, More In New Interview', 'What’s New and Familiar in Windows 11 Version 23H2', 'AI model training rekindles interest in on-premises infrastructure', 'LinkedIn Cuts 668 Jobs in Second Layoff Round This Year - The New York Times', 'Israel-Hamas Conflict Spills Over in Newsroom Shouting Matches', '株式会社アイビステレワーク時代に対応 社内情報の漏えいを防御\u3000画期的ストレージサービスをアイビスがリリース', 'LinkedIn cuts more than 600 workers, about 3% of workforce', 'LinkedIn Cuts 668 Jobs in Second Layoff Round This Year', 'LinkedIn to cut 668 jobs in second round of layoffs this year - The Guardian', 'How Birkenstock’s lackluster debut mistimed the shaky IPO market', '定番音楽プレイヤー「foobar2000」次期バージョンv2.1が「Microsoft Store」に登場／プレビュー扱い、32bitアドオンが動作しなくなるので注意', 'Windows launching Arm Advisory Service for developers', 'Tori Spelling Sends a Messa

## Trying unsupervised learning models on my small dataset

In [20]:
# start by filtering out non-english headlines

#import
from langdetect import detect
    
headlines = ms_titles
cleaned_headlines = [title for title in headlines if detect(title) == "en"]

"""original dataset: {0} headlines, 
cleaned dataset: {1} headlines, {2} 
headlines removed""".format(len(headlines),
                               len(cleaned_headlines),
                               len(headlines) - len(cleaned_headlines)
                               ).replace("\n", "")

'original dataset: 1400 headlines, cleaned dataset: 858 headlines, 542 headlines removed'

In [45]:
# import nltk.sentiments.vader.SentimentIntensityAnalyzer

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

score_dict = {"pos": 0, "neg": 0, "neu": 0}

for headline in cleaned_headlines:
    sentiment_scores = sid.polarity_scores(headline)
    
    #max_sentiment = max(sentiment_scores, key=lambda k: sentiment_scores[k])
    
    if sentiment_scores["compound"] >= 0.1:
        score_dict["pos"] += 1
    elif sentiment_scores["compound"] <= -0.1:
        score_dict["neg"] += 1
    else:
        score_dict["neu"] += 1
        
score_dict
    
    #for k in sorted(sentiment_scores):
    #    print('{0}: {1}, '.format(k, sentiment_scores[k]), end='')
    #print()
#sentiment_scores["neu"]

{'pos': 256, 'neg': 136, 'neu': 466}