# DEMO solution
This notebook serves as a first step in using NLP and machine learning to predict stock movements.
This first iteration focuses on collecting headlines through the newsapi.org API and creating a dataset for sentiment analysis.

## Getting data
Using newsapi.org to get headlines

In [1]:
# Getting API-key

import sys
from pathlib import Path

# The `keys` module is imported from the project root, which is one level
# above this notebook's working directory (<project>/notebooks). Resolving it
# relative to the working directory avoids a user-specific absolute path.
# NOTE: this assumes the kernel is started from the notebooks folder.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from keys import APIkey

# some more imports

import pprint
import requests

In [2]:
# NewsAPI "everything" endpoint
url = 'https://newsapi.org/v2/everything?'

# Search settings for this single exploratory request
search_term = 'microsoft'      # query phrase
window_start = '2023-10-08'    # first day of the search window (YYYY-MM-DD)
window_end = '2023-10-10'      # last day of the search window (YYYY-MM-DD)

# Query-string arguments sent along with the request
parameters = {
    'q': search_term,
    'from': window_start,
    'to': window_end,
    'sortBy': 'publishedAt',   # order results by publication date
    'apiKey': APIkey,          # personal API key imported from keys
}

In [3]:
# try:
#     response = requests.get(url, params=parameters)
#     response.raise_for_status()  # Raise HTTPError for bad requests
#     response_json = response.json()
#     print(response_json)
# except requests.exceptions.HTTPError as errh:
#     print("HTTP Error:", errh)
# except requests.exceptions.ConnectionError as errc:
#     print("Error Connecting:", errc)
# except requests.exceptions.Timeout as errt:
#     print("Timeout Error:", errt)
# except requests.exceptions.RequestException as err:
#     print("OOps: Something Else", err)

In [4]:
# Make the request. The timeout keeps the cell from hanging indefinitely
# if the connection stalls.
response = requests.get(url, params=parameters, timeout=30)

# Stop here on a 4xx/5xx reply; otherwise a payload without an "articles"
# key would only surface below as an uninformative KeyError.
response.raise_for_status()

# Convert the response to JSON format
response_json = response.json()

# Get the list of article dictionaries from the payload
articles = response_json["articles"]

In [5]:
# Number of articles contained in this single response
len(articles)

100

In [6]:
# Read the overall hit count reported in the response payload
total_results = response_json["totalResults"]

# Build the summary line first, then show it
message = f"Total number of results: {total_results}"
print(message)

Total number of results: 2038


In [7]:
# Pull the headline text out of every article dictionary
titles = [article["title"] for article in articles]

In [8]:
#pprint.pprint(response_json['articles'])

#### Pagination loop (one request per day)

In [9]:
# importing datetime
from datetime import datetime, timedelta, date
import time

# bool
flag = True  # NOTE(review): not read anywhere in this cell

# api endpoint
url = 'https://newsapi.org/v2/everything?'

# query params
query = 'microsoft'
start_date = date.today() - timedelta(days=14)  # two weeks back
end_date = date.today()                         # exclusive upper bound of the loop
sortBy = 'publishedAt'
api = APIkey
pagesize = 100

# collected headline titles across all requested days
ms_titles = []

# day currently being requested; advanced by one day per iteration
current_date = start_date

# One request per calendar day: each iteration asks for up to `pagesize`
# articles published on `current_date` and keeps only their titles.
while current_date < end_date:
    # pause between calls so the API is not hammered
    time.sleep(1)

    # serialize the date explicitly as YYYY-MM-DD instead of relying on
    # implicit string conversion of the date object
    day = current_date.isoformat()

    # api params
    parameters = {
        'q': query,            # query phrase
        'from': day,           # start of the window
        'to': day,             # end of the window (same day)
        'sortBy': sortBy,      # order results by publication date
        'pageSize': pagesize,  # number of articles requested per call
        'apiKey': api,         # personal API key
    }

    # call the API; the timeout prevents a stalled connection from blocking
    # the whole loop
    response = requests.get(url, params=parameters, timeout=30)
    # abort on a 4xx/5xx reply instead of failing later with a KeyError
    # on the missing "articles" key
    response.raise_for_status()
    response_json = response.json()

    # get articles
    articles = response_json["articles"]

    # titles returned for this day
    titles_per_page = [article["title"] for article in articles]
    ms_titles.extend(titles_per_page)

    # update while variable
    current_date = current_date + timedelta(days=1)



In [10]:
# Total number of titles gathered by the loop
len(ms_titles)

1400

In [11]:
# Preview the first ten collected titles
print(ms_titles[:10])

["Microsoft's quarterly results and guidance showcase its AI prowess", 'Marketmind: A welcome bounce, but mixed big tech signals', 'Microsoft posts strong results on growing demand for AI services', "Microsoft's AI bets boost cloud business, Alphabet yet to find silver lining", 'Alphabet Revenues Up 11 Percent in Q3 on Stronger Ad Sales', 'Policía internacional detiene a grupo criminal que hackeó a Capcom', 'Cloud revenue miss drags on Alphabet’s stock, despite strong results overall', 'Galaxy Book 4 Ultra and Snapdragon X Elite can be a match made in heaven', 'デル、新開発のAzureハイブリッドクラウド基盤を発表', 'Microsoft รายงานผลประกอบการ เติบโตสูงโดยเฉพาะธุรกิจคลาวด์']


## Trying unsupervised learning models on my small dataset

In [12]:
# start by filtering out non-english headlines

#import
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed before the
# first detection makes the filtered dataset reproducible between runs.
DetectorFactory.seed = 0


def _is_english(title):
    """Return True when langdetect classifies `title` as English.

    Titles that are not strings or are blank, and titles for which
    langdetect raises LangDetectException, are treated as non-English so a
    single unusable headline does not abort the whole filtering step.
    """
    if not isinstance(title, str) or not title.strip():
        return False
    try:
        return detect(title) == "en"
    except LangDetectException:
        return False


headlines = ms_titles
cleaned_headlines = [title for title in headlines if _is_english(title)]

# Summary shown as the cell output. Adjacent f-strings are concatenated, so
# the text no longer depends on trailing spaces inside a multi-line literal.
removed_count = len(headlines) - len(cleaned_headlines)
(
    f"original dataset: {len(headlines)} headlines, "
    f"cleaned dataset: {len(cleaned_headlines)} headlines, "
    f"{removed_count} headlines removed"
)

'original dataset: 1400 headlines, cleaned dataset: 809 headlines, 591 headlines removed'

In [13]:
# VADER sentiment analyzer from nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Running tally of headlines per sentiment class
score_dict = dict.fromkeys(("pos", "neg", "neu"), 0)

for headline in cleaned_headlines:
    sentiment_scores = sid.polarity_scores(headline)
    compound = sentiment_scores["compound"]

    # compound >= 0.1 counts as positive, <= -0.1 as negative,
    # everything in between as neutral
    label = "pos" if compound >= 0.1 else ("neg" if compound <= -0.1 else "neu")
    score_dict[label] += 1

score_dict
    
    #for k in sorted(sentiment_scores):
    #    print('{0}: {1}, '.format(k, sentiment_scores[k]), end='')
    #print()
#sentiment_scores["neu"]

C:\Users\peter\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\peter\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\peter\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
C:\Users\peter\Anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


{'pos': 250, 'neg': 135, 'neu': 424}

In [16]:
#import csv
#import os

# Define the header for your CSV file
#header = ['Title', 'Description', 'Source', 'Published At']

# Specify the file path for your CSV file in the data folder
#csv_file_path = os.path.join('..', 'data', 'news_headlines.csv')

# Open the CSV file in write mode and write the header
#with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
#    csv_writer = csv.writer(csvfile)
#    csv_writer.writerow(header)

#print(f'CSV file "{csv_file_path}" created successfully with header: {header}')


CSV file "..\data\news_headlines.csv" created successfully with header: ['Title', 'Description', 'Source', 'Published At']


In [15]:
import os
# Show the kernel's current working directory
print(os.getcwd())


C:\Users\peter\Documents\NLP-stock-project\notebooks
