In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
import time

In [10]:
def get_stock_news_headlines(ticker, news_count=800, sleep=2):
    """
    Function that scrapes stock news headlines from NASDAQ. -> https://www.nasdaq.com/market-activity/stocks/aapl/news-headlines
    Headlines are requested in pages of 8 from the api.nasdaq.com endpoint.
    Scraping stops early when the API returns no more rows, so fewer than
    news_count headlines may be returned.


    ticker: str -> stock symbol (lower-cased before it is put into the query)
    news_count: int -> maximum number of headlines to collect
    sleep: int -> seconds to wait after every API call

    returns: list[dict] -> one dict per headline with the keys
             "title", "uploaded", "url" and "timestamp" (time of scraping)
    """
    _limit = 8
    _target = int(news_count)
    # Ceiling division: a news_count that is not a multiple of the page size
    # still gets its last (partial) page instead of being silently cut short.
    _pages = max(0, -(-_target // _limit))
    _headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0'}

    news = []

    for page in range(_pages):
        _offset = page * _limit
        api_call = f"https://api.nasdaq.com/api/news/topic/articlebysymbol?q={ticker.lower()}|stocks&offset={_offset}&limit={_limit}&fallback=false"

        # A timeout keeps a stalled connection from hanging the scraper forever;
        # raise_for_status surfaces HTTP errors instead of a confusing JSON error.
        response = requests.get(api_call, headers=_headers, timeout=30)
        response.raise_for_status()
        payload = response.json() or {}

        # "data" / "rows" may be missing or null once the available news is exhausted.
        rows = (payload.get("data") or {}).get("rows") or []
        if not rows:
            break

        # Never collect more than the caller asked for (matters on the last page).
        for row in rows[: _target - len(news)]:
            news.append(
                {
                    "title": row["title"],
                    "uploaded": row["ago"],
                    "url": row["url"],
                    "timestamp": datetime.now().strftime("%d.%m.%Y;%H:%M:%S"),
                }
            )

        print(f"No. of scraped news: {len(news)}")
        print("______________________________________________")
        print(f"API call -> {api_call}...")
        time.sleep(sleep)

    return news


def convert_scraped_news_to_df(news):
    """
    Function that converts scraped news into a DataFrame (one row per headline).

    news: list[dict]

    returns: pd.DataFrame
    """
    # The class is spelled DataFrame; "pd.Dataframe" raises AttributeError.
    return pd.DataFrame(news)


def export_news_data(news_dataframe, out_file="output.csv"):
    """
    Function that exports a news DataFrame to disk.
    A file name ending in ".xlsx" (any letter case) is written as an Excel
    workbook with a single sheet named "news"; anything else is written as CSV.

    news_dataframe: pd.DataFrame
    out_file: str

    returns: pd.DataFrame -> the same DataFrame that was exported
    """
    df = news_dataframe
    _file_extension = out_file.rsplit(".", 1)[-1].lower()

    if _file_extension == "xlsx":
        df.to_excel(out_file, sheet_name="news")
    else:
        df.to_csv(out_file)

    # Hand back the exported frame; the previous code referenced an undefined
    # name ("news") here and raised NameError after the file was written.
    return df


def generate_correct_upload_date_values(news_dataframe):
    """
    This function transforms upload times (1 day ago, 2 days ago, etc) to correct dates.

    Rules applied to the "uploaded" column:
      * "... min(s)/minute(s)/hour(s) ago" -> the scraping date
      * "N day(s) ago"                     -> the scraping date minus N days
      * anything else                      -> parsed as a date; NaT if it cannot be parsed

    The scraping date is the date part of the "timestamp" column, which is
    expected in the "dd.mm.YYYY;HH:MM:SS" format.
    The input DataFrame is not modified.

    news_dataframe: pd.DataFrame -> needs the columns "uploaded" and "timestamp"

    returns: pd.DataFrame -> copy with a new "upload_date" column and without
             the "timestamp" and "uploaded" columns
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = news_dataframe.copy()

    # An explicit day-first format avoids 05.12.2022 being read as May 12th.
    scraped_on = pd.to_datetime(
        df["timestamp"].str.split(";").str[0], format="%d.%m.%Y"
    )

    uploaded = df["uploaded"].fillna("").astype(str).str.strip()

    # Minutes / hours ago (singular or plural) -> published on the scraping day.
    same_day = uploaded.str.contains(r"(?i)(?:min|minute|hour|hr)s?\s+ago\s*$")

    # "N day(s) ago": rows without a match stay NaN instead of breaking an int cast.
    days_back = pd.to_numeric(
        uploaded.str.extract(r"(?i)(\d+)\s*days?\s+ago\s*$", expand=False),
        errors="coerce",
    )
    days_ago = days_back.notna()

    # Only non-relative values are parsed as absolute dates.
    absolute = ~(same_day | days_ago)
    upload_date = pd.to_datetime(uploaded.where(absolute), errors="coerce")
    upload_date = upload_date.mask(same_day, scraped_on)
    upload_date = upload_date.mask(
        days_ago, scraped_on - pd.to_timedelta(days_back, unit="D")
    )

    df["upload_date"] = upload_date
    df = df.drop(columns=["timestamp", "uploaded"])

    return df