In [11]:
#Importing necessary libraries
from dotenv import load_dotenv
from datetime import datetime, timedelta
import requests
import os
import time
import pandas as pd 
from SML.news_preprocessing import process_news_articles    #Importing everything from 'news_preprocessing'
from SML.news_preprocessing import exponential_moving_average
load_dotenv()

True

In [2]:
#Defining a function for fetching news

def fetch_news(api_key, ticker, start_date, end_date):
    base_url = os.environ.get("endpointnewsp")
    headers = {"Authorization": f"Bearer {api_key}"}
    all_news = []
    
    current_date = start_date

    while current_date <= end_date:
        batch_end_date = current_date + timedelta(days=50)
        if batch_end_date > end_date:
            batch_end_date = end_date

        params = {
            "ticker": ticker,
            "published_utc.gte": current_date.strftime('%Y-%m-%d'),
            "published_utc.lte": batch_end_date.strftime('%Y-%m-%d'),
            "limit": 50,
            "sort": "published_utc"
        }

        try:
            response = requests.get(base_url, headers=headers, params=params)
            if response.status_code == 200:
                data = response.json()
                articles = data.get('results', [])
                
                # Creating a DataFrame from articles
                df = pd.DataFrame(articles)
                
                # Adding primary_key column if ticker is found
                df['ticker'] = df['tickers'].apply(lambda x: ticker if ticker in x else None)
                
                all_news.append(df)  # Append DataFrame to the list
                print(f"Fetched {len(articles)} articles from {current_date.strftime('%Y-%m-%d')} to {batch_end_date.strftime('%Y-%m-%d')}")
                current_date = batch_end_date + timedelta(days=1)
            elif response.status_code == 429:
                print("Rate limit reached. Waiting to retry...")
                time.sleep(60)  # Wait for 60 seconds or as recommended by the API
                continue  # Retry the current request
            else:
                print(f"Failed to fetch data: {response.status_code}, {response.text}")
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    return pd.concat(all_news, ignore_index=True)

#Usage
api_key = os.environ.get('newsp_api')
ticker = 'TSLA'
end_date = datetime.now() - timedelta(days=1)  # Yesterday's date
start_date = end_date - timedelta(days=365 * 2)
news_articles = fetch_news(api_key, ticker, start_date, end_date)
print(f"Total articles fetched: {len(news_articles)}")


Fetched 50 articles from 2022-05-09 to 2022-06-28
Fetched 50 articles from 2022-06-29 to 2022-08-18
Fetched 50 articles from 2022-08-19 to 2022-10-08
Fetched 50 articles from 2022-10-09 to 2022-11-28
Fetched 50 articles from 2022-11-29 to 2023-01-18
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-01-19 to 2023-03-10
Fetched 50 articles from 2023-03-11 to 2023-04-30
Fetched 50 articles from 2023-05-01 to 2023-06-20
Fetched 50 articles from 2023-06-21 to 2023-08-10
Fetched 50 articles from 2023-08-11 to 2023-09-30
Rate limit reached. Waiting to retry...
Fetched 50 articles from 2023-10-01 to 2023-11-20
Fetched 50 articles from 2023-11-21 to 2024-01-10
Fetched 50 articles from 2024-01-11 to 2024-03-01
Fetched 50 articles from 2024-03-02 to 2024-04-21
Fetched 50 articles from 2024-04-22 to 2024-05-08
Total articles fetched: 750


In [6]:
# Process the news articles
df = process_news_articles(news_articles)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       75 non-null     object 
 1   ticker     75 non-null     object 
 2   sentiment  75 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.9+ KB


In [8]:
df.head()

Unnamed: 0,date,ticker,sentiment
0,2022-06-24,TSLA,-0.084224
1,2022-06-25,TSLA,0.25
2,2022-06-26,TSLA,0.000556
3,2022-06-27,TSLA,0.150126
4,2022-06-28,TSLA,0.126298


In [9]:
df= df.sort_index(ascending=False)

In [10]:
#Putting the news articles into a csv
df.to_csv('news_articles.csv', index=False)

In [12]:
df_processed = exponential_moving_average(df, window=7)

In [13]:
df_processed.to_csv('news_articles_ema.csv', index=False)

In [14]:
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
74,2024-05-08,TSLA,0.010694,0.010694
73,2024-05-07,TSLA,0.032778,0.016215
72,2024-05-06,TSLA,0.152492,0.050285
71,2024-05-05,TSLA,0.03619,0.046761
70,2024-05-04,TSLA,0.062665,0.050737


In [15]:
df_processed.tail()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
4,2022-06-28,TSLA,0.126298,0.079978
3,2022-06-27,TSLA,0.150126,0.097515
2,2022-06-26,TSLA,0.000556,0.073275
1,2022-06-25,TSLA,0.25,0.117456
0,2022-06-24,TSLA,-0.084224,0.067036


In [16]:
print(df_processed['date'].min())
print(df_processed['date'].max())

2022-06-24
2024-05-08


In [17]:
print(df_processed['date'].max() - df_processed['date'].min()) 

684 days, 0:00:00


In [18]:
df_processed.shape

(75, 4)

In [19]:
duplicates = df_processed[df_processed.duplicated('date')]

In [20]:
duplicates.shape

(0, 4)

In [21]:
df_processed.head()

Unnamed: 0,date,ticker,sentiment,exp_mean_7_days
74,2024-05-08,TSLA,0.010694,0.010694
73,2024-05-07,TSLA,0.032778,0.016215
72,2024-05-06,TSLA,0.152492,0.050285
71,2024-05-05,TSLA,0.03619,0.046761
70,2024-05-04,TSLA,0.062665,0.050737
