Fetching data from marketaux API (https://www.marketaux.com/documentation)


- Change or add to the `symbols` list to fetch news for specific companies.

In [1]:
import os
import requests
import json

def fetch_market_data_and_save(symbols, save_folder, language='en', filter_entities=True, api_token=None):
    """Fetch news for the given symbols from the marketaux API and save it as JSON.

    Sends a GET request to the ``/v1/news/all`` endpoint. On HTTP 200 the parsed
    response is written to ``<save_folder>/new_data.json`` and returned.

    Args:
        symbols: Iterable of ticker symbols; a single ticker may also be passed
            as a plain string.
        save_folder: Directory for ``new_data.json``; created if it is missing.
        language: Value sent as the ``language`` query parameter.
        filter_entities: Sent as the lowercase string ``'true'`` / ``'false'``.
        api_token: Value sent as the ``api_token`` query parameter.

    Returns:
        The parsed JSON response on HTTP 200, otherwise ``None`` (the status
        code and response text are printed).
    """
    base_url = 'https://api.marketaux.com/v1/news/all'

    # ','.join on a bare string would split it into characters ('AAPL' -> 'A,A,P,L').
    if isinstance(symbols, str):
        symbols = [symbols]

    params = {
        'symbols': ','.join(symbols),
        'language': language,
        'filter_entities': str(filter_entities).lower(),
        'api_token': api_token,
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(base_url, params=params, timeout=30)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    data = response.json()

    # Make sure the target folder exists before opening the file for writing.
    os.makedirs(save_folder, exist_ok=True)
    save_path = os.path.join(save_folder, 'new_data.json')
    with open(save_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2)
    print(f"Data saved successfully to {save_path}")
    return data

# Tickers to query, grouped by sector for readability.
symbols = [
    'AAPL', 'GOOGL', 'AMZN', 'MSFT', 'TSLA',  # Large Tech Companies
    'JPM', 'GS', 'C', 'BAC', 'WFC',  # Major Banks
    'V', 'MA', 'PYPL', 'AXP', 'SQ',  # Payment and Financial Services
    'KO', 'PEP', 'MCD', 'SBUX', 'NKE',  # Consumer Goods and Retail
    'XOM', 'CVX', 'BP', 'TOT', 'RDS.A',  # Energy Companies
    'BA', 'LMT', 'RTX', 'NOC', 'GD',  # Aerospace and Defense
    'IBM', 'INTC', 'AMD', 'NVDA', 'QCOM',  # Semiconductors
    'DIS', 'NFLX', 'CMCSA', 'FOXA', 'DISCA',  # Media and Entertainment
    'VZ', 'T', 'TMUS', 'S', 'CHTR',  # Telecommunications
    'WMT', 'COST', 'AMT', 'PLD', 'SPG',  # Real Estate and Retail
    'PG', 'UNH', 'MRK', 'PFE', 'JNJ',  # Healthcare and Pharmaceuticals
    'GS', 'BLK', 'BLKB', 'PNC', 'MS',  # Financial Services
    'AAL', 'DAL', 'UAL', 'LUV', 'SAVE',  # Airlines and Transportation
    'MMM', 'CAT', 'DE', 'BA', 'HON',  # Industrial and Manufacturing
]
# 'GS' and 'BA' appear under two sectors; drop the repeats while keeping first-seen order.
symbols = list(dict.fromkeys(symbols))

# NOTE(review): machine-specific absolute path; the later cells read from this same folder.
save_folder = '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data'

# Read the token from the environment so it is never committed with the notebook.
# The token that used to be hardcoded in this cell should be treated as leaked and revoked.
api_token = os.environ.get('MARKETAUX_API_TOKEN')
if not api_token:
    raise RuntimeError("Set the MARKETAUX_API_TOKEN environment variable before running this cell.")

result = fetch_market_data_and_save(symbols, save_folder, api_token=api_token)

# Print a short summary instead of dumping the whole response into the cell output.
if result:
    print(f"Fetched {len(result.get('data', []))} articles")


Data saved successfully to /Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.json


The API response is saved as a JSON file. The next cell converts it to CSV.

In [2]:
# Flatten the saved JSON response into a table and write it out as a CSV file
import json
import pandas as pd

json_file_path = '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.json'
csv_file_path = '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.csv'

with open(json_file_path, 'r') as fh:
    data = json.load(fh)

# The records sit under the top-level 'data' key; use an empty list when it is absent
data_list = data['data'] if 'data' in data else []

# json_normalize expands nested dict fields into their own columns
df = pd.json_normalize(data_list)
df.to_csv(csv_file_path, index=False)

In [3]:
# Reload the converted CSV from disk and preview its first five rows
import pandas as pd

df = pd.read_csv(
    '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.csv'
)
df.head(5)

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,relevance_score,entities,similar
0,a675c3d6-08fe-4253-bde9-285c29a05442,13 Best Nancy Pelosi Stocks To Buy Now,"In this article, we will take a look at the 13...",,"In this article, we will take a look at the 13...",https://finance.yahoo.com/news/13-best-nancy-p...,https://media.zenfs.com/en/insidermonkey.com/7...,en,2023-11-27T17:08:29.000000Z,finance.yahoo.com,,"[{'symbol': 'PYPL', 'name': 'PayPal Holdings, ...",[]
1,4e4521ff-2e47-416e-8617-81678485ec10,Pro Research: Wall Street dives into Enphase E...,Pro Research: Wall Street dives into Enphase E...,,"Published Nov 27, 2023 12:05PM ET\n\n© Reuters...",https://www.investing.com/news/stock-market-ne...,https://i-invdn-com.investing.com/news/LYNXNPE...,en,2023-11-27T17:05:42.000000Z,investing.com,,"[{'symbol': 'TSLA', 'name': 'Tesla, Inc.', 'ex...",[{'uuid': 'b08f7a42-31ee-461f-a98c-1a501009366...
2,4261b03e-601f-4a3b-8816-d54845c21a85,Pro Research: Wall Street eyes Alphabet's robu...,Pro Research: Wall Street eyes Alphabet's robu...,,"Published Nov 27, 2023 12:03PM ET\n\n© Reuters...",https://www.investing.com/news/stock-market-ne...,https://i-invdn-com.investing.com/news/LYNXMPE...,en,2023-11-27T17:03:16.000000Z,investing.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[]


In [4]:
# Show the description text of the first article (row label 0)
df['description'][0]



The `description` feature is the text we can run the sentiment analysis on.

In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(3, 13)

In [6]:
# List the column names available in the loaded frame
df.columns

Index(['uuid', 'title', 'description', 'keywords', 'snippet', 'url',
       'image_url', 'language', 'published_at', 'source', 'relevance_score',
       'entities', 'similar'],
      dtype='object')

Pre-process the data before feeding it to the BERT model.

In [7]:
# Keep only the desired column (description = text) so it can be merged with the database data
import pandas as pd

selected_columns = ['description']
df = pd.read_csv(
    '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.csv'
).loc[:, selected_columns]

# Note: this overwrites new_data.csv with the single-column version
df.to_csv('/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.csv', index=False)
df.head()

Unnamed: 0,description
0,"In this article, we will take a look at the 13..."
1,Pro Research: Wall Street dives into Enphase E...
2,Pro Research: Wall Street eyes Alphabet's robu...


In [8]:
# Add an empty 'labels' column to df; it is filled in by hand in the next cell
df['labels'] = [None] * len(df)
df.head()

Unnamed: 0,description,labels
0,"In this article, we will take a look at the 13...",
1,Pro Research: Wall Street dives into Enphase E...,
2,Pro Research: Wall Street eyes Alphabet's robu...,


In [9]:
# Assign the hand-picked sentiment label to each article (rows 0, 1 and 2).
# A single .loc assignment replaces the chained form df['labels'][i] = ..., which raises
# SettingWithCopyWarning and does not modify df when pandas Copy-on-Write is enabled.
# NOTE(review): these labels belong to the three articles shown in the recorded output;
# presumably a fresh API call returns different articles, so re-check them after re-fetching.
df.loc[[0, 1, 2], 'labels'] = ['positive', 'neutral', 'positive']

df.to_csv('/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/new_data.csv', index=False)
df.head()

Unnamed: 0,description,labels
0,"In this article, we will take a look at the 13...",positive
1,Pro Research: Wall Street dives into Enphase E...,neutral
2,Pro Research: Wall Street eyes Alphabet's robu...,positive


In [11]:
# Load the existing labelled dataset that the API rows will be merged into.
# NOTE(review): this reads from .../Model_US/data/ while the last cell writes API_data.csv
# to .../Model_US/API-data/ - confirm which copy is meant to be the input here.
df_all = pd.read_csv(
    '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/data/API_data.csv',
    encoding='latin1',
)
df_all.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [12]:
# Name the columns of the database data so they match the API dataset for the merge
column_names = ['labels', 'description']
df_all.columns = column_names
df_all.head()

Unnamed: 0,labels,description
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [13]:
# Merge the database data with the API data so the model generalizes better
# when it is presented with lower-quality text.
import pandas as pd

key_cols = ['labels', 'description']

# Append only the API rows whose (label, description) pair is not already in df_all.
# pd.merge(..., how='outer') is avoided here: it sorts the join keys lexicographically
# on current pandas, which reorders the dataset and moves the API rows away from the end.
existing_pairs = set(zip(df_all['labels'], df_all['description']))
is_new = [pair not in existing_pairs for pair in zip(df['labels'], df['description'])]

# Select the columns explicitly so the output is always ordered labels, description.
merged_df = pd.concat([df_all[key_cols], df.loc[is_new, key_cols]], ignore_index=True)
merged_df

Unnamed: 0,labels,description
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4846,negative,The cryptocurrency Ripple (XRP) aims to provid...
4847,neutral,The Investing Club holds its Morning Meeting e...
4848,positive,"In this article, we will take a look at the 13..."
4849,neutral,Pro Research: Wall Street dives into Enphase E...


In [15]:
# Check that the API data was added: it is expected in the last rows of merged_df.
# Six rows are shown, so the variable is no longer called "last_three_rows".
last_rows = merged_df.tail(6)
print(last_rows)

        labels                                        description
4845  negative  NEW YORK, Nov.  21, 2023  (GLOBE NEWSWIRE) -- ...
4846  negative  The cryptocurrency Ripple (XRP) aims to provid...
4847   neutral  The Investing Club holds its Morning Meeting e...
4848  positive  In this article, we will take a look at the 13...
4849   neutral  Pro Research: Wall Street dives into Enphase E...
4850  positive  Pro Research: Wall Street eyes Alphabet's robu...


In [16]:
# Write the merged data with blank header names so the model's preprocessing can add
# its own column names later. The blank names are applied only to the CSV header
# (via `header=`), so merged_df keeps its real column names and earlier cells stay re-runnable.
merged_df.to_csv(
    '/Users/mariaeusse/MEH/Clases/Fall-2023/Ds-ML/Final_Project/Model_US/API-data/API_data.csv',
    index=False,
    header=['', ''],
)

# Last expression of the cell, so the preview is actually displayed.
merged_df.head()