<a href="https://colab.research.google.com/github/nprimavera/Python/blob/main/Machine_Learning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Project

In [None]:
# Machine Learning Project

Ideas:
- Aerospace
- Sports
- Stock Market

Steps:
1. Build a pipeline and prepare the data
2. Train the data (use a Neural Network)
3. Evaluate the model
4. Automatically retrain the model to improve it

In [None]:
# Basic libraries
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

from typing import List
from typing import Tuple
from typing import Union
from typing import Callable
from typing import Dict
from sklearn.linear_model import LinearRegression
from scipy.stats import binom
from scipy.stats import norm
from scipy.stats import ttest_ind
from tqdm import tqdm
from urllib.error import URLError
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import cm

sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'grid.linestyle':'--'})

# Stock Market - Importing data using Yahoo Finance to get Historic Prices of Stocks

In [None]:
# Importing data using Yahoo Finance

In [None]:
import urllib.request
import json
import time
import os
import difflib
import itertools
import pandas as pd

from multiprocessing.dummy import Pool
from datetime import datetime

try:
    import httplib
except:
    import http.client as httplib

In [None]:
# Check internet connection

In [None]:
def check_internet():                                               # define the check internet function
    conn = httplib.HTTPConnection("www.google.com", timeout=5)      # creates an HTTP connection to "www.google.com" with a timeout of 5 seconds
    try:
        conn.request("HEAD", "/")                                   # sends an HTTP HEAD request to the root ("/") of the Google website
                                                                    # the HEAD request is used here because it retrieves only the headers, not the entire content, which makes it faster
        conn.close()                                                # closes the connection
        return True                                                 # print("True") - there is an active internet connection
    except:
        conn.close()                                                # if there is an exception it closes the connection
        return False                                                # print("False") - there is no active internet connection

In [None]:
# Create a function which will get_historic_price for given query_url
# It will save the stock data as json and csv inside a folder named "historic_data"

In [None]:
def get_historic_price(query_url, json_path, csv_path):               # download historical stock price data from a specified query_url, process the data, and save it as both a JSON file and a CSV file

    stock_id = query_url.split("&period")[0].split("symbol=")[1]      # extracts the stock symbol from the query_url

    if os.path.exists(csv_path+stock_id+'.csv') and os.stat(csv_path+stock_id+'.csv').st_size != 0: # checks if a CSV file with the historical data for the stock already exists in the specified csv_path and is not empty
        print("<<<  Historical data of "+stock_id+" already exists")  # if it exists, the function prints a message and returns early, indicating that the historical data already exists
        return

    while not check_internet():                                       # wait until there is an active internet connection by repeatedly checking with the check_internet() function
        print("Could not connect, trying again in 5 seconds...")      # print while checking connection
        time.sleep(5)                                                 # waits 5 seconds

    try:
        with urllib.request.urlopen(query_url) as url:                # tries to open the specified query_url and parse the JSON data
            parsed = json.loads(url.read().decode())                  # if successful, the data is stored in the parsed variable

    except:
        print("|||  Historical data of "+stock_id+" doesn't exist")   # prints if no file exists it
        return

    else:
        if os.path.exists(json_path+stock_id+'.json') and os.stat(json_path+stock_id+'.json').st_size != 0:   # # checks if a JSON file for the stock already exists in the specified json_path and is not empty
            os.remove(json_path+stock_id+'.json')                     # if it exists, it is deleted

        with open(json_path+stock_id+'.json', 'w') as outfile:
            json.dump(parsed, outfile, indent=4)                      # writes the parsed JSON data to a JSON file in the specified json_path.

        try:                                                          # attempts to extract relevant information from the parsed JSON data (such as Date, Low, Open, Volume, High, Close, and Adjusted Close)

            Date = []                                                                                 # Date
            for i in parsed['chart']['result'][0]['timestamp']:
                Date.append(datetime.utcfromtimestamp(int(i)).strftime('%d-%m-%Y'))

            Low = parsed['chart']['result'][0]['indicators']['quote'][0]['low']                       # Low
            Open = parsed['chart']['result'][0]['indicators']['quote'][0]['open']                     # Open
            Volume = parsed['chart']['result'][0]['indicators']['quote'][0]['volume']                 # Volume
            High = parsed['chart']['result'][0]['indicators']['quote'][0]['high']                     # High
            Close = parsed['chart']['result'][0]['indicators']['quote'][0]['close']                   # Close
            Adjusted_Close = parsed['chart']['result'][0]['indicators']['adjclose'][0]['adjclose']    # Adjusted Close

            # creates a pandas DataFrame (df) with this information
            df = pd.DataFrame(list(zip(Date,Low,Open,Volume,High,Close,Adjusted_Close)),columns =['Date','Low','Open','Volume','High','Close','Adjusted Close'])

            if os.path.exists(csv_path+stock_id+'.csv'):               # checks if a CSV file for the stock already exists in the specified csv_path
                os.remove(csv_path+stock_id+'.csv')                    # deletes it if it does
            df.to_csv(csv_path+stock_id+'.csv', sep=',', index=None)   # writes the DataFrame to a CSV file in the specified csv_path
            print(">>>  Historical data of "+stock_id+" saved")        # prints if historical data is saved

        except:
            print(">>>  Historical data of "+stock_id+" could not be saved")  # prints if historical data could not be saved

        return

In [None]:
# Set where the json and csv files will be saved which have been passed to the function get_historic_price()

- os.getcwd(): Gets the current working directory.
- os.sep: Represents the separator used in the file system paths. On Windows,it is '\' (backslash), and on Unix-like systems (including Linux and macOS), it is '/' (forward slash).
- "..": Represents the parent directory.
- "historic_data": Represents a directory named "historic_data" within the parent directory.
- "json" and "csv": Represent subdirectories within the "historic_data" directory, where JSON and CSV files will be stored, respectively.

In [None]:
json_path = os.getcwd()+os.sep+".."+os.sep+"historic_data"+os.sep+"json"+os.sep   # full path to the directory where JSON files will be stored
csv_path = os.getcwd()+os.sep+".."+os.sep+"historic_data"+os.sep+"csv"+os.sep     # full path to the directory where CSV files will be stored

In [None]:
json_path  # see which directory we are in

'/content/../historic_data/json/'

In [None]:
csv_path  # see which directory we are in

'/content/../historic_data/csv/'

In [None]:
# Check if the above directories exist, if not, then we will use os.mkdir - use an if statement to create/make them

In [None]:
# check JSON path
if not os.path.isdir(json_path):  # if the condition is true, it means the directory is not present
    os.makedirs(json_path)        # creates path if it doesnt exist

# check CSV path
if not os.path.isdir(csv_path):   # if the condition is true, it means the directory is not present
    os.makedirs(csv_path)         # creates path if it doesnt exist

In [None]:
# check
print({json_path})
print({csv_path})

{'/content/../historic_data/json/'}
{'/content/../historic_data/csv/'}


In [None]:
# Create the funciton to shrink the ticker list.

In [None]:
print("Current working directory:", os.getcwd())      # See which directory we are in to know how to call the file path

Current working directory: /content


In [None]:
ticker_file_path = "/content/Yahoo Ticker Symbols - September 2017.xlsx"      # define the file path for the excel file
temp_df = pd.read_excel(ticker_file_path)                                     # uses pandas to read the excel file into a dataframe
print("Total stocks:", len(temp_df))                                          # print number of total stocks by printing the amount of rows
temp_df.head(10)                                                              # print the first 10 rows

Total stocks: 106331


Unnamed: 0,Yahoo Stock Tickers,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,http://investexcel.net,,,,,,,
1,,,,,,,,
2,Ticker,Name,Exchange,Category Name,Country,,,
3,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA,,,Samir Khan
4,AAPL,Apple Inc.,NMS,Electronic Equipment,USA,,,simulationconsultant@gmail.com
5,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA,,,
6,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA,,,This ticker symbol list was downloaded from
7,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA,,,http://investexcel.net/all-yahoo-finance-stock...
8,GOOG,Alphabet Inc.,NMS,Internet Information Providers,USA,,,and was updated on 2nd September 2017
9,MO,"Altria Group, Inc.",NYQ,Cigarettes,USA,,,


In [None]:
# Refine the data to organize it

In [None]:
temp_df = temp_df.drop(temp_df.columns[[5, 6, 7]], axis=1)  # drops columns at indices 5, 6, and 7 from the DataFrame temp_df, and axis=1 argument indicates that columns are being dropped
headers = temp_df.iloc[2]                                   # extracts the third row of the modified DataFrame (temp_df) and assigns it to the variable headers
df = pd.DataFrame(temp_df.values[3:], columns=headers)      # creates a new DataFrame (df) using the values from the rows starting from index 3 of the modified temp_df. The columns=headers argument assigns the extracted headers to the DataFrame
print("Total stocks:",len(df))                              # prints the amount of rows
df.head(10)                                                 # print the first 10 rows

Total stocks: 106328


2,Ticker,Name,Exchange,Category Name,Country
0,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA
1,AAPL,Apple Inc.,NMS,Electronic Equipment,USA
2,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA
3,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA
4,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA
5,GOOG,Alphabet Inc.,NMS,Internet Information Providers,USA
6,MO,"Altria Group, Inc.",NYQ,Cigarettes,USA
7,DAL,"Delta Air Lines, Inc.",NYQ,Major Airlines,USA
8,AA,Alcoa Corporation,NYQ,Aluminum,USA
9,AXP,American Express Company,NYQ,Credit Services,USA


In [None]:
# Create query urls for the stock tickers
# This will bring the query pages, where yahoo finance holds it's historical stock data

In [None]:
query_urls=[]                 # initializes an empty list called query_urls to store the constructed query URLs
for ticker in df['Ticker']:   # iterates through each value in the 'Ticker' column of the DataFrame df
    # constructs a query url for each stock ticker
    query_urls.append("https://query1.finance.yahoo.com/v8/finance/chart/"+ticker+"?symbol="+ticker+"&period1=0&period2=9999999999&interval=1d&includePrePost=true&events=div%2Csplit")

In [None]:
# Get stock data with multithreading/multiprocessing

In [26]:
# creates a pool of worker processes using the Pool class from the multiprocessing module
with Pool(processes=10) as pool: # the processes=10 argument specifies that there should be 10 parallel processes
    # uses the starmap method of the pool to apply the get_historic_price function to each tuple of arguments in parallel
    # the arguments are obtained by zipping together the query_urls list and repeating json_path and csv_path for each URL using itertools.repeat
    # this parallelization allows for faster execution of the function for multiple stocks.
    pool.starmap(get_historic_price, zip(query_urls, itertools.repeat(json_path), itertools.repeat(csv_path)))
print("<|>  Historical data of all stocks saved") # prints upon completion

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|||  Historical data of RLF.BO doesn't exist
|||  Historical data of MEHSECU.BO doesn't exist
|||  Historical data of LAXMIMACH.BO doesn't exist
|||  Historical data of PEKB.BE doesn't exist
|||  Historical data of 2ZC.F doesn't exist
|||  Historical data of NICO-BTA.ST doesn't exist
|||  Historical data of 9939.TW doesn't exist
|||  Historical data of KIY.F doesn't exist
|||  Historical data of 21712619.SW doesn't exist
|||  Historical data of SOW.DE doesn't exist
|||  Historical data of MED.DE doesn't exist
|||  Historical data of RLGT-PA.A doesn't exist
|||  Historical data of WHRL3.SA doesn't exist
|||  Historical data of PEOP doesn't exist
|||  Historical data of C08.BE doesn't exist
|||  Historical data of 3317.TWO doesn't exist
|||  Historical data of 9937.TW doesn't exist
|||  Historical data of 57A809.KS doesn't exist|||  Historical data of PROZONINTU.NS doesn't exist

|||  Historical data of SW5.DU doesn't exist

In summary, the code is designed to overwrite existing JSON and CSV files for each stock, but it doesn't explicitly delete all data before saving. It checks and removes existing files on a per-stock basis. If there is any existing data for a stock, the code overwrites it with the new data.

# TESTING

In [28]:
# Use the previously defined csv_path
csv_path_example = os.getcwd() + os.sep + ".." + os.sep + "historic_data" + os.sep + "csv" + os.sep

# Example usage for stock symbol "AAPL"
stock_symbol_example = "AAPL"
view_historical_data(stock_symbol_example, csv_path_example)

NameError: name 'view_historical_data' is not defined

In [29]:
import requests

def get_latest_stock_data(api_key, stock_symbol):
    base_url = "https://www.alphavantage.co/query"

    # Define the API parameters for the time series intraday request
    params = {
        "function": "TIME_SERIES_INTRADAY",
        "symbol": stock_symbol,
        "interval": "1min",  # Adjust the interval as needed (e.g., "5min", "15min", "1d")
        "apikey": api_key,
    }

    try:
        # Make the API request
        response = requests.get(base_url, params=params)
        data = response.json()

        # Extract the most recent stock data
        latest_data = data["Time Series (1min)"]
        latest_timestamp = max(latest_data.keys())
        latest_stock_info = latest_data[latest_timestamp]

        # Display the latest stock information
        print(f"Latest data for {stock_symbol} at {latest_timestamp}:")
        print(f"Open: {latest_stock_info['1. open']}")
        print(f"High: {latest_stock_info['2. high']}")
        print(f"Low: {latest_stock_info['3. low']}")
        print(f"Close: {latest_stock_info['4. close']}")
        print(f"Volume: {latest_stock_info['5. volume']}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage:
# Replace 'YOUR_API_KEY' with your actual Alpha Vantage API key
api_key_example = "YOUR_API_KEY"
stock_symbol_example = "AAPL"  # Change this to the desired stock symbol

# Call the function
get_latest_stock_data(api_key_example, stock_symbol_example)


An error occurred: 'Time Series (1min)'
