# Forex Factory
---
Scraper & Data Cleansing/Modelling

In [9]:
import logging
import sys
import datetime
import requests
import bs4
import numpy as np
import pandas as pd

### To do
Test the eToro API.

In [None]:
def logger_settings():
    '''
    Returns the 'ForexFactoryLog' logger, configured with a console handler
    and a file handler (both at INFO level, sharing one message format).

    The file handler writes to '<module name>.log'. Calling this function
    more than once returns the same logger without attaching extra handlers.
    '''
    logger = logging.getLogger('ForexFactoryLog')

    # Without an explicit level the logger inherits the root logger's level
    # (WARNING by default), so INFO records would never reach the handlers.
    logger.setLevel(logging.INFO)

    # getLogger() hands back the same object on every call; attaching the
    # handlers again would emit every message several times.
    if logger.handlers:
        return logger

    # Formatting log messages
    log_message = '%(asctime)s :: %(name)s :: %(levelname)-8s :: %(message)s'
    log_format = logging.Formatter(log_message, datefmt='%Y-%m-%d %H:%M:%S')

    # Console and file handlers share the level and the format.
    c_handler = logging.StreamHandler()
    f_handler = logging.FileHandler(f'{__name__}.log')

    for handler in (c_handler, f_handler):
        handler.setLevel(logging.INFO)
        handler.setFormatter(log_format)
        logger.addHandler(handler)

    return logger

In [10]:
def fetch_html(url, headers, logger):
    '''
    Sends a GET request to url and returns the response object.

    Parameters:
    - url: String. Address to request.
    - headers: Dict. HTTP headers sent with the request.
    - logger: Object exposing info() and error() (a Logger or the logging module).

    Returns the requests response when the status code is 200.
    On any failure (connection error, timeout, non-200 status) the error is
    logged and the process exits with status 1.
    '''
    try:
        # A timeout keeps a stalled connection from hanging the run forever.
        resp = requests.get(url, headers=headers, timeout=30)

        # Explicit check instead of assert: asserts are stripped under -O and
        # produce an empty error message.
        if resp.status_code != 200:
            raise RuntimeError(f'unexpected status code {resp.status_code}')

    except Exception as get_error:
        logger.error(f'URL: {url}, Connection error: {str(get_error)}')
        sys.exit(1)

    # Log through the logger that was passed in, not the logging module.
    logger.info(f'{url}. Connection established.')

    return resp


In [11]:
def parse_forexfactory_page(response, logger):
    '''
    Parses raw html from ForexFactory.com/calendar.

    Parameters:
    - response: Response object whose .content holds the page html.
    - logger: Object exposing info() and error().

    Returns the list of <tr> elements of the 'calendar__table' table whose
    first CSS class is 'calendar__row'. If the page cannot be parsed the
    error is logged and the process exits with status 1.
    '''
    try:
        # Naming the parser keeps the result independent of which optional
        # parsers happen to be installed (and silences the bs4 warning).
        soup = bs4.BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='calendar__table')

        # Keep only calendar rows. .get() is used because a <tr> without a
        # class attribute would raise KeyError and abort the whole parse.
        rows = [row for row in table.find_all('tr')
                if (row.get('class') or [None])[0] == 'calendar__row']

    except Exception as initial_parsing_error:
        logger.error(f'Error parsing Forex Factory calendar table. Error: {str(initial_parsing_error)}')
        sys.exit(1)

    logger.info('Forex Factory calendar page parsed successfully.')
    return rows


In [12]:
# Suffix of the impact icon's second CSS class -> impact label.
_IMPACT_BY_COLOR = {
    'ora': 'Medium',
    'yel': 'Low',
    'red': 'High',
    'gra': 'Non-Economic',
}


def _cell_text(row, tag, css_class):
    '''Returns the stripped text of the first `tag` element with `css_class` in row.'''
    return row.find(tag, class_=css_class).text.strip()


def _impact_label(row):
    '''Returns the impact label of row, or None when the colour code is not known.'''
    css_classes = row.find('td', class_='calendar__cell calendar__impact').find('span')['class']
    return _IMPACT_BY_COLOR.get(css_classes[1].split('-')[-1])


def _actual_status(row):
    '''Returns the first CSS class of the "actual" cell's span, or 'same' when it has none.'''
    css_classes = row.find('td', class_='calendar__cell calendar__actual').span['class']
    return css_classes[0] if len(css_classes) > 0 else 'same'


# One entry per output column, in column order:
# (label used in error reports, extractor, exceptions meaning "element absent").
_FIELD_EXTRACTORS = (
    ('date', lambda row: _cell_text(row, 'td', 'calendar__cell calendar__date'), (AttributeError,)),
    ('time', lambda row: _cell_text(row, 'td', 'calendar__cell calendar__time'), (AttributeError,)),
    ('currency', lambda row: _cell_text(row, 'td', 'calendar__cell calendar__currency'), (AttributeError,)),
    ('impact', _impact_label, (AttributeError,)),
    ('event', lambda row: _cell_text(row, 'span', 'calendar__event-title'), (AttributeError,)),
    ('actual_val', lambda row: _cell_text(row, 'td', 'calendar__cell calendar__actual'), (AttributeError,)),
    ('actual_st', _actual_status, (AttributeError, TypeError)),
    ('forecast', lambda row: _cell_text(row, 'td', 'calendar__cell calendar__forecast'), (AttributeError,)),
    ('previous', lambda row: _cell_text(row, 'td', 'calendar__cell calendar__previous'), (AttributeError,)),
)


def parse_forexfactory_calendar(raw_calendar, logger):
    '''
    Parses previously fetched calendar rows and creates a pandas dataframe.

    Parameters:
    - raw_calendar: List of row elements exposing find(tag, class_=...).
    - logger: Object exposing info() and error().

    Returns a dataframe with one line per input row and the columns
    event_date, event_time, currency, impact, event, actual_val,
    actual_status, forecast, previous. A value is None when its element is
    absent from the row, when the impact colour is unknown, or when the
    extraction failed unexpectedly (the latter is also logged as an error).
    '''
    element_errors = []
    table_data = []

    for indx, row in enumerate(raw_calendar):
        record = []
        for field, extractor, missing in _FIELD_EXTRACTORS:
            try:
                value = extractor(row)
            except missing:
                value = None
            except Exception:
                # Record the failure and store None, so the value of the
                # previous row is never carried over into this one.
                element_errors.append([indx, field])
                value = None
            record.append(value)

        table_data.append(record)

    cols = ['event_date', 'event_time', 'currency', 'impact', 'event',
            'actual_val', 'actual_status', 'forecast', 'previous']
    df = pd.DataFrame(data=table_data, columns=cols)

    # Checking whether the number of rows in the html is the same in the table.
    if len(raw_calendar) == len(table_data):
        logger.info(f'Records found: {len(table_data)-1}.')
    else:
        logger.error(f'Parsing Error: Parsed data has {len(raw_calendar)-1} rows, table has {len(table_data)-1}.')

    # Checking all elements were parsed correctly.
    if not element_errors:
        logger.info('Forex Factory calendar table parsed successfully.')
    else:
        # Report each failing (row index, field) pair.
        for err_indx, err_field in element_errors:
            logger.error(f'Data mismatch: {err_indx}, {err_field}')

    return df
            

In [13]:
def save_csv(df, pathfile, logger):
    '''
    Saves df as a semicolon-separated CSV file without the index column.

    Parameters:
    - df: Dataframe to save.
    - pathfile: String. Destination path of the CSV file.
    - logger: Object exposing info() and error().

    If the file cannot be written the error is logged and the process exits
    with status 1.
    '''
    try:
        df.to_csv(pathfile, sep=';', index=False)

    except Exception as csv_save_error:
        logger.error(f'CSV saving error: {str(csv_save_error)}')
        # Non-zero status: a bare sys.exit() would report success to the caller.
        sys.exit(1)

    logger.info(f'CSV file saved: {pathfile}.')
        

In [14]:
def forexfactory_calendar(logger):
    '''
    Downloads last month's Forex Factory calendar, parses it into a
    dataframe and saves it as 'ff_calendar_<UTC timestamp>.csv'.

    Parameters:
    - logger: Object exposing info() and error(); it is used here and handed
      to every helper.

    Returns the calendar dataframe.
    '''
    now = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    logger.info('Forex Factory Calendar.')

    # Setting main variables
    url = 'https://www.forexfactory.com/calendar?month=last'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

    # Fetching calendar data
    response = fetch_html(url, headers, logger=logger)
    raw_calendar = parse_forexfactory_page(response, logger=logger)

    # Creating calendar table
    df_calendar = parse_forexfactory_calendar(raw_calendar, logger=logger)

    # Saving calendar as CSV
    pathfile = f'ff_calendar_{now}.csv'
    save_csv(df_calendar, pathfile, logger=logger)

    return df_calendar


In [15]:
if __name__ == "__main__":

    # Root logger: INFO and above go to ForexFactory.log, which is
    # overwritten on every run (filemode 'w').
    logging.basicConfig(
        filename='ForexFactory.log',
        filemode='w',
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S',
        format='%(asctime)s :: %(name)s :: %(levelname)-8s :: %(message)s',
    )

    # The logging module itself is handed over as the logger object.
    ff_cal = forexfactory_calendar(logging)

In [16]:
# Notebook display of the calendar dataframe assigned in the __main__ cell.
ff_cal

Unnamed: 0,event_date,event_time,currency,impact,event,actual_val,actual_status,forecast,previous
0,,,,,,,,,
1,Tue Aug 1,2:45am,NZD,Low,Building Consents m/m,3.5%,same,,-2.3%
2,,3:01am,GBP,Low,BRC Shop Price Index y/y,7.6%,same,,8.4%
3,,3:30am,JPY,Low,Unemployment Rate,2.5%,better,2.6%,2.6%
4,,4:30am,JPY,Low,Final Manufacturing PMI,49.6,same,49.4,49.4
...,...,...,...,...,...,...,...,...,...
407,,,USD,High,Unemployment Claims,228K,better,236K,232K
408,,,USD,Low,Personal Income m/m,0.2%,worse,0.3%,0.3%
409,,,USD,Low,Personal Spending m/m,0.8%,better,0.7%,0.6%
410,,5:45pm,USD,Low,Chicago PMI,48.7,better,44.3,42.8


In [None]:
# Crawler Class

class Crawler(object):
    """
    Downloads a ForexFactory.com calendar page and extracts its table rows.

    Usage: call retrieve_raw() to fetch the page and keep its calendar rows,
    then parse_raw() to fill self._table_data with one list per row in the
    order [date, time, currency, impact, event, actual value, actual status,
    forecast, previous].
    """

    # User-Agent header sent with the request.
    _REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

    # Suffix of the impact icon's second CSS class -> impact label.
    _IMPACT_BY_COLOR = {
        'ora': 'Medium',
        'yel': 'Low',
        'red': 'High',
        'gra': 'Non-Economic',
    }

    def __init__(self, url, logger):
        """
        Parameters:
        - url: String. ForexFactory.com url to be parsed.
        - logger: Log instance from logging module.
        """
        self.url = url
        self.logger = logger

        # Hidden attributes
        self._rows = None            # calendar rows kept by retrieve_raw()
        self._table_data = []        # parsed rows produced by parse_raw()
        self._element_errors = []    # [row index, field] pairs that failed

    def retrieve_raw(self):
        """
        Retrieves raw html from self.url and stores in self._rows the <tr>
        elements of the 'calendar__table' table whose first CSS class is
        'calendar__row'.

        self._rows is left as None when the status code is not 200 or the
        page cannot be parsed; both cases are logged as errors. Exceptions
        raised by the request itself propagate to the caller.
        """
        # Forget rows from an earlier call so a failed fetch is never
        # followed by a parse of stale data.
        self._rows = None

        resp = requests.get(self.url, headers=self._REQUEST_HEADERS, timeout=30)

        # Checking the connection was successful
        if resp.status_code != 200:
            self.logger.error(f'URL: {self.url}, status code: {resp.status_code}')
            return
        self.logger.info(f'URL: {self.url}, status code: 200')

        # Checking whether the page has the right information
        try:
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            table = soup.find('table', class_='calendar__table')
            # .get() is used because a <tr> without a class attribute would
            # raise KeyError and discard every row.
            self._rows = [row for row in table.find_all('tr')
                          if (row.get('class') or [None])[0] == 'calendar__row']
        except Exception:
            self.logger.error('Exception occurred', exc_info=True)

    @staticmethod
    def _cell_text(row, tag, css_class):
        """Returns the stripped text of the first `tag` element with `css_class` in row."""
        return row.find(tag, class_=css_class).text.strip()

    @classmethod
    def _impact_label(cls, row):
        """Returns the impact label of row, or None when the colour code is not known."""
        css_classes = row.find('td', class_='calendar__cell calendar__impact').find('span')['class']
        return cls._IMPACT_BY_COLOR.get(css_classes[1].split('-')[-1])

    @staticmethod
    def _actual_status(row):
        """Returns the first CSS class of the "actual" cell's span, or 'same' when it has none."""
        css_classes = row.find('td', class_='calendar__cell calendar__actual').span['class']
        return css_classes[0] if len(css_classes) > 0 else 'same'

    def _parse_row(self, indx, row):
        """
        Returns the nine values of one row as a list. A value is None when
        its element is absent; an unexpected failure also stores None and
        records [indx, field] in self._element_errors.
        """
        text = self._cell_text
        # (label used in error reports, extractor, exceptions meaning "element absent")
        extractors = (
            ('date', lambda: text(row, 'td', 'calendar__cell calendar__date'), (AttributeError,)),
            ('time', lambda: text(row, 'td', 'calendar__cell calendar__time'), (AttributeError,)),
            ('currency', lambda: text(row, 'td', 'calendar__cell calendar__currency'), (AttributeError,)),
            ('impact', lambda: self._impact_label(row), (AttributeError,)),
            ('event', lambda: text(row, 'span', 'calendar__event-title'), (AttributeError,)),
            ('actual_val', lambda: text(row, 'td', 'calendar__cell calendar__actual'), (AttributeError,)),
            ('actual_st', lambda: self._actual_status(row), (AttributeError, TypeError)),
            ('forecast', lambda: text(row, 'td', 'calendar__cell calendar__forecast'), (AttributeError,)),
            ('previous', lambda: text(row, 'td', 'calendar__cell calendar__previous'), (AttributeError,)),
        )

        record = []
        for field, extract, missing in extractors:
            try:
                value = extract()
            except missing:
                value = None
            except Exception:
                # Store None so the previous row's value is never reused.
                self._element_errors.append([indx, field])
                value = None
            record.append(value)
        return record

    def parse_raw(self):
        """
        Parses the rows kept by retrieve_raw() into self._table_data and
        logs a summary. Logs an error and returns when there are no rows.
        """
        if self._rows is None:
            self.logger.error('No rows to parse: retrieve_raw() has not stored any rows.')
            return

        # Start clean so that calling parse_raw() twice does not duplicate rows.
        self._table_data = []
        self._element_errors = []

        for indx, row in enumerate(self._rows):
            self._table_data.append(self._parse_row(indx, row))

        # Checking whether the number of rows in the html is the same in the table.
        if len(self._rows) == len(self._table_data):
            self.logger.info(f'New entries: {len(self._table_data)-1}')
        else:
            self.logger.error(f'Parsing Error: Parsed data has {len(self._rows)-1} rows, table has {len(self._table_data)-1}')

        # Checking all elements were parsed correctly.
        if not self._element_errors:
            self.logger.info('No errors while parsing rows/elements')
        else:
            # Report each failing (row index, field) pair.
            for err_indx, err_field in self._element_errors:
                self.logger.error(f'Data Mismatch: {err_indx}, {err_field}')


In [None]:
# Alternative target: today's calendar only.
#url = 'https://www.forexfactory.com/calendar?day=today'
url = 'https://www.forexfactory.com/calendar?month=last'

# Crawler needs a logger object and no module-level `logger` exists in this
# notebook, so one is built here.
logger = logger_settings()

scraper = Crawler(url, logger)
scraper.retrieve_raw()
scraper.parse_raw()


In [None]:
# Rows parsed by the crawler (read from its internal attribute).
table_data = scraper._table_data

# Column names, in the order the crawler stores the values of each row.
table_headers = [
    'Date', 'Time', 'Currency', 'Impact', 'Event',
    'Actual', 'ActualStatus', 'Forecast', 'Previous',
]

df = pd.DataFrame(table_data, columns=table_headers)
df

In [None]:
# Write the current dataframe to a comma-separated CSV file, without the index column.
df.to_csv('forex_factory_202307.csv', index=False)

In [None]:
# DataFrame Class

class Calendar_Table(object):
    '''
    Placeholder for the calendar dataframe class; no behaviour is implemented yet.
    '''




In [None]:
# Current UTC time rendered as a 'YYYYmmddTHH:MM:SSZ' string.
datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H:%M:%SZ")

In [None]:
# Calendar page for a single day; the commented lines are alternative targets.
url = 'https://www.forexfactory.com/calendar?day=aug20.2023'
#url = 'https://www.forexfactory.com/calendar?day=today'
#url = 'https://www.forexfactory.com/calendar?week=last'
#url = 'https://www.forexfactory.com/calendar?week=next'
# User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

# Fetch the page and print the response object.
r = requests.get(url, headers=headers)
print(r)

# NOTE: the times shown on the page are US/Canada Eastern Time.

In [None]:
# Naming the parser keeps the result independent of which optional parsers
# are installed.
soup = bs4.BeautifulSoup(r.text, 'html.parser')
table = soup.find('table', class_='calendar__table')
rows_raw = table.find_all('tr')

# Keep only calendar rows; .get() avoids a KeyError on <tr> elements that
# carry no class attribute.
rows = [row for row in rows_raw if (row.get('class') or [None])[0] == 'calendar__row']

In [None]:
data = []

# Suffix of the impact icon's second CSS class -> impact label.
impact_labels = {'ora': 'Medium', 'yel': 'Low', 'red': 'High', 'gra': 'Non-Economic'}


def _text_or_blank(row, tag, css_class):
    '''Returns the stripped text of the matching element, or '' when row has no such element.'''
    try:
        return row.find(tag, class_=css_class).text.strip()
    except AttributeError:
        return ''


for row in rows:

    date = _text_or_blank(row, 'td', 'calendar__cell calendar__date')
    time = _text_or_blank(row, 'td', 'calendar__cell calendar__time')
    currency = _text_or_blank(row, 'td', 'calendar__cell calendar__currency')

    # Impact
    try:
        impact_color = row.find('td' , class_='calendar__cell calendar__impact').find('span')['class'][1].split('-')[-1]
    except (AttributeError, TypeError, KeyError, IndexError):
        impact = ''
    else:
        # An unknown colour yields '' instead of silently reusing the label
        # of the previous row.
        impact = impact_labels.get(impact_color, '')

    event = _text_or_blank(row, 'span', 'calendar__event-title')
    actual_val = _text_or_blank(row, 'td', 'calendar__cell calendar__actual')

    # Actual Status: first CSS class of the cell's span, 'same' when the span
    # has no class, '' when the cell or the span is absent.
    try:
        actual_status = row.find('td' , class_='calendar__cell calendar__actual').span['class']
        if len(actual_status) > 0:
            actual_status = actual_status[0]
        else:
            actual_status = 'same'
    except (AttributeError, TypeError):
        actual_status = ''

    forecast = _text_or_blank(row, 'td', 'calendar__cell calendar__forecast')
    previous = _text_or_blank(row, 'td', 'calendar__cell calendar__previous')

    data.append([date, time, currency, impact, event, actual_val, actual_status, forecast, previous])

df = pd.DataFrame(data=data)
df

In [None]:
# Number of calendar rows currently held in `rows`.
len(rows)

# Yahoo Finance
## 10-Year Treasury Bond Yield

In [None]:
# Three candidate data sources; each assignment overwrites the previous one,
# so only the last url (Yahoo Finance v7 CSV download for ^TNX) is requested.
url = 'https://query1.finance.yahoo.com/v8/finance/chart/%5ETNX?region=US&lang=en-US&includePrePost=false&interval=1mo&useYfid=true&range=1d&corsDomain=finance.yahoo.com&.tsrc=finance'
url = 'https://www.marketwatch.com/investing/bond/tmubmusd10y/downloaddatapartial?startdate=07/23/2023%2000:00:00&enddate=08/22/2023%2023:59:59&daterange=d30&frequency=p1d&csvdownload=true&downloadpartial=false&newdates=false&countrycode=bx'
url = 'https://query1.finance.yahoo.com/v7/finance/download/%5ETNX?period1=1661208486&period2=1692744486&interval=1d&events=history&includeAdjustedClose=true'
# User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

# Fetch the data and print the response object.
r = requests.get(url, headers=headers)
print(r)

In [None]:
import io

# Let pandas parse the CSV payload instead of splitting it by hand: this
# copes with a trailing newline or \r\n line endings and turns 'null' cells
# into missing values.
df = pd.read_csv(io.StringIO(r.text), na_values=['null', 'NULL', 'Null'])

# Changing Date column without calling it using its name
df[df.columns[0]] = pd.to_datetime(df[df.columns[0]], format='%Y-%m-%d')

# Every remaining column must be numeric; to_numeric raises on anything that is not.
df[df.columns[1:]] = df[df.columns[1:]].apply(pd.to_numeric)
df

In [None]:
# Line plot of the 'Adj Close' column against the 'Date' column.
df.plot(x='Date', y='Adj Close')

# Stocks Earnings
## Nvidia

In [None]:
# Yahoo Finance earnings calendar page for the NVDA symbol.
url = 'https://finance.yahoo.com/calendar/earnings?symbol=nvda'
# User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}

# Fetch the page and print the response object.
r = requests.get(url, headers=headers)
print(r)

In [None]:
# Parse the earnings page and locate the table with CSS class 'W(100%)'.
soup = bs4.BeautifulSoup(r.content)
table = soup.find('table', class_='W(100%)')

# Header cells first, then every data cell of the table (flat list).
table_headers, row_elements = table.find_all('th'), table.find_all('td')

In [None]:
# Text of each header cell of the earnings table.
# NOTE: this rebinds `headers`, the name the request cells use for the HTTP
# headers dict.
headers = [header.text for header in table_headers]
headers

In [None]:
# Flat list with the text of every data cell.
elements = list(map(lambda cell: cell.text, row_elements))

# One column per header text; the row count follows from the number of cells.
table_cols = len(headers)
elements_arr = np.array(elements)
table_len = len(elements_arr) // table_cols

# Fold the flat cell list into a (rows, columns) array and preview the first five rows.
rows = elements_arr.reshape(table_len, table_cols)
rows[:5]

In [None]:
# Earnings table as a dataframe, using the scraped header texts as column names.
earnings = pd.DataFrame(data=rows, columns=headers)
earnings

In [None]:
# Request the calendar page without any custom headers and print the response object.
url = 'https://www.forexfactory.com/calendar'

r = requests.get(url)
print(r)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Interactive input is disabled; the date below is hard-coded instead.
#day = input('Enter Day: ')
#month = input('Enter Month: ')
#year = input('Enter Year: ')

day, month, year = ('17', 'aug', '2023')

# NOTE(review): the second assignment overwrites the day-specific url, so the
# plain calendar page is the one requested while day/month/year are still used
# for the output file name — confirm this is intended.
url = f'https://www.forexfactory.com/calendar?day={month}{day}.{year}'
url = f'https://www.forexfactory.com/calendar'
# User-Agent header sent with the request.
useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
headers = {'User-Agent': useragent}

In [None]:
# Notebook display of the url that will be requested.
url

In [None]:
Total_Data = []

def getdata(link):
    '''
    Downloads the calendar page at link and appends one dict per calendar
    row to the module-level Total_Data list.

    Each dict has the keys 'Time', 'Currency', 'Impact', 'Event Title',
    'Actual', 'Forecast' and 'Previous'. The first matched row is skipped.
    The request uses the module-level `headers`. Returns None.
    '''
    impact_labels = {'ora': 'Medium', 'yel': 'Low', 'red': 'High', 'gra': 'Non-Economic'}

    # Exceptions raised when an expected element or CSS class is absent.
    # Named explicitly so that unrelated failures are no longer swallowed.
    missing = (AttributeError, TypeError, KeyError, IndexError)

    def cell_text(row, tag, css_class, default=''):
        '''Returns the stripped text of the matching element, or default when it is absent.'''
        try:
            return row.find(tag, class_=css_class).text.strip()
        except missing:
            return default

    r = requests.get(link, headers=headers, timeout=30).text
    soup = BeautifulSoup( r , 'lxml')
    table = soup.find('table' , class_='calendar__table')
    tr = table.select('tr[class*="calendar__row calendar__row"]')

    # The first matched row is skipped.
    for row in tr[1:]:

        # NOTE: the date cell and the event id are not extracted; neither is
        # part of the stored record.

        # Impact
        try:
            impact_color = row.find('td' , class_='calendar__cell calendar__impact').find('span')['class'][1].split('-')[-1]
        except missing:
            impact = ''
        else:
            # An unknown colour yields '' instead of reusing the label of the
            # previous row.
            impact = impact_labels.get(impact_color, '')

        myData = {
            'Time' : cell_text(row, 'td', 'calendar__cell calendar__time'),
            'Currency' : cell_text(row, 'td', 'calendar__cell calendar__currency'),
            'Impact' : impact,
            'Event Title' : cell_text(row, 'span', 'calendar__event-title'),
            # A missing "actual" cell is stored as a single space.
            'Actual' : cell_text(row, 'td', 'calendar__cell calendar__actual', default=" "),
            'Forecast' : cell_text(row, 'td', 'calendar__cell calendar__forecast'),
            'Previous' : cell_text(row, 'td', 'calendar__cell calendar__previous')
        }
        Total_Data.append(myData)

    return

# Scrape the page at `url`; getdata() fills the module-level Total_Data list.
print('Scrapping the Data . . . . . .')
getdata(url)

# Build a dataframe from the collected records and store it as a CSV file
# named after the configured day, month and year.

print('Scrapping Completed . . . . . .')
df = pd.DataFrame(Total_Data)
df.to_csv(f'{day}.{month}.{year}-Forex_Factory.csv' , index=False)

print('Data Stored to a CSV . . . . ')

In [None]:
# Notebook display of the dataframe built from Total_Data.
df