https://catalog.data.gov/dataset?tags=real-time

https://www.google.com/finance
    

In [1]:
import requests, re
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display, HTML
#from pandas.io.json import json_normalize

In [2]:
#https://github.com/redklouds/Finviz-API/blob/master/finviz.py
#request = requests.get('http://finviz.com/')

class FinViz:

    def __init__(self):
        """
        Function Name: Default Constructor
        PRECONDITIONS: None
        POSTCONDITIONS: 
            ->Initializes the object data at the current time
        ASSUMPTIONS: None
        """
        #make the request
        request = requests.get('http://finviz.com/')
        #maxe the soup
        soup = BeautifulSoup(request.text,'html5lib')
        #soup has been brewed
        self._html = soup
        #self._html = BeautifulSoup(requests.get('http://finviz.com/').text,'html5lib')
        self._data = list()


    def refresh(self):
        """
            Function Name: refresh
            Descriptions:
                -> Polls the webservice for new/updated data
            PRECONDITIONS: None
            POSTCONDITIONS: 
                -> Object reinitialized with new data
            ASSUMPTIONS: None
        """        
        self.__reinitialize()
    
    def __reinitialize(self):
        """
            Function Name: (Helper) _reinitialize 
            Description:
                -> calls another get requsests to poll refreshed data manually
            PRECONDITIONS: None
            POSTCONDITIONS: 
                -> Refreshes data
            ASSUMPTIONS: None
        """        
        request = requests.get('http://finviz.com/')
        #maxe the soup
        soup = BeautifulSoup(request.text,'html5lib')
        #soup has been brewed
        self._html = soup
        #self._html = BeautifulSoup(requests.get('http://finviz.com/').text,'html5lib')
        self._data = list()


    def _parseColumnData(self, data):
        """
            Function Name: 
            PRECONDITIONS:
            POSTCONDITIONS:
            ASSUMPTIONS:
        """        
        ret_data = data.findChild()
        ret_data = list(ret_data.children)
        result = list()
        for idx in ret_data:
            try:
                #parse the given data, into 
                result.append(self._parseText(idx.getText()))                
            except:
                #None Type, scrapping None Object, skip.
                pass
        return result  

    def _parseText(self, text):
        """
            Function Name: 
            PRECONDITIONS:
            POSTCONDITIONS:
            ASSUMPTIONS:
        """        
        #use regex for faster parsing of text, searching
        #for numbers and words, better and faster.

        #define regEx pattern
        #"find all alpha upper and lower words
        #+ one and unlimited timees
        #match words who may or may not have spcaes betweent hem and
        #are mixed caes zero to unliited times
        regExText = '[A-Z a-z]+'
        #match a number with at least 1 to unlimited length
        ##the number must have a period and 1 through 2 numbers after it
        #match fully if there is a '+' OR '-' ZERO or 1 times
        regExDigit = '(\+|-?\d+.\d{1,2})'
        #regExDigit = '(\d+.\d{1,2})'
        listText = re.findall(regExText, text)
        listDigit = re.findall(regExDigit, text)
        resultSet = {
                     'index':listText[0],
                     'price':listDigit[0],
                     'change':listDigit[1],
                     'volume':listDigit[2],
                     'signal':listText[1]
                     }
        #return the resulting dictionary
        return resultSet
    
    def getLeftColumn(self):
        """
            Function Name: getLeftColumn
            Description:
                -> get get's the raw data in the left column of the website
            PRECONDITIONS: None
            POSTCONDITIONS:
                ->returns a dictionary containing all the parsed data
                -> in the form:
                    Stock Index
                    Current Price
                    Percent Change
                    Volume
                    Signal
            ASSUMPTIONS: None
        """        
        data = self._getMainColumnData(0)
        a = self._parseColumnData(data)
        return a
        
    def getRightColumn(self):
        """
            Function Name: getLeftColumn
            Description:
                -> get get's the raw data in the right column of the website
            PRECONDITIONS: None
            POSTCONDITIONS:
                ->returns a dictionary containing all the parsed data
                -> in the form:
                    Earnings before
                    Stock Index
                    Current Price
                    Percent Change
                    Volume Signal
            ASSUMPTIONS: None
        """  
        data = self._getMainColumnData(1)
        a = self._parseColumnData(data)
        return a

    def _getMainColumnData(self,column):
        """
            Function Name: _getMainColumnData
            Description:
               -> our switch helper function depending on which 
               parameter we get, this function returns the respective data set
               pertaining to that column
               
            PRECONDITIONS: Integer 1 or 0( left will be 0, 1 will be right)
            POSTCONDITIONS:
                Left:
                Top Gainers
                New high
                Overbought
                Unusual Volume
                Upgrades
                Earnings Before
        
                The column on the right are:
                Top Losers
                New Low
                Oversold
                Most Volitile
                Most Active
                Downgrades
                Earnings After
                Insider Selling
            ASSUMPTIONS: None
        """        
        #scrape the specific elements
        searchResult = self._html.findAll('table', {'class':'t-home-table'})
        
        # we just want the first or second matches
        return searchResult[column]
        
    def marketStatus():
        """
            TODO: get Market status |Positive|Negative
            Function Name: 
            PRECONDITIONS:
            POSTCONDITIONS:
            ASSUMPTIONS:
        """        
        pass
    def getTrends(self):
        left_col = self.getLeftColumn()
        right_col = self.getRightColumn()
        
        combined_dict = list()
  
        for i in left_col:
            combined_dict.append(i)
        for i in right_col:
            combined_dict.append(i)
            
        #print(combined_dict)
        #return_dict = {"right_column": right_col, "left_column":left_col}
        return combined_dict



In [3]:
testObject = FinViz()
data = testObject.getTrends()
df = pd.DataFrame(data)
df['change'] = df['change'].astype(float)
df['price'] = df['price'].astype(float)
df['volume'] = df['volume'].astype(int)
df.head()

Unnamed: 0,change,index,price,signal,volume
0,37.54,PCG,24.4,Top Gainers,54600127
1,35.05,NGVC,22.5,Top Gainers,1734537
2,31.48,TSRO,34.96,Top Gainers,7638300
3,30.26,EMCI,31.25,Top Gainers,541848
4,20.01,ASFI,4.2,Top Gainers,45580


In [4]:
interval = 30
def periodic_work(interval):
    while True:
        testObject.refresh()
        data = testObject.getTrends()
        df2 = pd.DataFrame(data)
        df2['change'] = df2['change'].astype(float)
        df2['price'] = df2['price'].astype(float)
        df2['volume'] = df2['volume'].astype(int)
        df2.head()
        time.sleep(30)


In [5]:
testObject.refresh()
data = testObject.getTrends()
df3 = pd.DataFrame(data)
df3['change'] = df3['change'].astype(float)
df3['price'] = df3['price'].astype(float)
df3['volume'] = df3['volume'].astype(int)
df3.head()

Unnamed: 0,change,index,price,signal,volume
0,37.54,PCG,24.4,Top Gainers,54600127
1,35.05,NGVC,22.5,Top Gainers,1734537
2,31.48,TSRO,34.96,Top Gainers,7638300
3,30.26,EMCI,31.25,Top Gainers,541848
4,20.01,ASFI,4.2,Top Gainers,45580


In [6]:
df.index

RangeIndex(start=0, stop=38, step=1)

In [7]:
import math
import random
import matplotlib.pyplot as plt

price = df['price']
price2 = df2['price']
x = df['index']
fig = plt.figure(figsize=(5,7)) 

plt.scatter(price,x, color = 'r')
plt.scatter(price2,x, color = 'b')
plt.xlabel('Price')
plt.ylabel('Ticker')
#plt.savefig('test.png')
plt.show()


NameError: name 'df2' is not defined

In [None]:
#plot price of df vs df2

import math
import random
import matplotlib.pyplot as plt

price = df['price'] - df2['price']
price2 = df2['price']
x = df['index']
fig = plt.figure(figsize=(5,7)) 

plt.scatter(price, x)
plt.xlabel('Price')
plt.ylabel('Ticker')
#plt.savefig('test.png')
plt.show()


In [None]:
group = df2.groupby(["signal"])
for key,item in group:
    display(HTML(group.get_group(key).to_html()))
    
df2['signal'].value_counts()

In [None]:
%%html
testObject._html
display(HTML(str(testObject._html)))

In [None]:
%whos          #gives variable and types used
%lsmagic       #commands

make a list for each category (ie volume, index, etc) and append the values associated with them. then make a dictionary containing all of the lists and add that to a dataframe