## Scraping with Pandas

In [None]:
import requests
from bs4 import BeautifulSoup
import re
    
def findH2(tag):
    """Tell whether *tag* is an ``h2`` element whose text mentions 'requirements'.

    Intended as a filter callable (it is passed to ``soup.find_all`` in this
    file). The text is only inspected when the tag name is ``'h2'``.
    """
    if tag.name != 'h2':
        return False
    return 'requirements' in tag.text


def parse_table(table):
    """Return the text of every cell in *table*, row by row.

    Each ``tr`` found in *table* becomes one inner list holding the stripped
    text of its ``th`` and ``td`` cells, in document order.
    """
    rows = []
    for tr in table.find_all('tr'):
        cells = tr.find_all(['th', 'td'])
        rows.append([cell.get_text().strip() for cell in cells])
    return rows

def get_table(soup):
    """Return the first ``<table>`` that follows the "requirements" ``h2`` heading.

    The heading is located with the ``findH2`` filter; the table is the next
    ``table`` element after the first matching heading.

    Returns:
        The table element, or the integer ``1`` (the failure value this
        function has always used) when no matching heading or no following
        table exists. A message is printed in that case.
    """
    headings = soup.find_all(findH2)
    if not headings:
        print('No tables found, exiting')
        return 1

    # The previous code called find_next() on an undefined name (`sibling`),
    # which raised NameError on every call; search from the heading instead.
    table = headings[0].find_next('table')
    if table is None:
        print('No tables found, exiting')
        return 1

    return table


domain = 'https://en.wikipedia.org'
url = domain + "/w/index.php?title=Category:Visa_requirements_by_nationality"

# Fetch the category page and collect every link to a
# "Visa_requirements_for..." article.
result = requests.get(url)
# Name the parser explicitly so parsing does not depend on which optional
# parsers happen to be installed (and bs4 does not have to guess).
soup = BeautifulSoup(result.content, 'html.parser')

samples = soup.find_all("a", href=re.compile("Visa_requirements_for"))
if not samples:
    # Previously this case surfaced later as a NameError on `link`.
    raise RuntimeError('No "Visa_requirements_for" links found at ' + url)

# Same selection as the old counting loop: the 10th matching link, or the
# last one when fewer than ten were found.
link = samples[:10][-1].get('href')
url = domain + link

print(url)
result = requests.get(url)

data = []
# `if result.status_code:` was true for every response (404 is truthy too);
# only parse the page when the request actually succeeded.
if result.status_code == 200:
    soup = BeautifulSoup(result.content, 'html.parser')

    table = get_table(soup)
    # get_table() returns the integer 1 rather than a table when nothing is
    # found, so only parse objects that can actually be searched.
    table_data = parse_table(table) if hasattr(table, 'find_all') else []

    # The first row is the table header; the rest are one country each.
    for n in table_data[1:]:
        if len(n) < 4:
            # Rows without all four columns cannot be mapped; skip them
            # instead of failing with IndexError.
            continue
        country = {
            'country': n[0],
            'requirement': n[1],
            'allowed_stay': n[2],
            'notes': n[3]
        }
        data.append(country)

for n in data:
    print(n.get('requirement'))
    

In [None]:
import argparse
import pandas as pd
from influxdb import DataFrameClient
from pandas.io.json import json_normalize
from pymongo import MongoClient
from odo import odo
# Connect to MongoDB with pymongo's default connection settings (no host or
# port is passed here — presumably a local server; confirm).
client = MongoClient()

# Handle to the 'stock' database; collections are read from it further down.
mydb = client['stock']


    
    
def get_client(dbname, host='localhost', port=8086, user='root', password='root'):
    """Create an InfluxDB ``DataFrameClient`` for *dbname* and create that database.

    Args:
        dbname: Database the client is bound to; ``create_database`` is
            called for it before the client is returned.
        host: InfluxDB host name.
        port: InfluxDB port.
        user: Login name. Defaults to the previously hard-coded ``'root'``.
        password: Login password. Defaults to the previously hard-coded
            ``'root'``.

    Returns:
        The connected ``DataFrameClient``.
    """
    influx = DataFrameClient(host, port, user, password, dbname)
    influx.create_database(dbname)
    return influx

def main(df, metric, dbname, timecolumn, tagcolumn=None, host='localhost', port=8086):
    """Write *df* to the InfluxDB measurement *metric* in batches of 500 rows.

    Args:
        df: DataFrame to upload. It is re-indexed in place on *timecolumn*.
        metric: Name of the measurement to write to.
        dbname: Target database (created by ``get_client``).
        timecolumn: Column that becomes the index before writing.
        tagcolumn: Optional column name (or list of names) to pass to
            ``write_points`` as ``tag_columns``.
        host: InfluxDB host name, forwarded to ``get_client``.
        port: InfluxDB port, forwarded to ``get_client``.

    Returns:
        The same DataFrame, now indexed on *timecolumn*. (Previously this
        returned None although the caller assigns the result.)
    """
    protocol = 'json'
    chunk_size = 500

    tag_columns = None
    if tagcolumn is not None:
        tag_columns = [tagcolumn] if isinstance(tagcolumn, str) else list(tagcolumn)

    df.set_index(timecolumn, inplace=True)
    # host/port used to be accepted but never forwarded.
    influx = get_client(dbname, host, port)

    for start in range(0, len(df), chunk_size):
        df_subset = df.iloc[start:start + chunk_size]
        # Write to `metric`; the old code read the module-level `coll` here
        # and silently ignored the parameter.
        influx.write_points(
            df_subset,
            metric,
            tag_columns=tag_columns,
            protocol=protocol,
            time_precision='s',
        )
    return df
    


coll = 'historical_day'

# Drop the 'stocks' database before reloading it. `influx` only ever existed
# as a local inside the helper functions, so the old module-level
# `influx.drop_database(...)` raised NameError; get a client explicitly.
get_client('stocks').drop_database('stocks')

# Load the whole collection and drop the bookkeeping columns.
df = pd.DataFrame(list(mydb[coll].find()))
df = df.drop(columns=['_cls', '_id'])

# Multiply by 1.00 so integer-valued prices become floats.
df['open'] = df['open'] * 1.00
display(df.shape)

# main() re-indexes `df` in place; its return value is not assigned back so
# that `df` can never be replaced by None.
main(df, coll, 'stocks', 'datetime')

# df.info()

In [None]:
# mongoexport --db stock --collection historical_minute --type=csv --fields=tradingsymbol,high,open,datetime,volume,instrument_type,close,low,exchange,instrument_token --out historical_minute.csv
# mongoexport --db stock --collection historical_day --type=csv --fields=datetime,close,volume,instrument_type,open,high,tradingsymbol,low,exchange --out historical_day.csv

import numpy as np
import pandas as pd
from time import time

# Stream the CSV export in 500000-row chunks and convert it into a text file
# of InfluxDB line-protocol records named '<measurement>.txt'.
df = pd.read_csv("historical_minute.csv", chunksize=500000)
measurement = 'historical_minute'

# Header written once at the top of the output file.
ddl = """# DDL
CREATE DATABASE stocks

# DML
# CONTEXT-DATABASE: stocks
# CONTEXT-RETENTION-POLICY: autogen
"""
with open(measurement + '.txt', 'w') as f:
    f.write("%s\n" % ddl)


for chunk in df:
    # Phase 1: column conversions.
    t0 = time()
    for column in ('high', 'close', 'low', 'volume'):
        chunk[column] = chunk[column] * 1.00
    # Ask for int64 explicitly: plain `int` is platform-dependent and may be
    # too narrow for the integer timestamps on some platforms.
    chunk['datetime'] = pd.to_datetime(chunk['datetime']).astype('int64')
    timeMulti = time() - t0

    # Phase 2: format one line per row.
    # NOTE(review): instrument_token sits in the tag section, so the trailing
    # 'i' presumably becomes part of the tag value rather than marking an
    # integer — confirm against the InfluxDB line-protocol rules. String
    # values are also written without any escaping.
    t1 = time()
    lines = [
        f'{measurement},instrument_token={token}i close={close},high={high},low={low},instrument_type="{itype}",exchange="{exch}",tradingsymbol="{symbol}",open={open_},volume={volume} {ts}'
        for symbol, close, high, low, itype, exch, token, open_, volume, ts
        in chunk[["tradingsymbol", "close", "high", "low", "instrument_type",
                  "exchange", "instrument_token", "open", "volume", "datetime"]].values
    ]
    timeLoop = time() - t1

    # Phase 3: append the lines to the output file. The write time used to be
    # measured from a timer started before this phase (t3 was never read).
    t3 = time()
    with open(measurement + '.txt', 'a') as f:
        f.writelines("%s\n" % item for item in lines)
    timeFile = time() - t3
    print("txt file writing timeMulti: {}, Loop: {},  full: {} write: {}".format(timeMulti, timeLoop, time() - t0, timeFile))


In [None]:
# IPython shell escapes: show the size of the generated historical_minute.txt
# and print its first lines.
!ls -ltrh historical_minute.txt
!head historical_minute.txt

In [393]:
from numba import jit

# Benchmark input: 10000 rows of standard-normal random values in the
# columns 'v', 'h' and 'l'.
df = pd.DataFrame(np.random.randn(10000, 3), columns=list('vhl'))

# pandas version: running sum of v * (h + l) / 2 divided by the running sum of v.
df['vwap_pandas'] = (df['v'] * (df['h'] + df['l']) / 2).cumsum() / df['v'].cumsum()

@jit
def vwap():
    """Loop version: running sum of v*(h+l)/2 divided by the running sum of v.

    Reads the module-level arrays ``v``, ``h`` and ``l`` and returns an array
    with one value per element.

    NOTE(review): numba is understood to treat global arrays as fixed at
    compile time, so rebinding ``v``/``h``/``l`` later may not be seen — confirm.
    """
    weighted_sum = np.zeros_like(v)
    volume_sum = np.zeros_like(v)
    for k in range(len(v)):
        # For k == 0 the index k-1 is -1, i.e. the last slot, which still
        # holds its initial zero, so both running sums start from 0.
        weighted_sum[k] = weighted_sum[k-1] + v[k] * ( h[k] + l[k] ) / 2.
        volume_sum[k] = volume_sum[k-1] + v[k]
    return weighted_sum / volume_sum

@jit
def np_vwap():
    """Vectorised version: cumsum of v*(h+l)/2 divided by cumsum of v.

    Reads the module-level arrays ``v``, ``h`` and ``l``.
    """
    weighted = v*(h+l)/2
    return np.cumsum(weighted) / np.cumsum(v)

# Raw NumPy arrays behind the three input columns; vwap() and np_vwap() read
# these module-level names.
v, h, l = (df[name].values for name in ('v', 'h', 'l'))

# Same calculation three more ways: plain NumPy, the jitted loop and the
# jitted NumPy expression.
df['vwap_numpy'] = np.cumsum(v*(h+l)/2) / np.cumsum(v)
df['vwap_numba'] = vwap()
df['vwap_np_numba'] = np_vwap()

In [397]:
# Time the four implementations of the same calculation.
%timeit (df.v*(df.h+df.l)/2).cumsum() / df.v.cumsum()  # pandas

%timeit np.cumsum(v*(h+l)/2) / np.cumsum(v)            # numpy

%timeit vwap()                                         # numba, explicit loop

%timeit np_vwap()                                         # numba, jitted numpy expression

613 µs ± 12.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
98.3 µs ± 766 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
38 µs ± 548 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
41.8 µs ± 536 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
