In [1]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# OTC Markets page that the browser opens and that gets scraped below
# (the URL path selects ALL / active / dollarVolume).
url = "https://www.otcmarkets.com/market-activity/current-market/ALL/active/dollarVolume"

## Scrape the data with the Selenium library

In [3]:
# Location of the ChromeDriver executable.
#   macOS:   "/usr/local/bin/chromedriver"
#   Windows: use "chromedriver.exe" instead.
chromedriver_path = '/usr/local/bin/chromedriver'
driver = webdriver.Chrome(chromedriver_path)

# Open the page in the controlled browser and parse a first snapshot of its HTML.
# At this stage the web table only contains its first 25 rows.
driver.get(url)
initial_html = driver.page_source
soup = BeautifulSoup(initial_html, 'lxml')

In [4]:
# Page title, kept for reference.
title = soup.title.text

# Read how many rows the full web table has.
# NOTE(review): the count is taken from characters 17-21 of the counter div's text,
# which assumes a fixed-length prefix and a five-digit total — confirm against the live page.
counter_div = soup.find("div", attrs={"class": "_12doApv4EC"})
most_active_stocks_no = counter_div.text[17:22]
print(most_active_stocks_no)

18509


In [5]:
# Only the top 4000 rows (most active companies by volume) are collected.
# The table starts with 25 rows; assuming every click on the "MORE" button appends
# another 25, 4000 // 25 - 1 = 159 clicks are required.
# This cell takes around 6 minutes to run.
TARGET_ROWS = 4000
ROWS_PER_PAGE = 25
MORE_BUTTON_CLASS = '_2sFaw3zGf1'

for _ in range(TARGET_ROWS // ROWS_PER_PAGE - 1):
    # Look the button up again on every pass, as the original loop did.
    # find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name, which was
    # removed in Selenium 4.3; this form works on Selenium 3 and 4 alike.
    more_button = driver.find_element(By.CLASS_NAME, MORE_BUTTON_CLASS)
    more_button.click()

    # Fixed pause so the newly requested rows have time to load before the next click.
    time.sleep(2)


In [6]:
#Need to run soup agian to get top 4000 row data from the browser which is clicked for several times by code.
soup = BeautifulSoup(driver.page_source,'lxml')

In [7]:
# Divs that hold the TIER icon of each row.
tier_column = soup.find_all("div", {"class": "_2_IqcDGekT"})
# Every <tr> element found on the page.
table_rows = soup.find_all('tr')

# The tier code (ps, qb, qx, ...) is read from the icon's src attribute:
# the 4th "/"-separated piece, with everything from the first "." dropped.
TIER_list = [
    tier_div.img["src"].split("/")[3].split(".")[0]
    for tier_div in tier_column
]

# One list of cell texts per <tr>; a row without any <td> produces an empty list.
OTC_list = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows
]

# The HTML has been parsed, so the browser is no longer needed.
driver.quit()

In [8]:
# Turn the scraped tier codes into a one-column DataFrame whose row labels start at 1.
TIER = pd.DataFrame({"TIER": TIER_list})
TIER.index = TIER.index + 1
TIER.head()

Unnamed: 0,TIER
1,ps
2,ps
3,ps
4,qx
5,qx


In [9]:
# Turn the scraped table rows into a DataFrame.
otc_columns = ["SYMBOL", "PRICE", "CHANGE_PERCENT", "VOL", "SHARE_VOL", "TRADES"]
# The first scraped row is skipped — presumably the table's header row, which has
# no <td> cells; verify against the page. The remaining row labels start at 1.
OTC = pd.DataFrame(OTC_list, columns=otc_columns).iloc[1:]
OTC.head()

Unnamed: 0,SYMBOL,PRICE,CHANGE_PERCENT,VOL,SHARE_VOL,TRADES
1,TCEHY,49.76,1.1,100474741,2015981,5064
2,IDEXY,14.7,0.62,56714077,3872068,302
3,SFTBY,55.88,4.84,48413077,875543,2476
4,RHHBY,33.23,0.97,46887527,1415561,1369
5,GBTC,7.325,6.93,45268895,6058170,7443


In [10]:
# Combine tiers and table data into one DataFrame by matching their row labels
# (inner join, which is also the default of pd.merge).
OTC_Markets = TIER.merge(OTC, how="inner", left_index=True, right_index=True)
OTC_Markets.head()

Unnamed: 0,TIER,SYMBOL,PRICE,CHANGE_PERCENT,VOL,SHARE_VOL,TRADES
1,ps,TCEHY,49.76,1.1,100474741,2015981,5064
2,ps,IDEXY,14.7,0.62,56714077,3872068,302
3,ps,SFTBY,55.88,4.84,48413077,875543,2476
4,qx,RHHBY,33.23,0.97,46887527,1415561,1369
5,qx,GBTC,7.325,6.93,45268895,6058170,7443


In [11]:
# Keep only the rows whose TIER is one of the two we are interested in: qb and qx.
wanted_tiers = ['qb', 'qx']
OTC_qxqb_Markets = OTC_Markets[OTC_Markets["TIER"].isin(wanted_tiers)]
OTC_qxqb_Markets.head()

Unnamed: 0,TIER,SYMBOL,PRICE,CHANGE_PERCENT,VOL,SHARE_VOL,TRADES
4,qx,RHHBY,33.23,0.97,46887527,1415561,1369
5,qx,GBTC,7.325,6.93,45268895,6058170,7443
13,qb,FMCKJ,11.25,2.27,12268924,843803,293
20,qb,FNMAS,11.315,1.57,8654586,769755,695
21,qx,CURLF,10.7564,-3.22,8470204,772704,2774


In [12]:
# Renumber the filtered rows consecutively, starting at 1.
OTC_qxqb_Markets = OTC_qxqb_Markets.reset_index(drop=True)
OTC_qxqb_Markets.index = OTC_qxqb_Markets.index + 1
OTC_qxqb_Markets.head()

Unnamed: 0,TIER,SYMBOL,PRICE,CHANGE_PERCENT,VOL,SHARE_VOL,TRADES
1,qx,RHHBY,33.23,0.97,46887527,1415561,1369
2,qx,GBTC,7.325,6.93,45268895,6058170,7443
3,qb,FMCKJ,11.25,2.27,12268924,843803,293
4,qb,FNMAS,11.315,1.57,8654586,769755,695
5,qx,CURLF,10.7564,-3.22,8470204,772704,2774


In [13]:
# Export the filtered table to a CSV file as a backup (row labels are not written).
output_dir = "data"
# to_csv does not create missing folders, so make sure the target directory exists;
# without this the export fails on a fresh checkout that has no "data" folder.
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, "OTCmarkets4000_qx_qb.csv")
OTC_qxqb_Markets.to_csv(file_path, index=False)

## Load the DataFrame into MongoDB

In [14]:
import pandas as pd
import json
import os
import datetime
import pymongo
import pprint

In [15]:
# Connect to the MongoDB server addressed by the connection string below.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Database and collection that receive the scraped rows.
db = client["otc_market_finance_data"]
collection = db["otc_market_most_active_stocks"]

# Drop the existing collection (not the whole database) so that every run
# starts from an empty collection.
collection.drop()

In [16]:
# JSON round-trip of the transposed table: one dict per row.
# Note: this value is not what gets inserted below.
records = json.loads(OTC_qxqb_Markets.T.to_json()).values()

# Insert one document per DataFrame row into the collection.
collection.insert_many(OTC_qxqb_Markets.to_dict(orient='records'))

<pymongo.results.InsertManyResult at 0x11f409ec8>