In [None]:
"""
----------------------------------------------
I.  Update CMC info from the API
----------------------------------------------
 1. get top 600 info from CMC
         [CMC_id, name, rank, ticker, source_code_unverified]
 2. get top 600 website info
         [primary_web, secondary_web]
 3. merge the two dataframes as 'dfnew'
 4. write 'cmc_top_600.csv'
----------------------------------------------
II. Merge with manually updated Source Code info
----------------------------------------------
 5. read 'cmc_data_manually_verified.csv'
 6. merge the new ['source_code_unverified'] with the
    manually updated ['source_code']
 7. remove duplicates
 8. write 'cmc_data_600_updated_TIMESTAMP.csv'
----------------------------------------------
III. Tidy up
---------------------------------------------- 
 9. handle rank > 600 ?
 10. manually verify new repos
----------------------------------------------
IV. Send to prepare_repos.py to get forge & repo
---------------------------------------------- 
"""

In [None]:
# This script will contact CoinMarketCap's API to gather info about cryptocurrency projects
# 
# It will merge two calls to create a dataframe/csv listing project's:
#  <id>, <rank>, <ticker>, <name>, & <source code location>
# 
# You must have a coinmarketcap dev key to use their API
#
from datetime import datetime
import time
import pandas as pd 
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import os

# CoinMarketCap developer API key, sent as the X-CMC_PRO_API_KEY header.
# Read it from the CMC_PRO_API_KEY environment variable so the secret does not
# have to be committed with the script; the 'xxx' placeholder is used only when
# the variable is not set.
KEY = os.environ.get('CMC_PRO_API_KEY', 'xxx')

In [None]:
# Fetch the top 600 listings by market cap in a single request
# (200 listings per 1 credit; 333 credits/day).
# The endpoint's default sort order is market cap.
# see: https://coinmarketcap.com/api/documentation/v1/#operation/getV1CryptocurrencyListingsLatest
url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'
parameters = {
    'start': '1',
    'limit': '600',
    'convert': 'USD',
}
headers = {
    # the standard HTTP header is 'Accept' (not 'Accepts')
    'Accept': 'application/json',
    'X-CMC_PRO_API_KEY': KEY,
}

session = Session()
session.headers.update(headers)

try:
    # requests never raises Timeout unless a timeout is given; without one a
    # stalled connection would block the script indefinitely
    response = session.get(url, params=parameters, timeout=30)
    # surface HTTP-level failures (bad key, rate limit, ...) here instead of
    # as a KeyError on 'data' in a later step
    response.raise_for_status()
    data_listing = json.loads(response.text)
except (ConnectionError, Timeout, TooManyRedirects) as e:
    print(e)
    # re-raise: carrying on would leave data_listing undefined (or stale from
    # an earlier run of this cell) for the steps that depend on it
    raise

In [None]:
# Unpack the listings payload into four parallel lists, one entry per
# project, all in the order the API returned them:
#   cmc_id   <- 'id'        (CMC unique id)
#   name     <- 'name'
#   ticker   <- 'symbol'
#   cmc_rank <- 'cmc_rank'
listings = data_listing['data']
cmc_id = [entry['id'] for entry in listings]
name = [entry['name'] for entry in listings]
ticker = [entry['symbol'] for entry in listings]
cmc_rank = [entry['cmc_rank'] for entry in listings]

In [None]:
# Build the comma-separated list of CMC ids ("1,1027,825,...") that is passed
# to the API as the 'id' parameter. An empty cmc_id list yields ''.
IDString = ','.join(str(project_id) for project_id in cmc_id)
#print(IDString)

#testIDString=''
#for id in testID:
#    testIDString = testIDString + str(id) +','
#testIDString = testIDString[:-1]
#print(testIDString)

In [None]:
# Fetch the metadata (urls etc.) for every project id gathered above.
# see: https://coinmarketcap.com/api/documentation/v1/#operation/getV2CryptocurrencyInfo
# Cost: 1 credit per 100 cryptocurrencies (rounded up).
# The endpoint accepts a string of IDs, slugs, or symbols (tickers); here
# IDString holds the ids collected from the previous API call.
url = 'https://pro-api.coinmarketcap.com/v2/cryptocurrency/info'
parameters = {
    'id': IDString,
}
headers = {
    # the standard HTTP header is 'Accept' (not 'Accepts')
    'Accept': 'application/json',
    'X-CMC_PRO_API_KEY': KEY,
}
session = Session()
session.headers.update(headers)

try:
    # requests never raises Timeout unless a timeout is given; without one a
    # stalled connection would block the script indefinitely
    response = session.get(url, params=parameters, timeout=30)
    # surface HTTP-level failures (bad key, rate limit, ...) here instead of
    # as a KeyError on 'data' in a later step
    response.raise_for_status()
    metadata = json.loads(response.text)
except (ConnectionError, Timeout, TooManyRedirects) as e:
    print(e)
    # re-raise: carrying on would leave metadata undefined (or stale from an
    # earlier run of this cell) for the steps that depend on it
    raise


In [None]:
# ------------------------------------------------------------
# --G E T  S O U R C E  C O D E-------------------------------
# ------------------------------------------------------------
# Read data::<key>::urls::source_code for every project in the metadata.
# The payload is keyed by CMC unique id, so build rows of
#   [id, source_code]
# that can be merged back onto the listing data by id.
#
# 'source_code' is a list of URLs. Join it with ', ' rather than slicing
# str(list): slicing only works for a single URL, leaves stray quotes in
# multi-URL lists and repr-escapes the URLs. A missing/empty list gives ''.
source = [
    [project_id, ', '.join(info['urls']['source_code'] or [])]
    for project_id, info in metadata['data'].items()
]

# The source location comes straight from the API, i.e. it has not been
# verified by a human yet. CMC_id holds the payload keys, which are strings.
dfsource = pd.DataFrame(data=source, columns=['CMC_id', 'source_code_unverified'])

In [None]:
# Assemble the listing table from the four parallel lists.
df = pd.DataFrame({
    'CMC_id': cmc_id,
    'CMC_rank': cmc_rank,
    'name': name,
    'ticker': ticker,
})

# dfsource keys are strings while df holds ints, so merge a string-cast copy
# of df; df itself keeps its original dtypes.
dfout = df.astype(str).merge(dfsource, on=['CMC_id'])


In [None]:
# The results are written to CSV files for use in later steps.
# If running on Google Colab, authenticate and mount Google Drive first
# so the files can be copied there:
# from google.colab import drive
# drive.mount('drive')

In [None]:
# Persist the merged listing / source-code table as 'cmc_data.csv'.
# On Colab the file can then be copied to the mounted Drive, e.g.
#   !cp cmc_data.csv "drive/My Drive/PhDstuffs"
dfout.to_csv(path_or_buf='cmc_data.csv', encoding='utf-8', index=False)

In [None]:
# ------------------------------------------------------------
# --G E T  W E B S I T E--------------------------------------
# ------------------------------------------------------------
# Read data::<key>::urls::website for every project in the metadata.
# The payload is keyed by CMC unique id, so build rows of
#   [id, website]
# The website list is stored as its string form with the surrounding
# brackets trimmed (entries stay quoted and ', '-separated); the next step
# splits that string into separate columns.
site = [
    [project_id, str(info['urls']['website'])[1:-1]]
    for project_id, info in metadata['data'].items()
]

# one row per project, ready to be merged on CMC_id
dfweb = pd.DataFrame(data=site, columns=['CMC_id', 'website'])

In [None]:
# Some listings have multiple sites: split the ', '-separated website string
# into a primary (first) and a secondary (second) site.
#
# reindex(columns=[0, 1]) pins the result to exactly two columns:
#   * when no project has a second site, web_secondary is still created
#     (all missing) instead of raising a KeyError below;
#   * sites beyond the second are dropped rather than leaking into the
#     output as integer-named columns.
site_parts = dfweb['website'].str.split(', ', expand=True).reindex(columns=[0, 1])
site_parts.columns = ['web_primary', 'web_secondary']

# Remove the single quotes left over from the list's string form. The result
# is assigned back rather than using inplace=True on a selected column, which
# does not modify the parent frame when pandas Copy-on-Write is active.
site_parts = site_parts.replace("'", "", regex=True)

dfweb = pd.concat([dfweb[['CMC_id']], site_parts], axis=1)

In [None]:
# Save the website table to '600_websites.csv'; rows are keyed by 'CMC_id'.
dfweb.to_csv(path_or_buf='600_websites.csv', encoding='utf-8', index=False)

In [None]:
# ------------------------------------------------------------
# --M E R G E   S O U R C E   w   W E B S I T E---------------
# ------------------------------------------------------------
# Outer-join the listing/source table with the website table on CMC_id,
# keeping ids that appear in either one.
dfnew = dfout.merge(dfweb, how='outer', on=['CMC_id'])

In [None]:
# Cast the id and rank columns to integers; this makes later comparisons
# (and the merge with the manually verified file) easier.
dfnew = dfnew.astype({'CMC_id': 'int', 'CMC_rank': 'int'})

# Save to 'cmc_top_600.csv'; rows are keyed by 'CMC_id'.
dfnew.to_csv(path_or_buf='cmc_top_600.csv', encoding='utf-8', index=False)

In [None]:
# ------------------------------------------------------------
# --U P D A T E   w   M A N U A L L Y   V E R I F I E D-------
# Load the man[ual] file, in which the source code repos have been
# verified by hand, so it can be merged with the freshly fetched data.
# ------------------------------------------------------------
# These columns may be outdated in the manual file and also exist in the new
# data; dropping them here keeps the upcoming merge free of clashing columns.
stale_columns = ['CMC_rank', 'name', 'ticker', 'web_primary', 'web_secondary']
dfman = pd.read_csv('cmc_data_manually_verified.csv').drop(columns=stale_columns)

In [None]:
# Outer-join the new data with the manually verified data on CMC_id.
# Columns that may have changed since the manual file was written
# (rank, name, ticker, websites) were dropped from dfman beforehand, so only
# CMC_id is shared between the two frames.
dfm = dfnew.merge(dfman, how='outer', on=['CMC_id'])

# CMC_rank cannot be cast back to int yet: some rows are NaN after the
# outer join.
# dfm['CMC_rank'] = dfm['CMC_rank'].astype('int')

In [None]:
# Refresh 'source_code' from the newly fetched 'source_code_unverified' for
# every row that has not been manually verified, and flag those rows for
# manual verification.

# rows not marked as verified: anything but 'y', including rows with no
# check_source value at all
unverified = dfm['check_source'] != 'y'

# Only overwrite where this run actually fetched a location. Rows that exist
# only in the manual file have no fetched value (NaN after the outer merge);
# copying it would wipe the source_code already recorded for them.
refreshed = unverified & dfm['source_code_unverified'].notna()
dfm.loc[refreshed, 'source_code'] = dfm.loc[refreshed, 'source_code_unverified']

# mark every unverified row for manual verification
dfm.loc[unverified, 'check_source'] = 'n'

# remove the unverified column
dfm.drop(['source_code_unverified'], axis=1, inplace=True)

In [None]:
# ------------------------------------------------------------
# --C H E C K   F O R   D U P L I C A T E S-------------------
# The CMC_id column *may* contain duplicate entries (CAKE, FUN!?)
# ------------------------------------------------------------
# Rows whose CMC_id occurs again further down; the last occurrence of each
# id is the one that is kept.
duplicate_rows = dfm[dfm.duplicated(['CMC_id'], keep='last')]

if not duplicate_rows.empty:
    num = len(duplicate_rows)
    print(num, 'duplicates were found and deleted based on CMC_id:')
    print(duplicate_rows)

    # drop the earlier occurrences, keeping the last entry per CMC_id
    dfm.drop_duplicates(subset=['CMC_id'], keep='last', inplace=True)
else:
    print('No duplicate CMC_ids were found')


In [None]:
# ------------------------------------------------------------
# --O U T P U T   D A T A   F I L E---------------------------
# ------------------------------------------------------------
# Stamp the output file with the current local time, formatted as
# dd-mm-yyyy-HH-MM-SS.
date_time = datetime.now()
str_date_time = date_time.strftime("%d-%m-%Y-%H-%M-%S")
str_filename = 'cmc_data_600_updated_{}.csv'.format(str_date_time)

# Save to 'cmc_data_600_updated_TIMESTAMP.csv'.
dfm.to_csv(path_or_buf=str_filename, encoding='utf-8', index=False)

print('Data written to: {}'.format(str_filename))