# Crypto scrape and analysis project

## Aim of the project
- As the cryptocurrency environment is expanding very fast, it has become difficult to gather meaningful information in one place about any project.
- Input : a crypto token name (available on CoinGecko)
- Outputs : 
    1. Important data about the token 
        - price
        - price evolution for main periods
        - market capitalization
        - daily trading volume
        - sentiment of people
    2. News analysis (using NLP)
    3. Designed infographic gathering relevant information
- Long term goal : evolution and comparison of a portfolio (Ethereum, Solana, Elrond, Uniris...) 

## 1. Retrieve CoinGecko's data

In [1]:
import pandas as pd

from datetime import datetime
from selenium import webdriver

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer #Tokenizer that takes only alphanumerical characters (no punctuation)
from nltk.util import ngrams

from bs4 import BeautifulSoup
import requests

In [57]:
def extract_good_sentiment_pct(sentence):
    """Extract the share of good sentiment from the scraped string.

    Every digit character of ``sentence`` is kept, in order, and the
    resulting integer is divided by 100 (ex: "width: 76%" -> 0.76).

    Input : string containing the percentage digits
    Output : float (the percentage expressed as a fraction)
    Raises : ValueError if the string contains no digit

    NOTE: all digits are concatenated, so a decimal value such as "76.5%"
    would be read as 765 (-> 7.65).
    """
    digits = "".join(char for char in sentence if char.isdigit())
    if not digits:
        # int('') would fail anyway; raise the same exception type with a readable message
        raise ValueError(f"no digit found in sentiment string: {sentence!r}")
    return int(digits) / 100

In [3]:
def format_price(price_string):
    """Clean a price string scraped from CoinGecko and convert it to a number.

    Input : unformatted price (string), ex: "185,90 $US"
    Output : price (float)
    """
    # Drop the narrow no-break spaces (U+202F) and the " $US" suffix,
    # then turn the decimal comma into a dot so float() can parse it
    substitutions = (("\u202f", ""), (" $US", ""), (",", "."))
    cleaned = price_string
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return float(cleaned)

In [4]:
def format_pct(pct_string):
    """Convert a percentage string into a fraction rounded to 3 decimals.

    Input : percentage (string), ex: "2.9%"
    Output : fraction (float), ex: 0.029
    """
    pct_value = float(pct_string.replace('%', ''))
    return round(pct_value / 100, 3)

In [45]:
def click_sentiment_button(driver, xpath):
    """Scroll the page down and click the element at ``xpath`` once it is clickable.

    Best effort: if the scroll, the 5-second wait or the click fails, the
    error is ignored and the function returns normally.

    Input : Selenium driver, XPath of the element to click (string)
    Output : None
    """
    try:
        driver.execute_script("window.scrollTo(0,2000)")
        wait = WebDriverWait(driver, 5)
        button = wait.until(ec.element_to_be_clickable((By.XPATH, xpath)))
        button.click()
    except Exception:
        # Deliberately ignored (the click is optional). Unlike a bare
        # ``except:``, this still lets KeyboardInterrupt / SystemExit through.
        pass

In [1]:
def make_search(coin_name):
    driver = webdriver.Chrome('/Users/manuel/Documents/GitHub/chromedriver')
    driver.get(f"https://www.coingecko.com/fr/pi%C3%A8ces/{coin_name}")
    driver.implicitly_wait(10)

In [58]:
# Function that returns coin gecko's data about a given coin
# Input : full coin name (ex: bitcoin, ethereum...) (string)
# Output : Number of favorites (int), current price in $ (float), 

def retrieve_coingecko_data(coin_name):
    
    make_search(coin_name)
    
    price_string = driver.find_element_by_xpath('/html/body/div[4]/div[4]/div[1]/div/div[1]/div[4]/div/div[1]/span[1]/span').text
    price = format_price(price_string)
    
    hourly_evolution_string = driver.find_element_by_xpath('/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/span').text
    hourly_evolution = format_pct(hourly_evolution_string)
    
    daily_evolution_string = driver.find_element_by_xpath('/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[2]/span').text
    daily_evolution = format_pct(daily_evolution_string)
    
    weekly_evolution_string = driver.find_element_by_xpath('/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[3]/span').text
    weekly_evolution = format_pct(weekly_evolution_string)
    
    bimonthly_evolution_string = driver.find_element_by_xpath('/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[4]/span').text
    bimonthly_evolution = format_pct(bimonthly_evolution_string)
    
    monthly_evolution_string = driver.find_element_by_xpath('/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[5]/span').text
    monthly_evolution = format_pct(monthly_evolution_string)
    
    yearly_evolution_string = driver.find_element_by_xpath('/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[6]/span').text
    yearly_evolution = format_pct(yearly_evolution_string)
    
    market_cap_string = driver.find_element_by_xpath('/html/body/div[4]/div[4]/div[1]/div/div[2]/div[2]/div[1]/div/div/div[1]/span[2]/span').text
    market_cap = format_price(market_cap_string)
    
    daily_trading_string = driver.find_element_by_xpath('/html/body/div[4]/div[4]/div[1]/div/div[2]/div[2]/div[1]/div/div/div[2]/span[2]/span').text
    daily_trading = format_price(daily_trading_string)
    
    sentiment_xpath = "//*[@id='general']/div[1]/div[1]/div[2]/div[3]/div[1]/div[2]/a[1]"
    click_sentiment_button(driver, sentiment_xpath)
    good_sentiment = driver.find_element_by_xpath("/html/body/div[4]/div[6]/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[2]/div[3]/div[2]/div[2]/div[2]/div[1]").get_attribute("style")
    good_sentiment_n = extract_good_sentiment_pct(good_sentiment)
    
    return ({"price ($US)":price,
             "price_evolution":{"1h":hourly_evolution, 
                                "24h":daily_evolution,
                                "7d": weekly_evolution,
                                "14d": bimonthly_evolution,
                                "1m": monthly_evolution,
                                "1y": yearly_evolution},
             "market cap":market_cap,
             "daily trading volume":daily_trading,
             "good_sentiment": good_sentiment_n})
    

In [60]:
# Example run: retrieve the CoinGecko data of Solana
coin="solana"
retrieve_coingecko_data(coin)

{'price ($US)': 185.9,
 'price_evolution': {'1h': -0.007,
  '24h': 0.029,
  '7d': 0.27,
  '14d': 1.113,
  '1m': 3.447,
  '1y': 52.835},
 'market cap': 54569328405.0,
 'daily trading volume': 5906181784.0,
 'good_sentiment': 0.76}

## News textual analysis (w/ NLTK)

#### Get the last 10 news titles and descriptions related to the coin

In [11]:
def scrape_news_titles(coin_name):
    """Scrape the titles and descriptions listed on the CoinGecko news page of a coin.

    Input : coin name as used in the CoinGecko URL (string)
    Output : tuple (list of title strings, list of description strings)

    The browser opened for the scrape is closed before returning.
    """
    driver = webdriver.Chrome('/Users/manuel/Documents/GitHub/chromedriver')
    try:
        driver.get(f"https://www.coingecko.com/fr/pi%C3%A8ces/{coin_name}/news")
        driver.implicitly_wait(10)

        titles = driver.find_elements(By.CLASS_NAME, "tw-text-xl")
        # The first four matches are skipped -- presumably page elements that
        # share the CSS class without being news titles (TODO confirm)
        titles_text = [title.text for title in titles][4:]

        descriptions = driver.find_elements(By.CLASS_NAME, "post-body")
        desc_text = [desc.text for desc in descriptions]
    finally:
        # The texts are already extracted: release the browser session
        driver.quit()

    return (titles_text, desc_text)

In [12]:
# Example run: news[0] holds the titles, news[1] the descriptions
news = scrape_news_titles("ethereum")
news[0]

['Le stablecoin USDC privilégie Solana (SOL) et snobe Ethereum',
 'EY choisit Polygon (MATIC) pour mettre à l’échelle ses produits blockchain',
 '2 milliards de dollars affluent sur Arbitrum… Grâce à Nyan Cat',
 'Le géant de l’audit financier EY va s’appuyer sur Polygon pour permettre à ses clients de profiter de l’écosystème Ethereum',
 'Les NFTs inspirés de collections Ethereum flambent sur Solana',
 'Sacrilège chez OpenSea : des NFT historiques détruits… par erreur',
 'Solana est-il un véritable « Ethereum Killer » ? Comparatif et perspectives',
 'Investissements institutionnels dans la crypto : les experts pèsent le pour et le contre des implications',
 'Les oracles de Chainlink partent à l’assaut d’Optimism sur Ethereum',
 'Explosion des utilisateurs du wallet Ethereum Metamask : la barre des 10 millions franchie']

#### Tokenize a text

In [33]:
def concat_list(l):
    """Flatten a list of lists by one level, keeping the order of the items.

    Input : list of lists (any iterable of iterables is accepted)
    Output : list
    """
    return [item for sublist in l for item in sublist]


def clean_sentences(l):
    """Remove soft hyphens and replace non-breaking spaces in each string.

    Input : list of strings
    Output : clean list of strings (soft hyphens removed, non-breaking
             spaces turned into regular spaces)
    """
    return [element.replace('\xad', '').replace('\xa0', ' ') for element in l]


def tokenize(text, stop_w_extend):
    """Turn a scraped text into a clean, lowercased list of meaningful words.

    Input : scraped text as a list of strings (sentences), and a list of
            extra stop words to remove (may be empty)
    Output : list of lowercased words, in their order of appearance, without
             punctuation, French stop words or the extra stop words
    """
    tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters, so punctuation is dropped

    # Words to withdraw because meaningless; kept as a set so that each
    # membership test below is constant-time
    stop_words = set(stopwords.words("french"))
    stop_words.update(stop_w_extend)

    # Tokenize each cleaned sentence, then merge everything into one list of words
    sentences_list = [tokenizer.tokenize(sentence) for sentence in clean_sentences(text)]
    words_list = [word.lower() for word in concat_list(sentences_list)]

    return [word for word in words_list if word not in stop_words]


def most_frequent_words(text, n, stop_w_extend):
    """Return the n most frequent words of a text.

    Input : text as a list of strings, number of words to return (int),
            list of extra stop words passed to tokenize
    Output : list of (word, number of occurrences) tuples, most frequent first
    """
    frequencies = FreqDist(tokenize(text, stop_w_extend))
    return frequencies.most_common(n)


def word_sequence(text, n, keyword):
    """Return the n-grams of a text that contain a given keyword.

    Input : text as a list of words (ex: the output of tokenize), size of the
            grams (int), keyword used to spot the relevant grams (ex: the coin name)
    Output : list of the n-grams (tuples) having the keyword as one of their words
    """
    return [grams for grams in ngrams(text, n) if f"{keyword}" in grams]

## Scrape last 10 articles related to a coin (source : CoinGecko)

In [5]:
def getHTMLdocument(url, timeout=30):
    """Download the HTML document found at a given url.

    Input : url (string), timeout in seconds passed to requests (default 30)
    Output : body of the response, as text (string)

    Without a timeout, requests waits indefinitely on an unresponsive
    server; with one, a requests timeout exception is raised instead.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
  
    
def scrape_url_text(url):
    """Return the text of every <p> element of the page found at a given url.

    Input : url (string)
    Output : list of strings, one per <p> element
    """
    soup = BeautifulSoup(getHTMLdocument(url), 'html.parser')
    return [paragraph.text for paragraph in soup.find_all('p')]


def scrape_top10_article(coin_name):
    """Scrape the text of the first articles (10 at most) listed on the CoinGecko news page of a coin.

    Input : coin name as used in the CoinGecko URL (string)
    Output : text of the articles, as one flat list of strings (one string
             per <p> element of each article page)

    The browser is closed as soon as the article links are collected. If the
    page lists fewer than 10 articles, the ones found are used.
    """
    # Local import: only needed to detect the end of the article list
    from selenium.common.exceptions import NoSuchElementException

    driver = webdriver.Chrome('/Users/manuel/Documents/GitHub/chromedriver')
    try:
        driver.get(f"https://www.coingecko.com/fr/pi%C3%A8ces/{coin_name}/news")
        driver.implicitly_wait(10)

        links = []
        for position in range(1, 11):  # XPath indexes are 1-based
            try:
                elem = driver.find_element(By.XPATH, f"//*[@id='news']/article[{position}]/div/div[2]/header/h2/a")
            except NoSuchElementException:
                # No article at this position: stop and keep what was found
                break
            href = elem.get_attribute("href")
            if href:  # skip links without a target, they cannot be downloaded
                links.append(href)
    finally:
        # The article pages are fetched with requests: the browser is no longer needed
        driver.quit()

    top10_articles = [scrape_url_text(link) for link in links]

    return concat_list(top10_articles)

In [24]:
# Raw text of the articles, not cleaned (one flat list of strings, one per <p> element)
top10_test = scrape_top10_article('ethereum')

In [34]:
# Tokenize and clean the text
# stop_w : extra stop words removed on top of NLTK's French stop words
stop_w = ['a','cette', 'cookies', 'site','to','le', 'la', 'l','of']
words_top10_test = tokenize(top10_test, stop_w)

# NEXT STEPS :
# -> get all necessary additional stop words (tokenize already lowercases the words)

#### Most frequent words of a text and number of occurrences

In [36]:
# 15 most frequent words of the scraped articles, with their number of occurrences
most_frequent_words(top10_test,15, stop_w)

[('plus', 61),
 ('ethereum', 59),
 ('eth', 38),
 ('réseau', 29),
 ('2', 28),
 ('dollars', 25),
 ('cours', 22),
 ('milliards', 22),
 ('bitcoin', 22),
 ('crypto', 22),
 ('offre', 19),
 ('nft', 19),
 ('3', 18),
 ('blockchain', 17),
 ('septembre', 17)]

#### Sequences of n words in which a chosen keyword appears

In [37]:
# First five 3-grams of the cleaned words that contain 'ethereum'
word_sequence(words_top10_test,3, 'ethereum')[:5]

[('cofondateur', 'réseau', 'ethereum'),
 ('réseau', 'ethereum', 'vient'),
 ('ethereum', 'vient', 'temps'),
 ('cofondateur', 'réseau', 'ethereum'),
 ('réseau', 'ethereum', 'vient')]

In [39]:
# NEXT STEPS

# 1. tokenize
# 2. Create a df and attribute weight to meaningful words (out of punctuation and stop words)
# 3. Don't forget to uniformize to lower
# 4. Loop over sentences, compute their score and filter for less than 30 words sentences 

In [41]:
# One-column DataFrame ('word') holding the cleaned words of the articles
df_words_weigth = pd.DataFrame(words_top10_test).rename(columns={0: 'word'})

df_words_weigth.head()

Unnamed: 0,word
0,ether
1,enregistré
2,nette
3,augmentation
4,cours
