**Top Trending Coin on Reddit**

- `praw` – Interacts with Reddit’s API to fetch posts and comments.
- `logging` – Tracks errors, warnings, and events for debugging.
- `dotenv` – Loads API keys securely from a `.env` file.

In [41]:
%pip install praw pandas requests python-dotenv
import praw
import pandas as pd
import logging
import os
import re
import requests
from datetime import datetime
from dotenv import load_dotenv
from collections import Counter


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [42]:

# Load variables from a .env file into the process environment; the Reddit
# credentials are later read from the environment with os.getenv().
load_dotenv()

True


Fetches and filters cryptocurrencies listed on **both CoinGecko and Coinbase**:
- **APIs Used**:  
  CoinGecko (`/coins/list`) for symbols/names, Coinbase (`/products`) for trading pairs.
- **Filters**:  
  - Keeps cryptos present in **both platforms** (symbols from CoinGecko, product IDs from Coinbase).  
  - Explicitly removes **BTC** and **ETH** (and their names).  
- **Returns**:  
  - `verified_crypto_set`: Lowercase symbols (e.g., `sol`, `ada`).  
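  - `verified_crypto_names`: Lowercase CoinGecko names that exactly equal a Coinbase ticker (usually very few or none, because names are compared against Coinbase symbols rather than against names).  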

Handles API errors and logs status.  

In [43]:
# Configure root logging: timestamped messages at INFO level and above.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

def get_verified_crypto_list(timeout=10):
    """Return the crypto symbols and names present on both CoinGecko and Coinbase.

    CoinGecko's ``/coins/list`` supplies symbols and names. Coinbase Exchange's
    ``/products`` supplies product ids such as ``SOL-USD``; the part before the
    first ``-`` is taken as the symbol. No market-cap filtering is performed.
    ``btc``/``eth`` and ``bitcoin``/``ethereum`` are always removed.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(verified_crypto_set, verified_crypto_names)`` tuple of sets of
        lowercase strings. Both sets are empty when a request fails, returns a
        non-200 status, or the payload cannot be parsed; the problem is logged
        instead of raised.
    """
    coingecko_url = "https://api.coingecko.com/api/v3/coins/list"
    coinbase_url = "https://api.exchange.coinbase.com/products"

    try:
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block the caller indefinitely.
        response_gecko = requests.get(coingecko_url, timeout=timeout)
        response_coinbase = requests.get(coinbase_url, timeout=timeout)

        if response_gecko.status_code != 200 or response_coinbase.status_code != 200:
            logging.warning("Failed to fetch verified crypto list from CoinGecko or Coinbase.")
            return set(), set()

        coin_data_gecko = response_gecko.json()
        coin_data_coinbase = response_coinbase.json()

        # Skip records with a missing/empty field so that one malformed entry
        # does not throw away the whole list.
        crypto_set_gecko = {
            coin["symbol"].lower() for coin in coin_data_gecko if coin.get("symbol")
        }
        crypto_names_gecko = {
            coin["name"].lower() for coin in coin_data_gecko if coin.get("name")
        }
        crypto_set_coinbase = {
            product["id"].split('-')[0].lower()
            for product in coin_data_coinbase
            if product.get("id")
        }

        verified_crypto_set = crypto_set_gecko & crypto_set_coinbase
        # NOTE: names are intersected with Coinbase *symbols*, so only a coin
        # whose name is spelled exactly like a Coinbase ticker survives here.
        verified_crypto_names = crypto_names_gecko & crypto_set_coinbase

        verified_crypto_set -= {'btc', 'eth'}
        verified_crypto_names -= {'bitcoin', 'ethereum'}

        logging.info(f"Loaded {len(verified_crypto_set)} verified cryptocurrencies from CoinGecko and Coinbase.")
        return verified_crypto_set, verified_crypto_names
    except Exception as e:
        # Deliberately broad: callers rely on this function never raising and
        # falling back to empty sets on any network or parsing problem.
        logging.error(f"Error fetching crypto list: {e}")
    return set(), set()



**Purpose**: Scrape/find trending cryptocurrencies on Reddit using verified lists.  

**Key Features**:  
- **Auth**: Connects to Reddit via PRAW using env variables.  
- **Trend Analysis**: Identifies most-mentioned crypto (excl. BTC/ETH) in top weekly posts.  
- **Post Fetching**: Retrieves posts for a specific crypto keyword with metadata.  
- **Validation**: Uses pre-verified crypto symbols/names from CoinGecko/Coinbase.  

**Limitations**:  
- `valid_crypto_names` may not work reliably (symbol vs. name mismatch).  

In [48]:
class RedditScraper:
    """Find the most-mentioned verified coin on a subreddit and collect posts about it.

    Instance attributes (set in ``__init__``):
        reddit: PRAW client built from the REDDIT_CLIENT_ID,
            REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT environment variables.
        valid_cryptos, valid_crypto_names: the two sets returned by
            ``get_verified_crypto_list()``.
    """

    # Runs of ASCII letters, digits and hyphens; compiled once for reuse.
    _TOKEN_PATTERN = re.compile(r'\b[a-zA-Z0-9-]+\b')

    # Substrings that mark a title as off-topic. They are all lowercase
    # because they are matched against the lowercased title.
    _EXCLUSION_TERMS = (
        "president", "donates", "meet", "policy", "election", "news",
        "understanding", "administration", "donated", "campaign", "tariff",
        "congress", "senate", "tax", "announcement", "chairman", "pardon", "interview",
        "lawsuit", "motion", "white house", "says", "shilling",
    )

    def __init__(self):
        self.reddit = praw.Reddit(
            client_id=os.getenv('REDDIT_CLIENT_ID'),
            client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
            user_agent=os.getenv('REDDIT_USER_AGENT')
        )
        self.valid_cryptos, self.valid_crypto_names = get_verified_crypto_list()

    def tokenize(self, text):
        """Return the lowercase word tokens (letters, digits, hyphens) found in ``text``."""
        return self._TOKEN_PATTERN.findall(text.lower())

    def _is_candidate(self, word):
        """Return True if ``word`` is a verified symbol or name other than "trump"."""
        if word == "trump":
            return False
        return word in self.valid_cryptos or word in self.valid_crypto_names

    def get_top_trending_coin(self, subreddit='cryptocurrency', limit=100):
        """Return the verified coin mentioned most often in the week's top post titles.

        Args:
            subreddit: Name of the subreddit to scan.
            limit: Maximum number of top posts to inspect.

        Returns:
            The most frequent verified token (lowercase), or ``None`` when no
            verified token appears or the request fails (the error is logged).
        """
        try:
            posts = self.reddit.subreddit(subreddit).top(time_filter='week', limit=limit)
            keyword_count = Counter()

            for post in posts:
                # Only titles are scanned; each occurrence counts as a mention.
                keyword_count.update(
                    word for word in self.tokenize(post.title) if self._is_candidate(word)
                )

            if keyword_count:
                coin = keyword_count.most_common(1)[0][0]
                logging.info(f"Top trending verified cryptocurrency (excluding BTC & ETH): {coin}")
                return coin

            logging.warning("No trending cryptocurrency found.")
        except Exception as e:
            logging.error(f"Error fetching trending cryptocurrency: {e}")
        return None

    def get_posts(self, keyword, subreddit='cryptocurrency', limit=50):
        """Search a subreddit for ``keyword`` and return the posts that pass the title filters.

        Posts are skipped when the lowercased title contains any exclusion
        term, "trump coin" or "trumpcoin". The keyword "trump" itself never
        yields any posts.

        Args:
            keyword: Search term, normally a coin symbol or name.
            subreddit: Name of the subreddit to search.
            limit: Maximum number of search results to request.

        Returns:
            A list of dicts with the keys ``keyword``, ``title``, ``content``,
            ``score``, ``url`` and ``created_at``; an empty list on error
            (the error is logged).
        """
        try:
            results = []

            # Every post is rejected for the keyword "trump", so the search
            # request is skipped altogether in that case.
            if keyword.lower() != "trump":
                # search() has its own ``sort`` argument (default 'relevance'),
                # so newest-first ordering is requested through it rather than
                # through raw request params.
                posts = self.reddit.subreddit(subreddit).search(keyword, sort='new', limit=limit)

                for post in posts:
                    title = post.title.lower()
                    if any(term in title for term in self._EXCLUSION_TERMS):
                        continue  # Skip political / news-style titles

                    if "trump coin" in title or "trumpcoin" in title:
                        continue

                    results.append({
                        'keyword': keyword,
                        'title': post.title,
                        'content': post.selftext,
                        'score': post.score,
                        'url': post.url,
                        # Naive datetime in the machine's local timezone.
                        'created_at': datetime.fromtimestamp(post.created_utc),
                    })

            logging.info(f"Found {len(results)} posts for {keyword}")
            return results
        except Exception as e:
            logging.error(f"Error getting posts: {e}")
        return []

### `DataManager`

**Handles data cleanup/storage**:  
- **Clean**: Strips URLs/special chars, enforces lowercase.  
- **Save**: Exports cleaned Reddit posts to dated CSV (e.g., `sol_clean_2024-05-20.csv`).  
- Auto-creates folders; logs empty data warnings.  

In [49]:
class DataManager:
    """Helpers for normalising post text and writing posts to CSV."""

    @staticmethod
    def clean_text(text):
        """Strip URLs and non-alphanumerics, lowercase, and fall back to 'no_content'."""
        placeholder = 'no_content'
        if not text or not text.strip():
            return placeholder

        without_urls = re.sub(r'http\S+', '', text)
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_urls)
        normalised = alnum_only.lower().strip()
        return normalised if normalised else placeholder

    @staticmethod
    def save_data(posts, keyword):
        """Write the posts to a dated CSV under cleaned_data/, cleaning the content column."""
        if posts:
            frame = pd.DataFrame(posts)
            frame['content'] = frame['content'].map(DataManager.clean_text)

            out_dir = "cleaned_data"
            os.makedirs(out_dir, exist_ok=True)

            today = datetime.now().strftime('%Y-%m-%d')
            out_path = f"{out_dir}/{keyword}_clean_{today}.csv"
            frame.to_csv(out_path, index=False)
            logging.info(f"Saved cleaned data to {out_path}")
        else:
            logging.warning(f"No data to save for {keyword}.")

### Main Execution Flow  

In [50]:


# Entry point: find the week's top trending verified coin, fetch recent posts
# about it, and save them as a cleaned CSV.
if __name__ == "__main__":
    scraper = RedditScraper()
    trending_coin = scraper.get_top_trending_coin()

    # get_top_trending_coin() returns None when nothing was found or the
    # request failed; in that case nothing is fetched or saved.
    if trending_coin:
        posts = scraper.get_posts(trending_coin, limit=50)
        DataManager.save_data(posts, trending_coin)


2025-02-11 12:42:15,560 - Loaded 314 verified cryptocurrencies from CoinGecko and Coinbase.
2025-02-11 12:42:17,302 - Top trending verified cryptocurrency (excluding BTC & ETH): xrp
2025-02-11 12:42:18,371 - Found 41 posts for xrp
2025-02-11 12:42:18,374 - Saved cleaned data to cleaned_data/xrp_clean_2025-02-11.csv
