# EDA on Dexscreener Results
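This notebook loads `solana_dexscreener_info.json` as a starting point for exploratory data analysis (EDA), extracts the fully diluted valuation (FDV) of each mint address into `solana_fdv_info.csv`, and exports the mint addresses whose FDV is below 1 billion (or that have no usable pair data) to `solana_low_fdv_mints.csv`.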


In [1]:
import json
import pandas as pd
from pandas import json_normalize

# Load the raw Dexscreener results. The encoding is passed explicitly so the
# read does not depend on the platform's default text encoding.
with open('solana_dexscreener_info.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per record. In the preview below, dexscreener_data is still a JSON
# string and has to go through json.loads before it can be inspected.
df = pd.DataFrame(data)
print(f'Loaded {len(df)} records.')
df.head()


Loaded 5961 records.


Unnamed: 0,mint_address,usd_volume,dexscreener_data
0,So11111111111111111111111111111111111111112,834366153878.362,"{""schemaVersion"":""1.0.0"",""pairs"":[{""chainId"":""..."
1,EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v,326903656510.9317,"{""schemaVersion"":""1.0.0"",""pairs"":[{""chainId"":""..."
2,3NZ9JMVBmGAqocybic2c7LQCJScmgsAZ6vQqTDzcqmJh,222885529101.90427,"{""schemaVersion"":""1.0.0"",""pairs"":[{""chainId"":""..."
3,Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB,55122557274.5685,"{""schemaVersion"":""1.0.0"",""pairs"":[{""chainId"":""..."
4,DL8CcQBJT95hSyzJZE6x3PyafzjH2Xu8E1BdN7eWzwqF,48807687744.86127,"{""schemaVersion"":""1.0.0"",""pairs"":null}"


In [5]:
# Parse the dexscreener_data JSON string of a single row as an example.
# Index label 1 is the second record (USDC in the preview above), not the first.
sample_json = df.loc[1, 'dexscreener_data']
try:
    parsed = json.loads(sample_json)
    print("Successfully parsed JSON")
except (json.JSONDecodeError, TypeError) as e:
    # JSONDecodeError: the text is not valid JSON.
    # TypeError: the cell value is not a string at all (e.g. None).
    print('Error parsing JSON:', e)
    parsed = {}
parsed


Successfully parsed JSON


{'schemaVersion': '1.0.0',
 'pairs': [{'chainId': 'solana',
   'dexId': 'orca',
   'url': 'https://dexscreener.com/solana/4fuuiyxtq6qcrdsq9oubyctm7bqswytsyluegzlty4t4',
   'pairAddress': '4fuUiYxTQ6QCrdSq9ouBYcTM7bqSwYTSyLueGZLTy4T4',
   'labels': ['wp'],
   'baseToken': {'address': 'EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v',
    'name': 'USD Coin',
    'symbol': 'USDC'},
   'quoteToken': {'address': 'Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB',
    'name': 'USDT',
    'symbol': 'USDT'},
   'priceNative': '0.9995',
   'priceUsd': '0.9995',
   'txns': {'m5': {'buys': 0, 'sells': 2},
    'h1': {'buys': 36, 'sells': 31},
    'h6': {'buys': 593, 'sells': 154},
    'h24': {'buys': 1332, 'sells': 425}},
   'volume': {'h24': 641456.21, 'h6': 303432.67, 'h1': 16898.21, 'm5': 24.02},
   'priceChange': {'h1': -0.02, 'h6': -0.02, 'h24': -0.01},
   'liquidity': {'usd': 2054254.9, 'base': 1051608, 'quote': 1003163},
   'fdv': 9338306228,
   'marketCap': 61027113944,
   'pairCreatedAt': 166960

In [None]:
# Extract mint addresses with FDV < 1 billion or no FDV data
# NOTE(review): this cell has no execution count and reads fdv_df, which in the
# cells visible here is only assigned further down (after extract_fdv_info is
# defined). Run in notebook order on a fresh kernel it would fail with NameError.
# NOTE(review): apart from the isna() clause in the filter below, this repeats
# the executed In [8] cell and writes the same CSV file — confirm which variant
# is intended and remove the other.

# Filter for tokens with FDV < 1 billion or NaN (no FDV data)
low_fdv_threshold = 1e9  # 1 billion
low_fdv_tokens = fdv_df[(fdv_df['fdv'] < low_fdv_threshold) | (fdv_df['fdv'].isna())]

# Mint addresses present in the raw frame df but absent from fdv_df
processed_mints = set(fdv_df['mint_address'])
missing_mints = [addr for addr in df['mint_address'] if addr not in processed_mints]

# Raw rows (address and volume only) for the mints absent from fdv_df
missing_df = df[df['mint_address'].isin(missing_mints)][['mint_address', 'usd_volume']]

print(f"Found {len(low_fdv_tokens)} tokens with FDV < {low_fdv_threshold} or no FDV data")
print(f"Found {len(missing_mints)} tokens not processed in fdv_df")
print(f"Total: {len(low_fdv_tokens) + len(missing_mints)} tokens")

# Combine both sets of tokens (low FDV and missing) into a single-column frame
combined_low_fdv = pd.DataFrame({'mint_address': low_fdv_tokens['mint_address'].tolist() + missing_df['mint_address'].tolist()})

# Save to CSV (header row, no index column)
combined_low_fdv.to_csv('solana_low_fdv_mints.csv', index=False)
print(f"Saved {len(combined_low_fdv)} mint addresses to solana_low_fdv_mints.csv")

# Show first 20 mint addresses
combined_low_fdv.head(20)


In [None]:
# Save to CSV and analyze results
# NOTE(review): low_fdv_df is not assigned in any cell visible in this notebook,
# and the frame returned by extract_fdv_info has no 'fdv_status' column, so this
# unexecuted cell presumably belongs to an earlier draft — confirm or remove.
# The executed In [10] cell saves combined_low_fdv to the same file instead.
low_fdv_df.to_csv('solana_low_fdv_mints.csv', index=False)
print(f"Results saved to solana_low_fdv_mints.csv")

# Show distribution by status (row count per distinct fdv_status value)
status_counts = low_fdv_df['fdv_status'].value_counts()
print("\nDistribution by status:")
print(status_counts)

# Show mint addresses only (for easy copying)
print("\nFirst 20 mint addresses:")
print(low_fdv_df['mint_address'].head(20).to_list())


In [6]:
import json
import pandas as pd

def _pick_best_pair(pairs, mint_address):
    """Return the highest-liquidity pair whose baseToken is ``mint_address``, or None."""
    # `or {}` covers both a missing key and an explicit JSON null, so a single
    # incomplete pair no longer discards every other pair of the same mint.
    candidates = [
        pair for pair in pairs
        if (pair.get('baseToken') or {}).get('address') == mint_address
    ]
    if not candidates:
        return None
    # Highest USD liquidity is used as the proxy for the most reliable quote;
    # pairs without a liquidity figure rank as 0.
    return max(candidates, key=lambda p: (p.get('liquidity') or {}).get('usd') or 0)


def extract_fdv_info(json_file_path):
    """
    Extract FDV (fully diluted valuation) information from dexscreener data.

    For every record, only pairs whose baseToken address equals the record's
    mint_address are considered; among those, the pair with the highest USD
    liquidity supplies the name, symbol and FDV. Records with no pairs, with no
    matching pair, or whose payload cannot be processed are left out (the
    latter are reported on stdout).

    Args:
        json_file_path: Path to the solana_dexscreener_info.json file. The file
            holds a list of records, each with a 'mint_address' and a
            'dexscreener_data' JSON string.

    Returns:
        DataFrame with the columns mint_address, name, symbol and fdv, one row
        per mint address (first occurrence wins). fdv is numeric; values that
        cannot be converted become NaN. The columns are present even when no
        record qualifies.
    """
    output_columns = ['mint_address', 'name', 'symbol', 'fdv']

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    extracted_info = []

    for entry in data:
        mint_address = entry['mint_address']

        try:
            dex_data = json.loads(entry['dexscreener_data'])

            # Nothing to do when the payload has no (or an empty/null) pair list
            if not dex_data or 'pairs' not in dex_data or not dex_data['pairs']:
                continue

            best_pair = _pick_best_pair(dex_data['pairs'], mint_address)
            if best_pair is None:
                continue

            base_token = best_pair.get('baseToken') or {}
            # Other pair fields (priceUsd, marketCap, liquidity.usd, volume.h24,
            # priceChange.h24, pairAddress, dexId, url) are currently not extracted.
            extracted_info.append({
                'mint_address': mint_address,
                'name': base_token.get('name'),
                'symbol': base_token.get('symbol'),
                'fdv': best_pair.get('fdv'),
            })

        except Exception as e:
            # Best effort: report the record that failed and carry on with the rest.
            print(f"Error processing {mint_address}: {e}")

    # Passing the columns explicitly keeps the schema stable when nothing was
    # extracted; otherwise drop_duplicates would fail on the missing column.
    result = pd.DataFrame(extracted_info, columns=output_columns)
    result = result.drop_duplicates(subset=['mint_address'])

    # FDV can arrive as a number, a numeric string or null
    result['fdv'] = pd.to_numeric(result['fdv'], errors='coerce')

    return result


# Build the per-mint FDV table from the raw JSON file and keep a CSV copy of it
# (header row, no index column).
fdv_df = extract_fdv_info('solana_dexscreener_info.json')
fdv_df.to_csv('solana_fdv_info.csv', index=False)



In [7]:
# Display the extracted FDV table; the rendered output below elides the middle rows.
fdv_df

Unnamed: 0,mint_address,name,symbol,fdv
0,So11111111111111111111111111111111111111112,Wrapped SOL,SOL,
1,EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v,USD Coin,USDC,8.612518e+09
2,3NZ9JMVBmGAqocybic2c7LQCJScmgsAZ6vQqTDzcqmJh,Wrapped BTC (Wormhole),WBTC,4.057594e+08
3,Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB,USDT,USDT,2.040750e+09
4,6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN,OFFICIAL TRUMP,TRUMP,9.931412e+09
...,...,...,...,...
2768,4XQvdipJBdrb5hUgUrbZPPFmp6BCav41n55dc7KDYW3m,ENGLAND,ENG,4.537900e+04
2769,6AKyiCnqtYw67mMDJsFxBQgHzqUgPUXiQaXNkRz2pump,Father of Crypto,Satoshi,3.364800e+04
2770,7NfRbZBV9U5jfoqqGjDyJrJqsfiPCAJDL3Yf2xMsN1sg,FARTBABY,FARTBABY,1.806800e+04
2771,DNyxufTEwqNCgGruJ4W2ue6u1RTFcH61H4sa42LGpump,bullish,bullish,2.333100e+04


In [8]:
# Extract mint addresses whose FDV is below 1 billion, plus every mint address
# of the raw frame df that has no row in fdv_df at all.
# Relies on fdv_df (result of extract_fdv_info) and df (raw records).

# Rows of fdv_df whose FDV is NaN fail the `<` comparison, so they are NOT
# selected here; only mints entirely absent from fdv_df are added below.
low_fdv_threshold = 1e9  # 1 billion
low_fdv_tokens = fdv_df[fdv_df['fdv'] < low_fdv_threshold]

# Raw rows (address and volume only) for mints that never made it into fdv_df,
# selected with a single membership mask instead of a list pass plus isin().
processed_mints = set(fdv_df['mint_address'])
missing_df = df.loc[~df['mint_address'].isin(processed_mints), ['mint_address', 'usd_volume']]
missing_mints = missing_df['mint_address'].tolist()

print(f"Found {len(low_fdv_tokens)} tokens with FDV < {low_fdv_threshold}")
print(f"Found {len(missing_mints)} tokens not processed in fdv_df")
print(f"Total: {len(low_fdv_tokens) + len(missing_mints)} tokens")

# Combine both sets of tokens (low FDV first, then missing) into one column
combined_low_fdv = pd.DataFrame({'mint_address': low_fdv_tokens['mint_address'].tolist() + missing_mints})

# Save to CSV (header row, no index column)
combined_low_fdv.to_csv('solana_low_fdv_mints.csv', index=False)
print(f"Saved {len(combined_low_fdv)} mint addresses to solana_low_fdv_mints.csv")

# Show first 20 mint addresses
combined_low_fdv.head(20)


Found 2748 tokens with FDV < 1000000000.0 or no FDV data
Found 3188 tokens not processed in fdv_df
Total: 5936 tokens
Saved 5936 mint addresses to solana_low_fdv_mints.csv


Unnamed: 0,mint_address
0,3NZ9JMVBmGAqocybic2c7LQCJScmgsAZ6vQqTDzcqmJh
1,EKpQGSJtjMFqKZ9KQanSqYXRcF8fBopzLHYxdM65zcjm
2,7GCihgDB8fe6KNjn2MYtkzZcRjQy3t9GHdC8uHYmW2hr
3,mSoLzYCxHdYgdzU16g5QSh3i5K3z3KZK7ytfqcJm7So
4,v62Jv9pwMTREWV9f6TetZfMafV254vo99p7HSF25BPr
5,7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs
6,FUAfBo2jgks6gB4Z4LfZkqSZgzNucisEHqnNebaRxM1P
7,HeLp6NuQkmYB4pYWo2zYs22mESHXPQYzXbB8n4V98jwC
8,7BgBvyjrZX1YKz4oh9mjb8ZScatkkwb8DzFx7LoiVkM3
9,cbbtcf3aa214zXHbiAZQwf4122FBYbraNdFqgw4iMij


In [10]:
# Write the combined mint list to disk once more; this is the same file name
# the In [8] cell already wrote. No further analysis happens in this cell.
combined_low_fdv.to_csv(path_or_buf='solana_low_fdv_mints.csv', index=False)
print("Results saved to solana_low_fdv_mints.csv")


Results saved to solana_low_fdv_mints.csv
