<a href="https://colab.research.google.com/github/mittushaji25/crypto-xrp-analysis/blob/main/xrp_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 Project Step 1: Data Extraction

This step extracts historical XRPUSDT trading data from Binance using their public API. The data includes timestamped information and is saved as a raw CSV file for further cleaning and analysis.

#### ✅ Outputs:
- `xrpusdt_combined.csv`

#### 🔍 Key Tasks:
- Connect to Binance API
- Fetch monthly data from January 2023 to date
- Combine the monthly files into a single file for reproducibility


In [None]:
import requests, zipfile, io
import pandas as pd
from datetime import datetime

In [None]:
# --- Download configuration --------------------------------------------------
START_YEAR = 2023
BASE_URL = "https://data.binance.vision/data/spot/monthly/klines/XRPUSDT/1d"
EXTRACT_DIR = "xrpusdt_data"
REQUEST_TIMEOUT_S = 30  # fail fast instead of hanging the kernel on a stalled download
KLINE_COLUMNS = [
    "Open Time", "Open", "High", "Low", "Close", "Volume",
    "Close Time", "Quote Asset Volume", "Number of Trades",
    "Taker Buy Base Volume", "Taker Buy Quote Volume", "Ignore"
]

today = datetime.now()

# Derive the year range from the current date so the notebook keeps working
# after 2025 without editing this cell.
years = list(range(START_YEAR, today.year + 1))
months = [f"{i:02d}" for i in range(1, 13)]

# Collect one frame per month and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
monthly_frames = []

for year in years:
    for month in months:
        # Skip the in-progress month and anything in the future, in any year.
        # The archive for the current month returned 404 in the recorded run,
        # so it is presumably only published once the month is complete.
        if (year, int(month)) >= (today.year, today.month):
            continue

        url = f"{BASE_URL}/XRPUSDT-1d-{year}-{month}.zip"
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
            response.raise_for_status()

            # Each archive is expected to hold a single CSV; it is extracted
            # to disk so the raw monthly files remain available afterwards.
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                z.extractall(EXTRACT_DIR)
                csv_name = z.namelist()[0]

            # The extracted files are read without a header row, so the
            # column names are assigned explicitly.
            df = pd.read_csv(f"{EXTRACT_DIR}/{csv_name}", header=None)
            df.columns = KLINE_COLUMNS

            monthly_frames.append(df)
        except Exception as e:
            # Best-effort download: report the month that failed and carry on.
            print(f"❌ Failed for {year}-{month}: {e}")

# pd.concat raises on an empty list, so fall back to an empty, correctly
# labelled frame when every download failed.
all_data = (
    pd.concat(monthly_frames, ignore_index=True)
    if monthly_frames
    else pd.DataFrame(columns=KLINE_COLUMNS)
)


❌ Failed for 2025-07: 404 Client Error: Not Found for url: https://data.binance.vision/data/spot/monthly/klines/XRPUSDT/1d/XRPUSDT-1d-2025-07.zip


In [None]:
# Persist the combined raw frame as a single CSV; the positional row index is
# not written to the file.
all_data.to_csv(path_or_buf="xrpusdt_combined.csv", index=False)

# 🧪 Project Step 2: Data Exploration

Explore the raw XRPUSDT dataset to understand its structure, identify potential issues, and guide the next steps in cleaning and transformation.

#### 🔍 Key Questions:
- What columns are present, and what do they represent?
- Are there missing values or duplicates?
- Are timestamps valid and consistent?
- Are there outliers or anomalies in price or volume?
- What is the overall shape and distribution of the data?

In [None]:
# A notebook cell only renders its last expression, so the shape and column
# list are printed explicitly; otherwise they are evaluated and discarded.
print(f"Shape: {all_data.shape}")
print(f"Columns: {list(all_data.columns)}")

# Preview the first rows (rendered as a rich table).
all_data.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Volume,Taker Buy Quote Volume,Ignore
0,1672531200000,0.3389,0.34,0.3354,0.3387,102026821.0,1672617599999,34471640.0,48661,53592277.0,18107900.0,0
1,1672617600000,0.3386,0.3555,0.3,0.3482,647520487.0,1672703999999,217804000.0,285660,320823575.0,108056900.0,0
2,1672704000000,0.3482,0.352,0.3395,0.3436,346410334.0,1672790399999,119511300.0,154113,172318472.0,59448460.0,0
3,1672790400000,0.3436,0.3605,0.339,0.3472,372672796.0,1672876799999,129585300.0,147677,193027855.0,67139280.0,0
4,1672876800000,0.3471,0.3487,0.3333,0.338,265529668.0,1672963199999,90735850.0,106910,129294274.0,44187810.0,0


In [None]:
# Check data types: show the dtype pandas inferred for each column.
# Left as the bare last expression so the notebook renders the result.
all_data.dtypes

Unnamed: 0,0
Open Time,int64
Open,float64
High,float64
Low,float64
Close,float64
Volume,float64
Close Time,int64
Quote Asset Volume,float64
Number of Trades,int64
Taker Buy Base Volume,float64


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): in the recorded output the max of "Open Time"/"Close Time" is
# about 1000x larger than the min and the quartiles, which suggests the epoch
# timestamps mix two units (milliseconds and microseconds) - confirm before
# converting them to datetimes.
all_data.describe()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Volume,Taker Buy Quote Volume,Ignore
count,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0,912.0
mean,347382600000000.0,0.970641,1.003148,0.936285,0.972723,370095900.0,347399800000000.0,349257800.0,772864.2,183306200.0,172687600.0,0.0
std,695075300000000.0,0.792662,0.824639,0.757598,0.793491,294127200.0,695109800000000.0,507763200.0,1211049.0,146831100.0,252620500.0,0.0
min,1672531000000.0,0.338,0.34,0.3,0.338,40526700.0,1672618000000.0,27507210.0,38367.0,19985830.0,12614340.0,0.0
25%,1692209000000.0,0.502775,0.511175,0.488975,0.50295,188705100.0,1692295000000.0,130432900.0,155888.5,91578590.0,63676820.0,0.0
50%,1711886000000.0,0.5613,0.5766,0.5432,0.56205,303166400.0,1711973000000.0,194928500.0,287023.0,152131400.0,97593240.0,0.0
75%,1731564000000.0,0.816275,0.930425,0.77055,0.837975,442572700.0,1731650000000.0,347896100.0,805287.2,220180000.0,170456100.0,0.0
max,1751242000000000.0,3.2923,3.4,3.1758,3.2922,2543717000.0,1751328000000000.0,6398461000.0,11491880.0,1284592000.0,3232098000.0,0.0


In [None]:
# Missing values: count the null entries in each column.
all_data.isnull().sum()

Unnamed: 0,0
Open Time,0
Open,0
High,0
Low,0
Close,0
Volume,0
Close Time,0
Quote Asset Volume,0
Number of Trades,0
Taker Buy Base Volume,0


In [None]:
# Duplicates: count rows that repeat an earlier row in every column.
all_data.duplicated().sum()

np.int64(0)

In [None]:
# Timestamp conversion from Unix epoch -- Open & Close Time
#
# The raw columns mix two epoch units: most rows are in milliseconds (~1.7e12)
# but the most recent rows are in microseconds (~1.7e15). Parsing everything
# with unit='ms' pushes the microsecond rows out of the representable datetime
# range, and errors='coerce' then silently turns them into NaT. Normalise
# everything to milliseconds before converting.
#
# Millisecond epochs stay below 1e14 until roughly the year 5138, while
# microsecond epochs have exceeded 1e14 since 1973, so the threshold cleanly
# separates the two units for this dataset.
MICROSECOND_EPOCH_THRESHOLD = 10**14

for time_col in ["Open Time", "Close Time"]:
    # Make the cell safe to re-run: leave columns that are already datetimes.
    if pd.api.types.is_datetime64_any_dtype(all_data[time_col]):
        continue

    epoch = pd.to_numeric(all_data[time_col], errors="coerce")
    # Keep millisecond values as they are; scale microsecond values down.
    epoch_ms = epoch.where(epoch < MICROSECOND_EPOCH_THRESHOLD, epoch // 1000)
    all_data[time_col] = pd.to_datetime(epoch_ms, unit="ms", errors="coerce")

Unnamed: 0,Open Time
0,2023-01-01
1,2023-01-02
2,2023-01-03
3,2023-01-04
4,2023-01-05
...,...
907,NaT
908,NaT
909,NaT
910,NaT
