In [12]:
# Environment & imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import os
# Styling and reproducibility
sns.set(style='darkgrid')
%matplotlib inline
np.random.seed(42)

# Locate data directory robustly using os (works even if kernel cwd differs)
cwd = os.getcwd()
base = Path(cwd)
candidates = [base / 'data', base.parent / 'data', Path('data'), Path('..') / 'data']
data_dir = None
for c in candidates:
    try:
        if c.exists():
            data_dir = c
            break
    except Exception:
        continue
if data_dir is None:
    data_dir = base / 'data'
# Debug prints to help locate files when running in different kernels
print('os.getcwd():', cwd)
print('Working dir (Path):', base)
print('Using data dir:', data_dir)
print('Data dir exists:', data_dir.exists())
if data_dir.exists():
    print('Listing data dir:')
    for p in os.listdir(str(data_dir)):
        print(' -', p)
else:
    print('data directory not found at any candidate locations')

hist_path = data_dir / 'clean_historical.csv'
sent_path = data_dir / 'clean_sentiment.csv'
merged_path = data_dir / 'merged_by_date.csv'

# Load data if available (safe fallback to empty DataFrame)
hist = pd.read_csv(hist_path, parse_dates=['timestamp'], dayfirst=True, infer_datetime_format=True) if hist_path.exists() else pd.DataFrame()
sent = pd.read_csv(sent_path, parse_dates=['date'], infer_datetime_format=True) if sent_path.exists() else pd.DataFrame()
merged = pd.read_csv(merged_path, parse_dates=['date'], infer_datetime_format=True) if merged_path.exists() else pd.DataFrame()

print('hist_path.exists():', hist_path.exists())
print('sent_path.exists():', sent_path.exists())
print('merged_path.exists():', merged_path.exists())

# Quick peek
if not hist.empty:
    display(hist.head().T)
else:
    print('clean_historical.csv not found at', hist_path)

os.getcwd(): c:\Users\arman\Downloads\task\notebooks
Working dir (Path): c:\Users\arman\Downloads\task\notebooks
Using data dir: c:\Users\arman\Downloads\task\data
Data dir exists: True
Listing data dir:
 - clean_historical.csv
 - clean_sentiment.csv
 - merged_by_date.csv


  hist = pd.read_csv(hist_path, parse_dates=['timestamp'], dayfirst=True, infer_datetime_format=True) if hist_path.exists() else pd.DataFrame()


hist_path.exists(): True
sent_path.exists(): True
merged_path.exists(): True


  sent = pd.read_csv(sent_path, parse_dates=['date'], infer_datetime_format=True) if sent_path.exists() else pd.DataFrame()
  merged = pd.read_csv(merged_path, parse_dates=['date'], infer_datetime_format=True) if merged_path.exists() else pd.DataFrame()


Unnamed: 0,0,1,2,3,4
Account,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,0xae5eacaf9c6b9111fd53034a602c192a04e082ed
Coin,@107,@107,@107,@107,@107
Execution Price,7.9769,7.98,7.9855,7.9874,7.9894
Size Tokens,986.87,16.0,144.09,142.98,8.73
Size USD,7872.16,127.68,1150.63,1142.04,69.75
Side,BUY,BUY,BUY,BUY,BUY
Timestamp IST,02-12-2024 22:50,02-12-2024 22:50,02-12-2024 22:50,02-12-2024 22:50,02-12-2024 22:50
Start Position,0.0,986.524596,1002.518996,1146.558564,1289.488521
Direction,Buy,Buy,Buy,Buy,Buy
Closed PnL,0.0,0.0,0.0,0.0,0.0


In [13]:
# Quick dtype and missing-value inspection
def summary_df(df, name='df'):
    print(f"{name} summary:")
    display(df.dtypes)
    display(df.isnull().sum().sort_values(ascending=False).head(20))

# Run summaries if data loaded
if not hist.empty:
    summary_df(hist, 'historical (clean)')
else:
    print('historical (clean) dataframe is empty')

if not sent.empty:
    summary_df(sent, 'sentiment (clean)')
else:
    print('sentiment (clean) dataframe is empty')

if not merged.empty:
    summary_df(merged, 'merged daily')
else:
    print('merged daily dataframe is empty')

historical (clean) summary:


Account              object
Coin                 object
Execution Price     float64
Size Tokens         float64
Size USD            float64
Side                 object
Timestamp IST        object
Start Position      float64
Direction            object
Closed PnL          float64
Transaction Hash     object
Order ID              int64
Crossed                bool
Fee                 float64
Trade ID            float64
Timestamp           float64
timestamp            object
dtype: object

Account             0
Coin                0
Execution Price     0
Size Tokens         0
Size USD            0
Side                0
Timestamp IST       0
Start Position      0
Direction           0
Closed PnL          0
Transaction Hash    0
Order ID            0
Crossed             0
Fee                 0
Trade ID            0
Timestamp           0
timestamp           0
dtype: int64

sentiment (clean) summary:


timestamp                  int64
value                      int64
classification            object
date              datetime64[ns]
dtype: object

timestamp         0
value             0
classification    0
date              0
dtype: int64

merged daily summary:


date              datetime64[ns]
trades                     int64
total_pnl                float64
avg_pnl                  float64
win_rate                 float64
notional_usd             float64
timestamp                float64
value                    float64
classification            object
dtype: object

value             1
timestamp         1
classification    1
date              0
trades            0
win_rate          0
avg_pnl           0
total_pnl         0
notional_usd      0
dtype: int64

In [14]:
# Aggregate: daily metrics already in merged (from script). Show sentiment bucket summaries vs daily pnl
daily = merged.copy()
# If merged doesn't include 'classification' but sentiment cleaned has it, join

## Findings (quick notes)
- The notebook computes grouped summaries and visualizations linking daily trader PnL to Fear & Greed buckets.
- Saved `data/trader_metrics.csv` and `data/sentiment_summary_by_daily.csv` for downstream analysis and modeling.

Next steps: deeper cohort analysis, rolling lead/lag correlation, Granger causality, and predictive modeling pipelines.