<a href="https://colab.research.google.com/github/manasasuryasubrahmanayeswari/ds_manasa_nalla/blob/master/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q gdown pandas numpy matplotlib plotly seaborn statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm
from scipy import stats
from datetime import datetime

In [None]:
# =========================================================
# STEP 1: Load datasets from Google Drive
# =========================================================
import gdown

# Replace file IDs with your actual IDs from the Google Drive links
hist_file_id = "1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs"  # Historical trader data
fg_file_id   = "1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf"  # Fear-Greed Index

# Download directly into Colab working directory
!gdown --id {hist_file_id} -O historical_data.csv
!gdown --id {fg_file_id}   -O fear_greed_index.csv

Downloading...
From: https://drive.google.com/uc?id=1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs
To: /content/historical_data.csv
100% 47.5M/47.5M [00:01<00:00, 39.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf
To: /content/fear_greed_index.csv
100% 90.8k/90.8k [00:00<00:00, 3.20MB/s]


In [None]:
# STEP 2: Load and preview the data
# =========================================================
hist = pd.read_csv('historical_data.csv')
fg   = pd.read_csv('fear_greed_index.csv')

print("Historical Trader Data:")
display(hist.head())
print("\nFear-Greed Index:")
display(fg.head())

Historical Trader Data:


Unnamed: 0,Account,Coin,Execution Price,Size Tokens,Size USD,Side,Timestamp IST,Start Position,Direction,Closed PnL,Transaction Hash,Order ID,Crossed,Fee,Trade ID,Timestamp
0,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9769,986.87,7872.16,BUY,02-12-2024 22:50,0.0,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.345404,895000000000000.0,1730000000000.0
1,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.98,16.0,127.68,BUY,02-12-2024 22:50,986.524596,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.0056,443000000000000.0,1730000000000.0
2,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9855,144.09,1150.63,BUY,02-12-2024 22:50,1002.518996,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.050431,660000000000000.0,1730000000000.0
3,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9874,142.98,1142.04,BUY,02-12-2024 22:50,1146.558564,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.050043,1080000000000000.0,1730000000000.0
4,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9894,8.73,69.75,BUY,02-12-2024 22:50,1289.488521,Buy,0.0,0xec09451986a1874e3a980418412fcd0201f500c95bac...,52017706630,True,0.003055,1050000000000000.0,1730000000000.0



Fear-Greed Index:


Unnamed: 0,timestamp,value,classification,date
0,1517463000,30,Fear,2018-02-01
1,1517549400,15,Extreme Fear,2018-02-02
2,1517635800,40,Fear,2018-02-03
3,1517722200,24,Extreme Fear,2018-02-04
4,1517808600,11,Extreme Fear,2018-02-05


In [None]:
# --- Standardize column names
hist = hist.rename(columns=lambda c: c.strip().lower().replace(' ', '_'))
fg   = fg.rename(columns=lambda c: c.strip().lower().replace(' ', '_'))

# --- Handle your actual column names
# Rename timestamp → time for consistency
if 'timestamp' in hist.columns:
    hist = hist.rename(columns={'timestamp': 'time'})

# Align closed_pnl naming
if 'closed_pnl' in hist.columns:
    hist = hist.rename(columns={'closed_pnl': 'closedpnl'})

# Prefer size_usd if present
if 'size_usd' in hist.columns:
    hist['size'] = hist['size_usd']
elif 'size_tokens' in hist.columns:
    hist['size'] = hist['size_tokens']

# Display current columns after renaming
print("Cleaned historical columns:", sorted(hist.columns.tolist()))
print("Fear/Greed columns:", sorted(fg.columns.tolist()))

# --- Parse timestamps / dates
hist['time'] = pd.to_datetime(hist['time'], utc=True, errors='coerce')
hist['date'] = hist['time'].dt.date

# Parse fear/greed dates
fg['date'] = pd.to_datetime(fg['date'], errors='coerce').dt.date
fg['classification'] = fg['classification'].astype(str).str.strip().str.title()

# Basic cleaning
before = len(hist)
hist = hist.dropna(subset=['time']).drop_duplicates()
print(f"Dropped {before - len(hist)} rows with invalid time/duplicates.")

# Force numeric for key numeric columns
for c in ['execution_price','size','closedpnl','start_position']:
    if c in hist.columns:
        hist[c] = pd.to_numeric(hist[c], errors='coerce')

if 'side' in hist.columns:
    hist['side'] = hist['side'].astype(str).str.strip().str.lower()
    hist['is_long']  = hist['side'].isin(['buy','long']).astype(int)
    hist['is_short'] = hist['side'].isin(['sell','short']).astype(int)
else:
    hist['is_long'] = np.nan
    hist['is_short'] = np.nan

# --- Create notional & ROI proxies
if {'size','execution_price'}.issubset(hist.columns):
    hist['notional'] = (hist['size'].abs() * hist['execution_price']).replace(0, np.nan)
else:
    hist['notional'] = np.nan

if {'closedpnl','notional'}.issubset(hist.columns):
    hist['roi'] = hist['closedpnl'] / hist['notional']
else:
    hist['roi'] = np.nan

# --- Preview
display(hist.head())
display(fg.head())


Cleaned historical columns: ['account', 'closedpnl', 'coin', 'crossed', 'direction', 'execution_price', 'fee', 'order_id', 'side', 'size', 'size_tokens', 'size_usd', 'start_position', 'time', 'timestamp_ist', 'trade_id', 'transaction_hash']
Fear/Greed columns: ['classification', 'date', 'timestamp', 'value']
Dropped 0 rows with invalid time/duplicates.


Unnamed: 0,account,coin,execution_price,size_tokens,size_usd,side,timestamp_ist,start_position,direction,closedpnl,...,crossed,fee,trade_id,time,size,date,is_long,is_short,notional,roi
0,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9769,986.87,7872.16,buy,02-12-2024 22:50,0.0,Buy,0.0,...,True,0.345404,895000000000000.0,1970-01-01 00:28:50+00:00,7872.16,1970-01-01,1,0,62795.433104,0.0
1,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.98,16.0,127.68,buy,02-12-2024 22:50,986.524596,Buy,0.0,...,True,0.0056,443000000000000.0,1970-01-01 00:28:50+00:00,127.68,1970-01-01,1,0,1018.8864,0.0
2,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9855,144.09,1150.63,buy,02-12-2024 22:50,1002.518996,Buy,0.0,...,True,0.050431,660000000000000.0,1970-01-01 00:28:50+00:00,1150.63,1970-01-01,1,0,9188.355865,0.0
3,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9874,142.98,1142.04,buy,02-12-2024 22:50,1146.558564,Buy,0.0,...,True,0.050043,1080000000000000.0,1970-01-01 00:28:50+00:00,1142.04,1970-01-01,1,0,9121.930296,0.0
4,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,7.9894,8.73,69.75,buy,02-12-2024 22:50,1289.488521,Buy,0.0,...,True,0.003055,1050000000000000.0,1970-01-01 00:28:50+00:00,69.75,1970-01-01,1,0,557.26065,0.0


Unnamed: 0,timestamp,value,classification,date
0,1517463000,30,Fear,2018-02-01
1,1517549400,15,Extreme Fear,2018-02-02
2,1517635800,40,Fear,2018-02-03
3,1517722200,24,Extreme Fear,2018-02-04
4,1517808600,11,Extreme Fear,2018-02-05


In [None]:
# Keep only needed columns from F&G
fg_small = fg[['date','classification']].drop_duplicates()

# Merge each trade to the day’s sentiment
df = hist.merge(fg_small, on='date', how='left')

# Quick diagnostics: how many trades got a sentiment?
matched = df['classification'].notna().mean()
total_trades = len(df)
missing = df['classification'].isna().sum()
print(f"Trades matched to sentiment: {matched:.2%} ({total_trades - missing}/{total_trades})")
print(df['classification'].value_counts(dropna=False))

# Optional: encode sentiment as numeric
sent_map = {'Fear': 0, 'Greed': 1}
df['sentiment_num'] = df['classification'].map(sent_map)

# Keep a clean analysis frame
analysis_cols = ['date','time','account','coin','side','execution_price',
                 'size','closedpnl','start_position','fee','notional','roi',
                 'classification','sentiment_num']
df_analysis = df[[c for c in analysis_cols if c in df.columns]].copy()

display(df_analysis.head())


Trades matched to sentiment: 0.00% (0/211224)
classification
NaN    211224
Name: count, dtype: int64


Unnamed: 0,date,time,account,coin,side,execution_price,size,closedpnl,start_position,fee,notional,roi,classification,sentiment_num
0,1970-01-01,1970-01-01 00:28:50+00:00,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,buy,7.9769,7872.16,0.0,0.0,0.345404,62795.433104,0.0,,
1,1970-01-01,1970-01-01 00:28:50+00:00,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,buy,7.98,127.68,0.0,986.524596,0.0056,1018.8864,0.0,,
2,1970-01-01,1970-01-01 00:28:50+00:00,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,buy,7.9855,1150.63,0.0,1002.518996,0.050431,9188.355865,0.0,,
3,1970-01-01,1970-01-01 00:28:50+00:00,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,buy,7.9874,1142.04,0.0,1146.558564,0.050043,9121.930296,0.0,,
4,1970-01-01,1970-01-01 00:28:50+00:00,0xae5eacaf9c6b9111fd53034a602c192a04e082ed,@107,buy,7.9894,69.75,0.0,1289.488521,0.003055,557.26065,0.0,,


In [None]:
import os
os.makedirs("csv_files", exist_ok=True)

# --- Per-trade summary by sentiment
trade_summary = (df.groupby('classification', dropna=False)
                   .agg(
                       trades=('account','count'),
                       pnl_sum=('closedpnl','sum'),
                       pnl_mean=('closedpnl','mean'),
                       pnl_median=('closedpnl','median'),
                       win_rate=('closedpnl', lambda s: (s > 0).mean()),
                       avg_notional=('notional','mean'),
                       med_notional=('notional','median'),
                   ).reset_index())
display(trade_summary)
trade_summary.to_csv("csv_files/trade_summary_by_sentiment.csv", index=False)

# --- Per-account-per-day (robust to hyperactive accounts)
group_cols = ['date','classification','account'] if 'account' in df.columns else ['date','classification']
acct_day = (df.groupby(group_cols)
              .agg(
                  trades=('classification','count'),
                  pnl=('closedpnl','sum'),
                  winrate=('closedpnl', lambda s: (s>0).mean()),
                  long_share=('side', lambda s: pd.Series(s).astype(str).str.lower().isin(['buy','long']).mean()
                              if s.notna().any() else np.nan),
                  notional_sum=('notional','sum'),
                  roi_mean=('roi','mean'),
              ).reset_index())

# If leverage exists later, you can add its mean/median here; not in your columns now.
display(acct_day.head(10))
acct_day.to_csv("csv_files/account_day_metrics.csv", index=False)

# --- Market-level daily aggregation
market_day = (acct_day.groupby(['date','classification'])
              .agg(
                  accounts=('account','nunique') if 'account' in acct_day.columns else ('trades','count'),
                  trades=('trades','sum'),
                  pnl_sum=('pnl','sum'),
                  pnl_mean_per_acct=('pnl','mean'),
                  winrate_mean=('winrate','mean'),
                  long_bias=('long_share','mean'),
                  notional_sum=('notional_sum','sum'),
              ).reset_index())
display(market_day.head(10))
market_day.to_csv("csv_files/market_day_metrics.csv", index=False)


Unnamed: 0,classification,trades,pnl_sum,pnl_mean,pnl_median,win_rate,avg_notional,med_notional
0,,211224,10296960.0,48.749001,0.0,0.411265,268910800.0,10458.44338


Unnamed: 0,date,classification,account,trades,pnl,winrate,long_share,notional_sum,roi_mean


Unnamed: 0,date,classification,accounts,trades,pnl_sum,pnl_mean_per_acct,winrate_mean,long_bias,notional_sum


In [None]:
!pip install -U plotly==6.1.1 kaleido==0.2.1
import plotly.express as px
import os
os.makedirs("outputs", exist_ok=True)




In [None]:
!pip -q install kaleido
import plotly.express as px, os

os.makedirs("outputs", exist_ok=True)

# 6.1 PnL by sentiment (per-account-day)
fig = px.box(acct_day, x='classification', y='pnl', points='outliers',
             title='PnL by Sentiment (Per-Account-Day)')
fig.write_image("outputs/box_pnl_by_sentiment.png")

# 6.2 Average win rate by sentiment
winrate_plot = acct_day.groupby('classification', dropna=False)['winrate'].mean().reset_index()
fig = px.bar(winrate_plot, x='classification', y='winrate', text='winrate',
             title='Average Win Rate by Sentiment')
fig.update_traces(texttemplate='%{y:.2%}', textposition='outside')
fig.write_image("outputs/bar_winrate_by_sentiment.png")

# 6.3 Total market PnL over time colored by sentiment
fig = px.line(market_day, x='date', y='pnl_sum', color='classification',
              title='Total Market PnL by Day & Sentiment')
fig.write_image("outputs/line_market_pnl_by_day.png")

# 6.4 Long-bias by sentiment
bias = acct_day.groupby('classification', dropna=False)['long_share'].mean().reset_index()
fig = px.bar(bias, x='classification', y='long_share', text='long_share',
             title='Long Bias by Sentiment (Per-Account-Day)')
fig.update_traces(texttemplate='%{y:.1%}', textposition='outside')
fig.write_image("outputs/bar_long_bias_by_sentiment.png")

# 6.5 Notional per-account-day by sentiment
fig = px.box(acct_day, x='classification', y='notional_sum', points='outliers',
             title='Trade Notional per Account-Day by Sentiment')
fig.write_image("outputs/box_notional_by_sentiment.png")

print("✅ Saved figures to /outputs")


NameError: name 'acct_day' is not defined

In [None]:
from scipy import stats
import numpy as np
import pandas as pd

def mannwhitney_print(series_fear, series_greed, label):
    x = series_fear.dropna()
    y = series_greed.dropna()
    if len(x) < 10 or len(y) < 10:
        print(f"[{label}] Not enough data for a robust test (Fear n={len(x)}, Greed n={len(y)}).")
        return
    # Mann–Whitney U (robust to non-normal)
    u_stat, p_val = stats.mannwhitneyu(x, y, alternative='two-sided')
    # Simple Cliff’s delta (sample if large to keep fast)
    def cliffs_delta(a, b, max_n=4000, seed=42):
        rng = np.random.default_rng(seed)
        if len(a) > max_n: a = pd.Series(a).sample(max_n, random_state=seed).values
        if len(b) > max_n: b = pd.Series(b).sample(max_n, random_state=seed).values
        more = 0; less = 0
        for ai in a:
            more += np.sum(ai > b)
            less += np.sum(ai < b)
        return (more - less) / (len(a) * len(b))
    delta = cliffs_delta(x.values, y.values)

    print(f"[{label}] Mann–Whitney U p-value: {p_val:.4g} | Cliff’s δ: {delta:.3f} (range -1..1)")

fear_pnl  = acct_day.loc[acct_day['classification']=='Fear',  'pnl']
greed_pnl = acct_day.loc[acct_day['classification']=='Greed', 'pnl']
mannwhitney_print(fear_pnl, greed_pnl, "PnL per-account-day")

fear_wr  = acct_day.loc[acct_day['classification']=='Fear',  'winrate']
greed_wr = acct_day.loc[acct_day['classification']=='Greed', 'winrate']
mannwhitney_print(fear_wr, greed_wr, "Win rate per-account-day")

fear_not  = acct_day.loc[acct_day['classification']=='Fear',  'notional_sum']
greed_not = acct_day.loc[acct_day['classification']=='Greed', 'notional_sum']
mannwhitney_print(fear_not, greed_not, "Notional per-account-day")


[PnL per-account-day] Not enough data for a robust test (Fear n=0, Greed n=0).
[Win rate per-account-day] Not enough data for a robust test (Fear n=0, Greed n=0).
[Notional per-account-day] Not enough data for a robust test (Fear n=0, Greed n=0).


In [None]:
import pandas as pd

# Reload from your saved CSVs instead of recomputing all metrics:
acct_day = pd.read_csv("csv_files/account_day_metrics.csv")
market_day = pd.read_csv("csv_files/market_day_metrics.csv")

# Optional check
print("acct_day:", acct_day.shape)
print("market_day:", market_day.shape)
acct_day.head()


acct_day: (0, 9)
market_day: (0, 9)


Unnamed: 0,date,classification,account,trades,pnl,winrate,long_share,notional_sum,roi_mean


In [None]:
!pip install -U plotly==6.1.1 kaleido==0.2.1 -q

import plotly.express as px
import os

os.makedirs("outputs", exist_ok=True)

# 6.1 PnL by sentiment (per-account-day)
fig = px.box(acct_day, x='classification', y='pnl', points='outliers',
             title='PnL by Sentiment (Per-Account-Day)')
fig.write_image("outputs/box_pnl_by_sentiment.png")

# 6.2 Average win rate by sentiment
winrate_plot = acct_day.groupby('classification', dropna=False)['winrate'].mean().reset_index()
fig = px.bar(winrate_plot, x='classification', y='winrate', text='winrate',
             title='Average Win Rate by Sentiment')
fig.update_traces(texttemplate='%{y:.2%}', textposition='outside')
fig.write_image("outputs/bar_winrate_by_sentiment.png")

# 6.3 Total market PnL over time colored by sentiment
fig = px.line(market_day, x='date', y='pnl_sum', color='classification',
              title='Total Market PnL by Day & Sentiment')
fig.write_image("outputs/line_market_pnl_by_day.png")

# 6.4 Long-bias by sentiment
bias = acct_day.groupby('classification', dropna=False)['long_share'].mean().reset_index()
fig = px.bar(bias, x='classification', y='long_share', text='long_share',
             title='Long Bias by Sentiment (Per-Account-Day)')
fig.update_traces(texttemplate='%{y:.1%}', textposition='outside')
fig.write_image("outputs/bar_long_bias_by_sentiment.png")

# 6.5 Notional per-account-day by sentiment
fig = px.box(acct_day, x='classification', y='notional_sum', points='outliers',
             title='Trade Notional per Account-Day by Sentiment')
fig.write_image("outputs/box_notional_by_sentiment.png")

print("✅ Saved figures to /outputs")


✅ Saved figures to /outputs


In [None]:
# ==== RECOVERY CELL: Rebuild merged dataset and per-trade analysis ====
import os
import pandas as pd
import numpy as np

# Ensure output folder
os.makedirs("csv_files", exist_ok=True)

# If source CSVs are missing, try to fetch via gdown using your IDs
hist_csv = "historical_data.csv"
fg_csv   = "fear_greed_index.csv"

if not (os.path.exists(hist_csv) and os.path.exists(fg_csv)):
    try:
        import gdown  # should already be installed
        # Your file IDs from earlier steps
        hist_file_id = "1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs"
        fg_file_id   = "1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf"
        if not os.path.exists(hist_csv):
            !gdown --id {hist_file_id} -O {hist_csv}
        if not os.path.exists(fg_csv):
            !gdown --id {fg_file_id} -O {fg_csv}
    except Exception as e:
        raise FileNotFoundError(
            "Source CSVs not found and auto-download failed. "
            "Please ensure historical_data.csv and fear_greed_index.csv are present."
        ) from e

# Load
hist = pd.read_csv(hist_csv)
fg   = pd.read_csv(fg_csv)

# --- Standardize column names
hist = hist.rename(columns=lambda c: c.strip().lower().replace(' ', '_'))
fg   = fg.rename(columns=lambda c: c.strip().lower().replace(' ', '_'))

# --- Harmonize common fields
if 'timestamp' in hist.columns:
    hist = hist.rename(columns={'timestamp': 'time'})
if 'closed_pnl' in hist.columns:
    hist = hist.rename(columns={'closed_pnl': 'closedpnl'})

# Prefer a consistent "size"
if 'size_usd' in hist.columns:
    hist['size'] = hist['size_usd']
elif 'size_tokens' in hist.columns and 'execution_price' in hist.columns:
    # Convert tokens -> notional proxy if price exists
    hist['size'] = hist['size_tokens'] * hist['execution_price']

# --- Parse times / dates
hist['time'] = pd.to_datetime(hist['time'], utc=True, errors='coerce')
hist['date'] = hist['time'].dt.date
fg['date']   = pd.to_datetime(fg['date'], errors='coerce').dt.date

# Normalize sentiment labels
if 'classification' in fg.columns:
    fg['classification'] = fg['classification'].astype(str).str.strip().str.title()
else:
    raise ValueError("Fear-Greed CSV must contain a 'classification' column (Fear/Greed).")

# Drop bad times / dups
before = len(hist)
hist = hist.dropna(subset=['time']).drop_duplicates()
# print(f"Dropped {before - len(hist)} trades during cleaning.")

# Force numeric on key fields if present
for c in ['execution_price','size','closedpnl','start_position','fee','leverage']:
    if c in hist.columns:
        hist[c] = pd.to_numeric(hist[c], errors='coerce')

# Normalize side + flags
if 'side' in hist.columns:
    hist['side'] = hist['side'].astype(str).str.strip().str.lower()
    hist['is_long']  = hist['side'].isin(['buy','long']).astype(int)
    hist['is_short'] = hist['side'].isin(['sell','short']).astype(int)
else:
    hist['is_long'] = np.nan
    hist['is_short'] = np.nan

# Notional and ROI proxies
if {'size','execution_price'}.issubset(hist.columns):
    hist['notional'] = (hist['size'].abs() * hist['execution_price']).replace(0, np.nan)
else:
    hist['notional'] = np.nan

if {'closedpnl','notional'}.issubset(hist.columns):
    hist['roi'] = hist['closedpnl'] / hist['notional']
else:
    hist['roi'] = np.nan

# Merge with daily sentiment
fg_small = fg[['date','classification']].drop_duplicates()
df = hist.merge(fg_small, on='date', how='left')

# Numeric encoding for sentiment (optional)
sent_map = {'Fear': 0, 'Greed': 1}
df['sentiment_num'] = df['classification'].map(sent_map)

# Save merged dataset
merged_path = "csv_files/merged_trades_with_sentiment.csv"
df.to_csv(merged_path, index=False)

# Build per-trade analysis subset
keep_cols = [c for c in [
    'date','time','account','coin','symbol','side','execution_price',
    'size','closedpnl','start_position','fee','notional','roi',
    'classification','sentiment_num','leverage'
] if c in df.columns]
df_analysis = df[keep_cols].copy()
df_analysis_path = "csv_files/per_trade_analysis.csv"
df_analysis.to_csv(df_analysis_path, index=False)

print("✅ Rebuilt and saved:")
print(" -", merged_path)
print(" -", df_analysis_path)
print("Rows:", len(df), "| Analysis rows:", len(df_analysis))


✅ Rebuilt and saved:
 - csv_files/merged_trades_with_sentiment.csv
 - csv_files/per_trade_analysis.csv
Rows: 211224 | Analysis rows: 211224


In [None]:
# STEP 8 — organize ds_<your_name> with required subfolders
import os, shutil, pandas as pd

CANDIDATE = "manasa_nalla"   # <-- change if you want another name
ROOT = f"ds_{CANDIDATE}"
CSV_SRC = "csv_files"
OUT_SRC = "outputs"
CSV_DST = os.path.join(ROOT, "csv_files")
OUT_DST = os.path.join(ROOT, "outputs")

# Ensure sources exist
os.makedirs(CSV_SRC, exist_ok=True)
os.makedirs(OUT_SRC, exist_ok=True)

# Make destination structure
os.makedirs(CSV_DST, exist_ok=True)
os.makedirs(OUT_DST, exist_ok=True)

# Copy CSVs
for f in os.listdir(CSV_SRC):
    src = os.path.join(CSV_SRC, f)
    if os.path.isfile(src) and f.lower().endswith(".csv"):
        shutil.copy2(src, os.path.join(CSV_DST, f))

# Copy images
for f in os.listdir(OUT_SRC):
    src = os.path.join(OUT_SRC, f)
    if os.path.isfile(src) and f.lower().endswith(".png"):
        shutil.copy2(src, os.path.join(OUT_DST, f))

print("✅ Organized standardized folder:")
print(" -", CSV_DST)
print(" -", OUT_DST)


✅ Organized standardized folder:
 - ds_manasa_nalla/csv_files
 - ds_manasa_nalla/outputs


In [None]:
# STEP 9 — build ds_report.pdf inside ds_<name>
import os, textwrap
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime

# Paths
trade_summary_path = "csv_files/trade_summary_by_sentiment.csv"
acct_day_path      = "csv_files/account_day_metrics.csv"
market_day_path    = "csv_files/market_day_metrics.csv"
merged_path        = "csv_files/merged_trades_with_sentiment.csv"
out_dir            = "outputs"

# Load whichever KPI table is available
trade_summary = None
if os.path.exists(trade_summary_path):
    trade_summary = pd.read_csv(trade_summary_path)

# Fallbacks if needed
acct_day = pd.read_csv(acct_day_path) if os.path.exists(acct_day_path) else None
market_day = pd.read_csv(market_day_path) if os.path.exists(market_day_path) else None
merged = pd.read_csv(merged_path) if os.path.exists(merged_path) else None

# Compose summary text
blocks = []
blocks.append("Trader Behavior vs. Bitcoin Fear & Greed — Summary")
blocks.append("")
blocks.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
blocks.append("")
blocks.append("Datasets:")
blocks.append("• Historical Trader Data (Hyperliquid)")
blocks.append("• Bitcoin Fear & Greed Index")
blocks.append("")

if trade_summary is not None and not trade_summary.empty:
    blocks.append("Key KPIs by Sentiment")
    blocks.append(trade_summary.round(4).to_string(index=False))
elif acct_day is not None:
    quick = (acct_day.groupby('classification', dropna=False)
             .agg(trades=('trades','sum'),
                  pnl_sum=('pnl','sum'),
                  winrate=('winrate','mean'),
                  notional_sum=('notional_sum','sum'))
             .reset_index())
    blocks.append("Key KPIs by Sentiment (computed from account-day)")
    blocks.append(quick.round(4).to_string(index=False))
else:
    blocks.append("Key KPIs by Sentiment: (unavailable – run earlier steps)")

# Create PDF
pdf_path = os.path.join(ROOT, "ds_report.pdf")
with PdfPages(pdf_path) as pdf:
    # Page 1: summary text
    fig = plt.figure(figsize=(8.27, 11.69))  # A4
    plt.axis('off')
    wrapped = textwrap.fill("\n".join(blocks), width=95, replace_whitespace=False)
    plt.text(0.02, 0.98, wrapped, va='top', ha='left', wrap=True, fontsize=10)
    pdf.savefig(fig); plt.close(fig)

    # Pages 2+: add each PNG from outputs/
    if os.path.isdir(out_dir):
        for f in sorted(os.listdir(out_dir)):
            if f.lower().endswith(".png"):
                img = plt.imread(os.path.join(out_dir, f))
                fig = plt.figure(figsize=(8.27, 11.69))
                plt.imshow(img); plt.axis('off')
                plt.title(f, fontsize=10)
                pdf.savefig(fig); plt.close(fig)

print("✅ PDF report created at:", pdf_path)


✅ PDF report created at: ds_manasa_nalla/ds_report.pdf


In [None]:
# STEP 10 — README.md
readme = f"""# ds_{CANDIDATE}

**Assignment:** Trader Behavior Insights vs. Bitcoin Fear & Greed Index

## Structure

ds_{CANDIDATE}/
├── notebook_1.ipynb # (optional) if you add a Colab notebook
├── csv_files/ # Processed & intermediate CSVs
├── outputs/ # Visuals (PNGs)
├── ds_report.pdf # Final summarized insights
└── README.md




## How to Reproduce
1. Open the Colab notebook used to generate this folder.
2. Place/Download `historical_data.csv` and `fear_greed_index.csv` into the runtime.
3. Run cells in order (environment → load/merge → KPIs → charts → PDF).
4. Outputs will be saved under `csv_files/`, `outputs/`, and `ds_{CANDIDATE}/ds_report.pdf`.

## Highlights
- KPIs by sentiment (Fear vs Greed): PnL, win rate, notional/volume, long-bias.
- Visualizations saved as static PNGs for quick review.
"""

with open(os.path.join(ROOT, "README.md"), "w") as f:
    f.write(readme)

print("✅ README written to:", os.path.join(ROOT, "README.md"))


✅ README written to: ds_manasa_nalla/README.md


In [None]:
# STEP 11 — Zip the ds_<name> folder
import shutil

zip_base = f"ds_{CANDIDATE}"
zip_path = shutil.make_archive(zip_base, "zip", zip_base)
print("✅ Zipped submission ready:", zip_path)


✅ Zipped submission ready: /content/ds_manasa_nalla.zip
