# Exploratory Data Analysis for Sentiment Dataset
This notebook provides a structured and thorough EDA across three platforms: **Twitter**, **Reddit**, and **Bitcointalk**. We explore the raw and engineered features through statistical and visual analysis.

In [1]:
#Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import math
import statsmodels.api as sm
import scipy.stats as stats


In [None]:
# Load dataset
df = pd.read_csv("Data/baseline_data_v1.csv")

# Show a preview
df.head()


Unnamed: 0,date,twitter_fomo,twitter_uncertain,twitter_hopeful,twitter_bearish,twitter_pessimistic_doubtful,twitter_sad,twitter_fearful_concerned,twitter_angry,twitter_mistrustful,...,open,high,low,close_binance,volume,quote_asset_volume,num_trades,taker_buy_base_volume,taker_buy_quote_volume,close_augmento
0,2017-08-31 23:00:00,-0.44576,-0.942589,0.019913,-0.25683,-0.453646,-0.219225,0.197802,-0.395022,-0.183469,...,4699.0,4724.89,4683.36,4724.89,12.001618,56396.880782,162.0,9.227133,43376.763033,4734.26
1,2017-09-01 00:00:00,-0.44576,0.196526,0.204217,-0.526668,-0.453646,-0.219225,1.312522,2.287562,-0.183469,...,4689.89,4745.35,4689.89,4721.05,15.711673,74145.736108,105.0,2.494201,11801.62397,4763.99
2,2017-09-01 01:00:00,-0.44576,0.174289,-0.007331,0.07558,0.341731,-0.219225,0.167287,-0.395022,-0.183469,...,4730.05,4766.99,4701.11,4725.0,28.111344,133018.250682,270.0,9.947925,47251.438702,4771.3
3,2017-09-01 02:00:00,-0.44576,0.196526,1.015152,-1.021372,0.622453,-0.219225,-0.504058,0.94627,-0.183469,...,4740.99,4767.0,4723.0,4735.96,15.140693,71825.621786,147.0,9.184023,43593.354559,4764.0
4,2017-09-01 03:00:00,-0.44576,-0.468345,0.771871,-1.021372,-0.453646,-0.219225,-0.504058,-0.395022,-0.183469,...,4767.0,4767.0,4713.67,4714.73,16.684268,78802.109354,101.0,1.914899,9118.754582,4745.6


In [None]:
# =========================
# UTILITY SECTION
# =========================

import logging
import pandas as pd
import sys
import re

# ---- ANSI Color Codes ----
class ColorCodes:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    BLACK = '\033[30m'

# ---- Strip ANSI for File Logs ----
class StripColorFormatter(logging.Formatter):
    ansi_escape = re.compile(r'(?:\x1B[@-_][0-?]*[ -/]*[@-~])')
    def format(self, record):
        message = super().format(record)
        return self.ansi_escape.sub('', message)

# ---- Logger Initialization ----
def reinitialize_logger(name="DualLogger", log_file="eda_output.log", level=logging.INFO):
    logger = logging.getLogger(name)
    logger.setLevel(level)

    if logger.hasHandlers():
        logger.handlers.clear()

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(logging.Formatter('%(message)s'))

    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setLevel(level)
    file_handler.setFormatter(StripColorFormatter('%(message)s'))

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

# ---- Initialize Logger ----
log = reinitialize_logger()


# =========================
# DATA WRANGLER SECTION
# =========================

class DataWrangler:
    def __init__(self, path):
        self.df = self._load_data(path)
        self.cleaned = None
        self.transformed = None

    def _load_data(self, path):
        df = pd.read_csv(path)
        self.df = df
        self._get_columns()
        df.dropna(axis=0, how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        return df

    def _get_columns(self):
        log.info("🔍 Columns in the dataset:")
        for col in self.df.columns:
            log.info(f"  • {col}")

    def clean_data(self):
        df = self.df.copy()
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df.dropna(axis=0, how='any', inplace=True)
        df.reset_index(drop=True, inplace=True)
        self.cleaned = df
        return df

    def engineer_relative_features(self, platform_keywords):
        df = self.cleaned.copy()
        for platform in platform_keywords:
            cols = [col for col in df.columns if col.startswith(platform + "_") and not col.endswith("total")]
            df[platform + '_total'] = df[cols].sum(axis=1)
            for col in cols:
                new_col = col + "_relative"
                df[new_col] = df[col] / df[platform + '_total']
        self.transformed = df
        return df

    def get_platform_data(self, platform):
        if self.cleaned is None:
            raise ValueError("Data must be cleaned first.")
        return self.cleaned[[col for col in self.cleaned.columns if col.startswith(platform)] + ['date']]

    def summarize_basic_dataset(self, df: pd.DataFrame = None, num_examples: int = 5):
        """
        Display a color-coded summary of the dataset using the shared logger
        """
        if df is None:
            df = self.df
        c = ColorCodes

        log.info(f"\n\n\n{c.BOLD}{c.HEADER}Preview of the data with all columns{c.ENDC}")
        log.info(f"{c.OKBLUE}--------------------------------------------------------{c.ENDC}")
        with pd.option_context('display.max_columns', None):
            log.info(f"\n{df.head()}")

        log.info(f"\n\n\n{c.BOLD}{c.HEADER}Overview of columns{c.ENDC}")
        log.info(f"{c.OKBLUE}--------------------------------------------------------{c.ENDC}")
        log.info(f"{df.columns.tolist()}")
        log.info(f"Shape: {df.shape}")

        log.info(f"\n\n\n{c.BOLD}{c.HEADER}Detailed overview of columns, data types, null values, and unique values{c.ENDC}")
        log.info(f"{c.OKBLUE}--------------------------------------------------------{c.ENDC}")
        log.info(f"{c.UNDERLINE}Column name | Data type | Null values | Num Unique values{c.ENDC}")
        log.info(f"{c.OKBLUE}--------------------------------------------------------{c.ENDC}")

        for col in df.columns:
            nulls = df[col].isnull().sum()
            unique_vals = df[col].unique()
            num_unique = len(unique_vals)
            dtype = df[col].dtype
        
            log.info(f"{c.CYAN}{col:<30}{c.ENDC} {c.GREEN}{str(dtype):<10}{c.ENDC} "
                     f"{c.YELLOW}{nulls:<10}{c.ENDC} {c.MAGENTA}{num_unique}{c.ENDC}")
        
            # Print basic stats if it's numeric
            if pd.api.types.is_numeric_dtype(df[col]):
                min_val = df[col].min()
                max_val = df[col].max()
                mean_val = df[col].mean()
                median_val = df[col].median()
                log.info(f"   ↳ {c.OKBLUE}Min:{c.ENDC} {min_val} | {c.OKBLUE}Max:{c.ENDC} {max_val} | "
                         f"{c.OKBLUE}Mean:{c.ENDC} {mean_val} | {c.OKBLUE}Median:{c.ENDC} {median_val}")
        
            # Show unique values (up to 15), regardless of dtype
            if num_unique <= num_examples or pd.api.types.is_numeric_dtype(df[col]):
                sample_uniques = unique_vals[:num_examples]
                formatted_uniques = ", ".join(map(str, sample_uniques))
                log.info(f"   ↳ {c.OKCYAN}Sample unique values:{c.ENDC} {formatted_uniques}")

    def convert_string_dates(self, df: pd.DataFrame = None, max_error_rate: float = 0.2):
        """
        Automatically detect and convert string-based date columns into datetime columns.
        Adds new columns with '_datetime' suffix. Original columns remain unchanged.
        Skips columns where conversion fails or too many entries become NaT.
        """
        if df is None:
            df = self.df

        c = ColorCodes
        log.info(f"\n\n{c.BOLD}{c.HEADER}🔄 Attempting to convert string-based date columns...{c.ENDC}")

        for col in df.columns:
            if df[col].dtype == 'object':
                try:
                    converted = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
                    num_parsed = converted.notna().sum()
                    total = len(df)
                    error_ratio = 1 - (num_parsed / total)

                    if 0 < error_ratio <= max_error_rate:
                        new_col = col + "_datetime"
                        df[new_col] = converted
                        log.info(f"{c.GREEN}✔ Partially converted:{c.ENDC} '{col}' → '{new_col}' "
                                 f"({num_parsed}/{total} parsed, {round((1-error_ratio)*100, 1)}% success)")
                    elif error_ratio == 0:
                        new_col = col + "_datetime"
                        df[new_col] = converted
                        log.info(f"{c.OKGREEN}✔ Fully converted:{c.ENDC} '{col}' → '{new_col}' (100% success)")
                    else:
                        log.warning(f"{c.WARNING}⚠ Skipped column '{col}' due to high parsing failure "
                                    f"({round(error_ratio * 100, 1)}% unparsed){c.ENDC}")

                except Exception as e:
                    log.error(f"{c.FAIL}✖ Error while parsing column '{col}': {str(e)}{c.ENDC}")

        self.df = df
        return df

    def extract_datetime_features(self, df: pd.DataFrame = None, features: list = None, columns: list = None):
        """
        Extracts specified datetime features from datetime columns in the DataFrame (non-destructive).
        Result is saved in self.transformed, preserving the original data.

        :param df: DataFrame to work on. Defaults to self.df.
        :param features: List of datetime features to extract. Defaults to all.
        :param columns: Optional list of specific datetime columns to extract from.
        """
        if df is None:
            df = self.df

        df = df.copy()  # non-destructive
        c = ColorCodes

        if features is None:
            features = ['year', 'month', 'month_name', 'day', 'weekday', 'day_name', 'hour', 'minute', 'is_weekend']

        log.info(f"\n\n{c.BOLD}{c.HEADER}🧠 Extracting datetime features (non-destructive)...{c.ENDC}")

        # Filter datetime columns
        datetime_cols = df.select_dtypes(include=['datetime64[ns]']).columns.tolist()
        if columns:
            datetime_cols = [col for col in columns if col in datetime_cols]

        if not datetime_cols:
            log.warning(f"{c.WARNING}⚠ No valid datetime columns found to extract from.{c.ENDC}")
            return df

        for col in datetime_cols:
            for feature in features:
                try:
                    new_col = f"{col}_{feature}"
                    if feature == 'year':
                        df[new_col] = df[col].dt.year
                    elif feature == 'month':
                        df[new_col] = df[col].dt.month
                    elif feature == 'month_name':
                        df[new_col] = df[col].dt.month_name()
                    elif feature == 'day':
                        df[new_col] = df[col].dt.day
                    elif feature == 'weekday':
                        df[new_col] = df[col].dt.weekday
                    elif feature == 'day_name':
                        df[new_col] = df[col].dt.day_name()
                    elif feature == 'hour':
                        df[new_col] = df[col].dt.hour
                    elif feature == 'minute':
                        df[new_col] = df[col].dt.minute
                    elif feature == 'is_weekend':
                        df[new_col] = df[col].dt.weekday >= 5
                    else:
                        log.warning(f"{c.WARNING}⚠ Unsupported feature: '{feature}' skipped.{c.ENDC}")
                        continue

                    log.info(f"{c.OKGREEN}✔ Extracted:{c.ENDC} {new_col} from {col}")

                except Exception as e:
                    log.error(f"{c.FAIL}✖ Failed to extract {feature} from {col}: {str(e)}{c.ENDC}")

        self.transformed = df
        return df


    def clean_currency_fields(self, df: pd.DataFrame = None, inplace: bool = False, convert_to_float: bool = True):
        """
        Cleans and standardizes currency-formatted strings in the DataFrame.
        - Handles mixed decimal separators (e.g., 1.000,00€ vs $1,000.00)
        - Removes symbols, commas, etc.
        - Optionally converts to float
        - Adds a column with inferred currency type (e.g., USD, EUR)
        """
        if df is None:
            df = self.df

        if not inplace:
            df = df.copy()

        import numpy as np

        c = ColorCodes
        currency_map = {
            "$": "USD",
            "€": "EUR",
            "£": "GBP",
            "¥": "JPY",
            "₽": "RUB",
            "₹": "INR"
        }

        currency_symbols = list(currency_map.keys())
        log.info(f"\n\n{c.BOLD}{c.HEADER}💰 Cleaning currency fields with intelligent separator handling...{c.ENDC}")

        for col in df.columns:
            if df[col].dtype == "object":
                try:
                    sample = df[col].dropna().astype(str).head(50)

                    if any(sym in val for val in sample for sym in currency_symbols) or sample.str.contains(r"\d{1,3}[\.,]\d{2,3}").any():
                        cleaned_col = df[col].astype(str)

                        # Extract currency type from first matching symbol
                        def detect_currency_symbol(val):
                            for sym in currency_symbols:
                                if sym in val:
                                    return currency_map[sym]
                            return "UNKNOWN"

                        df[col + "_currency_type"] = df[col].apply(detect_currency_symbol)

                        # Remove currency symbols and normalize
                        cleaned_col = cleaned_col.str.replace(r"[^\d,.\-]", "", regex=True)

                        # Detect format style
                        dot_comma = sample.str.contains(r"\.\d{3},\d{2}", regex=True).any()
                        comma_dot = sample.str.contains(r",\d{3}\.\d{2}", regex=True).any()

                        if dot_comma:
                            cleaned_col = cleaned_col.str.replace(".", "", regex=False)
                            cleaned_col = cleaned_col.str.replace(",", ".", regex=False)
                            style = "dot-comma (EU)"
                        elif comma_dot:
                            cleaned_col = cleaned_col.str.replace(",", "", regex=False)
                            style = "comma-dot (US)"
                        else:
                            # Fallback auto-detection
                            def standardize(val):
                                if val.count(",") > 0 and val.rfind(",") > val.rfind("."):
                                    val = val.replace(".", "").replace(",", ".")
                                else:
                                    val = val.replace(",", "")
                                return val
                            cleaned_col = cleaned_col.apply(standardize)
                            style = "auto-detected"

                        # Convert to float
                        if convert_to_float:
                            df[col + "_cleaned"] = pd.to_numeric(cleaned_col, errors='coerce')
                            converted = df[col + "_cleaned"].notna().sum()
                            total = len(df)
                            log.info(f"{c.OKGREEN}✔ Cleaned '{col}' ({converted}/{total} parsed, {style}){c.ENDC}")
                            log.info(f"   ↳ {c.CYAN}Currency type column:{c.ENDC} '{col}_currency_type'")
                        else:
                            df[col + "_cleaned"] = cleaned_col
                            log.info(f"{c.OKCYAN}➤ Cleaned '{col}' as standardized strings ({style}){c.ENDC}")

                except Exception as e:
                    log.error(f"{c.FAIL}✖ Error cleaning column '{col}': {str(e)}{c.ENDC}")

        return df


In [31]:
# Step 1: Create an instance of DataWrangler
dw = DataWrangler("/Users/lucpellinger/Documents/Projects/BTC-Price-Prediction/BTC_Sentiment_Reddit/baseline_data_v1.csv")  # Replace with your CSV path

dw.convert_string_dates()
dw.extract_datetime_features()

# Check the new DataFrame
dw.summarize_basic_dataset(dw.transformed)

# If needed, you can always go back to:
# dw.df (raw loaded)
# dw.cleaned (cleaned version)

🔍 Columns in the dataset:
  • date
  • twitter_fomo
  • twitter_uncertain
  • twitter_hopeful
  • twitter_bearish
  • twitter_pessimistic_doubtful
  • twitter_sad
  • twitter_fearful_concerned
  • twitter_angry
  • twitter_mistrustful
  • twitter_panicking
  • twitter_annoyed_frustrated
  • twitter_bullish
  • twitter_optimistic
  • twitter_happy
  • twitter_euphoric_excited
  • reddit_fomo
  • reddit_uncertain
  • reddit_hopeful
  • reddit_bearish
  • reddit_pessimistic_doubtful
  • reddit_sad
  • reddit_fearful_concerned
  • reddit_angry
  • reddit_mistrustful
  • reddit_panicking
  • reddit_annoyed_frustrated
  • reddit_bullish
  • reddit_optimistic
  • reddit_happy
  • reddit_euphoric_excited
  • bitcointalk_fomo
  • bitcointalk_uncertain
  • bitcointalk_hopeful
  • bitcointalk_bearish
  • bitcointalk_pessimistic_doubtful
  • bitcointalk_sad
  • bitcointalk_fearful_concerned
  • bitcointalk_angry
  • bitcointalk_mistrustful
  • bitcointalk_panicking
  • bitcointalk_annoyed_frustrat

  converted = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)




[1m[95m🧠 Extracting datetime features (non-destructive)...[0m
[92m✔ Extracted:[0m date_datetime_year from date_datetime
[92m✔ Extracted:[0m date_datetime_month from date_datetime
[92m✔ Extracted:[0m date_datetime_month_name from date_datetime
[92m✔ Extracted:[0m date_datetime_day from date_datetime
[92m✔ Extracted:[0m date_datetime_weekday from date_datetime
[92m✔ Extracted:[0m date_datetime_day_name from date_datetime
[92m✔ Extracted:[0m date_datetime_hour from date_datetime
[92m✔ Extracted:[0m date_datetime_minute from date_datetime
[92m✔ Extracted:[0m date_datetime_is_weekend from date_datetime



[1m[95mPreview of the data with all columns[0m
[94m--------------------------------------------------------[0m

                  date  twitter_fomo  twitter_uncertain  twitter_hopeful  \
0  2017-08-31 23:00:00      -0.44576          -0.942589         0.019913   
1  2017-09-01 00:00:00      -0.44576           0.196526         0.204217   
2  2017-09-01 01:00:00  

In [35]:
import pandas as pd

# Define a dictionary with mixed-format currency data
test_data = {
    "price_usd": [
        "$1,000.00",  # standard US format
        "$100.5",     # US format without thousand separator
        "2000",       # plain number
        "$1.50",      # decimal only
        "$1,234.56"   # standard US format
    ],
    "price_eur": [
        "1.000,00 €",  # standard EU format
        "1.234,56€",  # EU with thousands
        "1000€",      # no decimal
        "12,50€",     # decimal only
        "500,00€"     # simple EU format
    ],
    "price_mixed": [
        "€1.000,00",   # euro with dot comma
        "$ 1,000.00",   # dollar with comma dot
        "£2,500.99",   # GBP format
        "1000.00",     # plain decimal
        "1.000,00€"    # EU style again
    ],
    "non_currency": [
        "apple",       # clearly non-numeric
        "banana",
        "cherry",
        "42",          # numeric-looking string
        "1234"         # numeric-looking string
    ]
}

# Create the DataFrame
test_df = pd.DataFrame(test_data)

# Display it
print(test_df)

cleaned_df = dw.clean_currency_fields(test_df)


   price_usd   price_eur price_mixed non_currency
0  $1,000.00  1.000,00 €   €1.000,00        apple
1     $100.5   1.234,56€  $ 1,000.00       banana
2       2000       1000€   £2,500.99       cherry
3      $1.50      12,50€     1000.00           42
4  $1,234.56     500,00€   1.000,00€         1234


[1m[95m💰 Cleaning currency fields with intelligent separator handling...[0m
[92m✔ Cleaned 'price_usd' (5/5 parsed, comma-dot (US))[0m
   ↳ [36mCurrency type column:[0m 'price_usd_currency_type'
[92m✔ Cleaned 'price_eur' (5/5 parsed, dot-comma (EU))[0m
   ↳ [36mCurrency type column:[0m 'price_eur_currency_type'
[92m✔ Cleaned 'price_mixed' (5/5 parsed, dot-comma (EU))[0m
   ↳ [36mCurrency type column:[0m 'price_mixed_currency_type'


In [36]:
cleaned_df

Unnamed: 0,price_usd,price_eur,price_mixed,non_currency,price_usd_currency_type,price_usd_cleaned,price_eur_currency_type,price_eur_cleaned,price_mixed_currency_type,price_mixed_cleaned
0,"$1,000.00","1.000,00 €","€1.000,00",apple,USD,1000.0,EUR,1000.0,EUR,1000.0
1,$100.5,"1.234,56€","$ 1,000.00",banana,USD,100.5,EUR,1234.56,USD,1.0
2,2000,1000€,"£2,500.99",cherry,UNKNOWN,2000.0,EUR,1000.0,GBP,2.50099
3,$1.50,"12,50€",1000.00,42,USD,1.5,EUR,12.5,UNKNOWN,100000.0
4,"$1,234.56","500,00€","1.000,00€",1234,USD,1234.56,EUR,500.0,EUR,1000.0


## Descriptive Statistics & Visual Exploration

In [None]:
# =========================
# UTILITY SECTION
# =========================

import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import re
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from factor_analyzer import FactorAnalyzer, calculate_kmo

# ---- ANSI Color Codes for terminal formatting ----
class ColorCodes:
    """Class for terminal color codes used in logging."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    BLACK = '\033[30m'

# ---- Custom formatter to strip ANSI color codes for file logging ----
class StripColorFormatter(logging.Formatter):
    """Custom log formatter to remove ANSI codes from log file output."""
    ansi_escape = re.compile(r'(?:\x1B[@-_][0-?]*[ -/]*[@-~])')

    def format(self, record):
        message = super().format(record)
        return self.ansi_escape.sub('', message)

# ---- Logger Initialization ----
def reinitialize_logger(name="DualLogger", log_file="eda_output.log", level=logging.INFO):
    """
    Initializes and configures a logger that outputs to both the terminal (with colors)
    and a log file (without ANSI codes).
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    if logger.hasHandlers():
        logger.handlers.clear()

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(level)
    console_handler.setFormatter(logging.Formatter('%(message)s'))

    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setLevel(level)
    file_handler.setFormatter(StripColorFormatter('%(message)s'))

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

# ---- Initialize Logger ----
log = reinitialize_logger()


# =========================
# DATA VISUALIZER SECTION
# =========================

class DataVisualizer:
    """
    A class to perform common exploratory data analysis (EDA) visualizations on a DataFrame.
    Supports various types of plots such as CDF, KDE, boxplots, and correlation heatmaps.
    """

    def __init__(self, df):
        """Initialize the visualizer with a DataFrame."""
        self.df = df

    def plot_cdf(self, columns: pd.DataFrame):
        """Plot cumulative distribution function (CDF) for selected columns."""
        n_cols = columns.shape[1] if len(columns.shape) > 1 else 1
        plt.figure(figsize=(20, 5 * n_cols))
        for i in range(n_cols):
            plt.subplot(n_cols, 1, i + 1)
            sorted_vals = np.sort(columns.iloc[:, i])
            cdf = np.arange(1, len(sorted_vals) + 1) / len(sorted_vals)
            plt.plot(sorted_vals, cdf, marker='.', linestyle='none')
            plt.title(f'CDF Plot - {columns.columns[i]}')
            plt.ylabel('CDF')
            plt.grid(True)
        plt.tight_layout()
        plt.show()

    def plot_kernel_density(self, columns: pd.DataFrame):
        """Plot kernel density estimation (KDE) for selected columns."""
        n_cols = columns.shape[1] if len(columns.shape) > 1 else 1
        plt.figure(figsize=(20, 5 * n_cols))
        for i in range(n_cols):
            plt.subplot(n_cols, 1, i + 1)
            sns.kdeplot(columns.iloc[:, i], fill=True, color='salmon')
            plt.title(f'Kernel Density - {columns.columns[i]}')
            plt.ylabel('Density')
            plt.grid(True)
        plt.tight_layout()
        plt.show()

    def plot_boxplot(self, columns: pd.DataFrame):
        """Plot boxplots for selected columns."""
        n_cols = columns.shape[1] if len(columns.shape) > 1 else 1
        plt.figure(figsize=(10, 5 * n_cols))
        for i in range(n_cols):
            plt.subplot(n_cols, 1, i + 1)
            sns.boxplot(y=columns.iloc[:, i], color='skyblue')
            plt.title(f'Boxplot - {columns.columns[i]}')
            plt.xlabel(columns.columns[i])
            plt.grid(True)
        plt.tight_layout()
        plt.show()

    def plot_correlation_heatmaps(self, method: str = 'pearson'):
        """Plot a heatmap of the correlation matrix using the specified method ('pearson' or 'spearman')."""
        if method not in ['pearson', 'spearman']:
            raise ValueError("Method must be 'pearson' or 'spearman'.")
        corr = self.df.corr(method=method)
        plt.figure(figsize=(15, 15))
        sns.heatmap(corr, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
        plt.title(f'Correlation Heatmap ({method.title()} Coefficients)')
        plt.tight_layout()
        plt.show()

    def log_transform_and_standardize(self):
        """Apply log(1+x) transformation and standardization to numeric data."""
        numeric_data = self.df.select_dtypes(include=[np.number])
        log_transformed = np.log1p(numeric_data)
        scaler = StandardScaler()
        standardized = scaler.fit_transform(log_transformed)
        self.transformed_df = pd.DataFrame(standardized, columns=numeric_data.columns)
        log.info("✅ Log transformation and standardization completed.")
        return self.transformed_df

    def plot_elbow_method(self, max_k: int = 10):
        """Plot elbow curve to determine optimal number of clusters for KMeans."""
        data = self.transformed_df if hasattr(self, 'transformed_df') else self.df.select_dtypes(include=[np.number])
        inertia = []
        for k in range(1, max_k + 1):
            kmeans = KMeans(n_clusters=k, random_state=42)
            kmeans.fit(data)
            inertia.append(kmeans.inertia_)
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, max_k + 1), inertia, 'bx-')
        plt.xlabel('k')
        plt.ylabel('Inertia')
        plt.title('Elbow Method For Optimal k')
        plt.grid(True)
        plt.savefig("elbow_plot.png")
        plt.show()

    def run_kmeans_clustering(self, k: int = 4):
        """Run KMeans clustering and return cluster means."""
        data = self.transformed_df if hasattr(self, 'transformed_df') else self.df.select_dtypes(include=[np.number])
        kmeans = KMeans(n_clusters=k, random_state=42)
        data['cluster_kmeans'] = kmeans.fit_predict(data)
        grouped = data.groupby('cluster_kmeans').mean()
        grouped.to_csv("kmeans_cluster_means.csv")
        log.info(f"✅ KMeans clustering done with {k} clusters. Results saved to CSV.")
        return grouped

    def run_efa(self, n_factors: int = None, rotation: str = None, variance_threshold: float = 0.6):
        """Run exploratory factor analysis (EFA) with optional rotation and threshold-based factor selection."""
        data = self.transformed_df if hasattr(self, 'transformed_df') else self.df.select_dtypes(include=[np.number])
        kmo_all, kmo_model = calculate_kmo(data)
        log.info(f"🔎 KMO Model Score: {kmo_model:.3f}")

        # Estimate number of factors if not provided
        fa_full = FactorAnalyzer(n_factors=data.shape[1], rotation=None)
        fa_full.fit(data)
        eigenvalues, _ = fa_full.get_eigenvalues()
        var_exp = fa_full.get_factor_variance()
        cum_var = var_exp[2]

        if n_factors is None:
            n_factors = np.argmax(cum_var >= variance_threshold) + 1
            log.info(f"✅ Selected {n_factors} factors based on {variance_threshold*100:.1f}% variance threshold.")

        # Final factor analysis with selected number of factors
        fa = FactorAnalyzer(n_factors=n_factors, rotation=rotation)
        fa.fit(data)
        loadings = fa.loadings_
        factors = pd.DataFrame(loadings, index=data.columns, columns=[f"Factor{i+1}" for i in range(n_factors)])
        factors.to_csv("efa_factor_loadings.csv")

        plt.figure(figsize=(20, 20))
        sns.heatmap(factors, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1, cbar=True)
        plt.title("Factor Loadings Heatmap")
        plt.savefig("efa_heatmap.png")
        plt.show()

        # Scree plot
        plt.figure(figsize=(10, 6))
        plt.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o')
        plt.title('Scree Plot of Eigenvalues')
        plt.xlabel('Factor Number')
        plt.ylabel('Eigenvalue')
        plt.axhline(y=1, color='r', linestyle='--')
        plt.grid(True)
        plt.savefig("efa_scree_plot.png")
        plt.show()

        return factors

    def parse_command(self, command):
        """Parse a command string with optional arguments like 'kmeans:k=3'."""
        if ':' in command:
            base, args_str = command.split(':', 1)
            args = dict(arg.split('=') for arg in args_str.split(','))
            for k, v in args.items():
                try:
                    args[k] = float(v) if '.' in v else int(v)
                except ValueError:
                    pass
            return base, args
        return command, {}

    def visualize(self, commands: list[str]):
        """Run visualizations based on list of command strings with optional arguments."""
        for command in commands:
            cmd, args = self.parse_command(command)
            if cmd == 'cdf':
                self.plot_cdf(self.df.select_dtypes(include=[np.number]))
            elif cmd == 'kde':
                self.plot_kernel_density(self.df.select_dtypes(include=[np.number]))
            elif cmd == 'boxplot':
                self.plot_boxplot(self.df.select_dtypes(include=[np.number]))
            elif cmd == 'pearson_corr':
                self.plot_correlation_heatmaps(method='pearson')
            elif cmd == 'spearman_corr':
                self.plot_correlation_heatmaps(method='spearman')
            elif cmd == 'log_standardize':
                self.log_transform_and_standardize()
            elif cmd == 'elbow':
                self.plot_elbow_method(**args)
            elif cmd == 'kmeans':
                self.run_kmeans_clustering(**args)
            elif cmd == 'efa':
                self.run_efa(**args)
            else:
                log.warning(f"Unknown visualization command: {command}")

In [None]:
viz = DataVisualizer(df)
viz.visualize(['cdf', 'boxplot', 'pearson_corr'])

viz.visualize([
    "log_standardize",
    "elbow:max_k=8",
    "kmeans:k=4",
    "efa:n_factors=5,rotation=varimax"
])



In [None]:
def perform_factor_analysis(df, columns, n_factors, rotation=None):
    fa = FactorAnalyzer(n_factors=n_factors, rotation=rotation)
    fa.fit(df[columns])
    loadings = pd.DataFrame(fa.loadings_, index=columns)
    return fa, loadings

def plot_scree(df, columns):
    fa = FactorAnalyzer()
    fa.fit(df[columns])
    ev, v = fa.get_eigenvalues()
    plt.plot(range(1, len(ev)+1), ev, marker='o')
    plt.title('Scree Plot')
    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid(True)
    plt.show()


In [None]:
# Scree plot for Twitter features
plot_scree(df_cleaned, twitter_cols)


In [None]:
# Perform FA without rotation
fa, loadings = perform_factor_analysis(df_cleaned, twitter_cols, n_factors=3)
sns.heatmap(loadings, annot=True, cmap="vlag")
plt.title("Factor Loadings (No Rotation) - Twitter")
plt.show()


In [None]:
df_engineered = wrangler.engineer_relative_features(['twitter', 'reddit', 'bitcointalk'])
df_engineered.head()


In [None]:
twitter_relative = [col for col in df_engineered.columns if 'twitter_' in col and '_relative' in col]
plot_scree(df_engineered, twitter_relative)


In [None]:
fa_rot, loadings_rot = perform_factor_analysis(df_engineered, twitter_relative, n_factors=3, rotation="varimax")
sns.heatmap(loadings_rot, annot=True, cmap="coolwarm")
plt.title("Factor Loadings (Varimax Rotation) - Twitter Relative")
plt.show()


This concludes the EDA for Twitter. Repeat similar steps for Reddit and Bitcointalk by modifying the prefix and reusing the modular `DataWrangler`.