In [2]:
import sys
from pathlib import Path

project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")


Added to path: d:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 18008\MasterRepo & LabRepo\mlops-labs-portfolio\FlaskGCP_Lab


In [3]:
# Data loader testing:
from src.data_loader import get_pandas_data, get_polars_data, get_info
print(get_info())

df_pd = get_pandas_data()
print(f"Pandas shape: {df_pd.shape}")
df_pl = get_polars_data()
print(f"Polars shape: {df_pl.shape}")

# singleton works (should say "already loaded")
df_pd2 = get_pandas_data()  

# should NOT reload data (check console)
df_pd33 = get_pandas_data()
print(f"Same object? {df_pd is df_pd33}")  # Should print True

{'path': 'D:\\JoelDesktop folds_24\\NEU FALL2025\\MLops IE7374 18008\\MasterRepo & LabRepo\\mlops-labs-portfolio\\FlaskGCP_Lab\\data\\exports\\sec_filings_small_full.parquet', 'size_mb': 16.54, 'exists': True, 'pandas_loaded': False, 'polars_loaded': False}
Loading Pandas DataFrame from D:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 18008\MasterRepo & LabRepo\mlops-labs-portfolio\FlaskGCP_Lab\data\exports\sec_filings_small_full.parquet...
Loaded 200,000 rows into Pandas
Pandas shape: (200000, 19)
Loading Polars DataFrame from D:\JoelDesktop folds_24\NEU FALL2025\MLops IE7374 18008\MasterRepo & LabRepo\mlops-labs-portfolio\FlaskGCP_Lab\data\exports\sec_filings_small_full.parquet...
Loaded 200,000 rows into Polars
Polars shape: (200000, 19)
Same object? True


In [5]:

# All-in-one test cell
import sys
from pathlib import Path
import json
project_root = Path.cwd().parent if 'test' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(project_root))


from src.data_loader import get_pandas_data, get_polars_data
from src.stats_service import get_overall_stats
from src.benchmark_service import run_benchmark

print("Loading data...")
df_pd = get_pandas_data()
df_pl = get_polars_data()
print(f"Loaded: {df_pd.shape}")



# Check the section column data type and values
print("Pandas section dtype:", df_pd['section'].dtype)
print("Pandas unique sections:", df_pd['section'].unique()[:10])

print("\nPolars section dtype:", df_pl['section'].dtype)
print("Polars unique sections:", df_pl['section'].unique()[:10])
print("=" * 60)




# Load data
df_pd = get_pandas_data()
df_pl = get_polars_data()

# Test 1: Stats
print("1. STATS SERVICE")
print("=" * 60)
stats = get_overall_stats(df_pl)
print(f"Total rows: {stats['shape']['total_rows']:,}")
print(f"Unique companies: {stats['unique_values']['unique_companies']}")
print(f"Top company: {list(stats['top_companies'].items())[0]}")
print()

# Test 2: Benchmark
print("2. BENCHMARK SERVICE")
print("=" * 60)

bench = run_benchmark(df_pd, df_pl)
for test in bench['tests']:
    print(f"{test['test']}: {test['speedup']}x faster ({test['winner']})")
print()
print(f"Overall: {bench['summary']['verdict']}")



Loading data...
Loaded: (200000, 19)
Pandas section dtype: int64
Pandas unique sections: [0 1 2 3 4 5 6 7 8 9]

Polars section dtype: Int64
Polars unique sections: shape: (10,)
Series: 'section' [i64]
[
	0
	1
	2
	3
	4
	5
	6
	7
	8
	9
]
1. STATS SERVICE
Total rows: 200,000
Unique companies: 10
Top company: ('ADVANCED MICRO DEVICES INC', 38799)

2. BENCHMARK SERVICE
Filter rows (name contains 'CORP'): 22.38x faster (Polars)
Group by company + count: 3.46x faster (Polars)
Calculate avg sentence length: 4.25x faster (Polars)
Sort by filing date: 1.98x faster (Polars)
Complex query (filter+group+sort+limit): 0.96x faster (Pandas)

Overall: Polars is 3.0x faster overall


In [7]:
# ========================================
# TEST: OUTLIER DETECTION (Full Analysis)


from src.outlier_service import detect_text_outliers

print("OUTLIER DETECTION (Full Analysis)")
print("=" * 60)
print("Analyzing all 200k sentences...")
print("This may take 30-60 seconds...\n")

# Run full detection with 5% contamination
outliers = detect_text_outliers(df_pl, contamination=0.05)

# Print summary
summary = outliers['detection_summary']
print(f"Total sentences: {summary['total_sentences']:,}")
print(f"Outliers detected: {summary['outliers_detected']:,}")
print(f"Outlier percentage: {summary['outlier_percentage']}%")
print(f"Algorithm: {summary['algorithm']}")

# Print interpretation
print("\n" + "=" * 60)
print("OUTLIER CHARACTERISTICS")
print("=" * 60)
print(outliers['outlier_characteristics']['interpretation'])

# Show outlier vs normal stats
outlier_stats = outliers['outlier_characteristics']['outlier_stats']
normal_stats = outliers['outlier_characteristics']['normal_stats']

print(f"\nOutlier sentences:")
print(f"  Char count: {outlier_stats['char_count']['min']} - {outlier_stats['char_count']['max']} (avg: {outlier_stats['char_count']['mean']})")
print(f"  Word count: {outlier_stats['word_count']['min']} - {outlier_stats['word_count']['max']} (avg: {outlier_stats['word_count']['mean']})")

print(f"\nNormal sentences:")
print(f"  Char count avg: {normal_stats['char_count']['mean']}")
print(f"  Word count avg: {normal_stats['word_count']['mean']}")

# Show top 5 most anomalous sentences
print("\n" + "=" * 60)
print("TOP 5 MOST ANOMALOUS SENTENCES")
print("=" * 60)

for i, outlier in enumerate(outliers['top_10_outliers'][:5], 1):
    print(f"\n{i}. Company: {outlier['name']}")
    print(f"   Section: {outlier['section']}")
    print(f"   Length: {outlier['char_count']} chars, {outlier['word_count']} words")
    print(f"   Anomaly Score: {outlier['anomaly_score']:.3f}")
    print(f"   Text: {outlier['sentence'][:150]}...")
    print()

OUTLIER DETECTION (Full Analysis)
Analyzing all 200k sentences...
This may take 30-60 seconds...

Extracting text features...
Training IsolationForest (contamination=0.05)...




Total sentences: 200,000
Outliers detected: 9,980
Outlier percentage: 4.99%
Algorithm: Isolation Forest

OUTLIER CHARACTERISTICS
Outliers have avg 438 chars vs 155 chars for normal sentences

Outlier sentences:
  Char count: 1 - 4850 (avg: 437.99)
  Word count: 1 - 737 (avg: 64.58)

Normal sentences:
  Char count avg: 155.13
  Word count avg: 23.72

TOP 5 MOST ANOMALOUS SENTENCES

1. Company: ACME UNITED CORP
   Section: 10
   Length: 4045 chars, 428 words
   Anomaly Score: 0.255
   Text: Information on the Company's Operations by Business Segments: (All Figures in Thousands) 1997 1996 1995 - --------------------------------------------...


2. Company: ACME UNITED CORP
   Section: 10
   Length: 2374 chars, 237 words
   Anomaly Score: 0.249
   Text: Other disclosures related to the pension plan follow: 2002 2001 ------------------------------- Changes in benefit obligation Benefit obligation at be...


3. Company: ABBOTT LABORATORIES
   Section: 3
   Length: 2533 chars, 264 words
   An