# 고급 데이터 필터링 및 분석
## 이메일 마케팅 타겟 선별 시스템

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import re
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load the source data.
# NOTE: data_path is a hard-coded absolute path on the local filesystem;
# change it to wherever the CSV lives in your environment.
data_path = '/Users/milo/Desktop/ocean/영중소구간필터링/202508최종취합/서울성남_통신판매사업자_완전통합.csv'
df = pd.read_csv(data_path)

# Report how many rows were loaded (thousands-separated).
print(f"데이터 로드 완료: {len(df):,}개 레코드")

In [None]:
# 필터링 함수들 정의
class DataFilter:
    """Chainable row filter over a DataFrame of business records.

    The instance keeps two frames:

    * ``df`` -- an untouched copy of the frame passed to the constructor.
    * ``filtered_df`` -- the working frame, narrowed by each ``filter_*`` /
      ``exclude_*`` call.

    Every filtering method returns ``self`` so calls can be chained, e.g.
    ``DataFilter(df).filter_by_region([...]).exclude_invalid_emails()``.
    Filters accumulate until ``reset_filters()`` is called.

    Columns read by the methods: '지역', '법인구분', '전자우편',
    '인터넷도메인', '신고일자'. A method raises ``KeyError`` if the column it
    needs is missing from the frame.
    """

    def __init__(self, dataframe):
        """Store independent copies of *dataframe* as baseline and working set."""
        # Copy twice so neither the caller's frame nor the baseline is ever
        # aliased by the working frame.
        self.df = dataframe.copy()
        self.filtered_df = dataframe.copy()

    @staticmethod
    def _literal_pattern(terms, prefix=''):
        """Return a regex alternation matching any of *terms* literally.

        Each term is passed through ``re.escape`` so regex metacharacters
        (for example the '.' in a domain name) match only themselves.
        *prefix* is prepended, unescaped, to every alternative.
        """
        return '|'.join(prefix + re.escape(str(term)) for term in terms)

    def filter_by_region(self, regions=None):
        """Keep rows whose '지역' value is one of *regions*.

        Args:
            regions: List-like of region values. ``None`` or an empty
                collection leaves the working frame unchanged.

        Returns:
            self, for chaining.
        """
        if regions:
            self.filtered_df = self.filtered_df[self.filtered_df['지역'].isin(regions)]
        return self

    def filter_by_business_type(self, business_types=None):
        """Keep rows whose '법인구분' value (individual / corporation) is listed.

        Args:
            business_types: List-like of '법인구분' values. ``None`` or an
                empty collection leaves the working frame unchanged.

        Returns:
            self, for chaining.
        """
        if business_types:
            self.filtered_df = self.filtered_df[self.filtered_df['법인구분'].isin(business_types)]
        return self

    def filter_by_email_domain(self, domains=None, exclude_domains=None):
        """Filter rows by the domain part of the '전자우편' column.

        A row matches a domain when its e-mail contains ``'@' + domain`` as a
        literal, case-sensitive substring. Domains are escaped before being
        used as a pattern, so ``'naver.com'`` no longer matches text such as
        ``'@naverXcom'``. Rows with a missing e-mail never match.

        Args:
            domains: Domains to keep. Applied first when given.
            exclude_domains: Domains to drop. Applied after *domains*.

        Returns:
            self, for chaining.
        """
        if domains:
            pattern = self._literal_pattern(domains, prefix='@')
            self.filtered_df = self.filtered_df[
                self.filtered_df['전자우편'].str.contains(pattern, na=False)
            ]

        if exclude_domains:
            pattern = self._literal_pattern(exclude_domains, prefix='@')
            # Missing e-mails count as "no match" and are therefore kept here.
            self.filtered_df = self.filtered_df[
                ~self.filtered_df['전자우편'].str.contains(pattern, na=False)
            ]
        return self

    def filter_by_website_platform(self, platforms=None):
        """Keep rows whose '인터넷도메인' contains any of *platforms*.

        Matching is a case-insensitive literal substring test; platform names
        are escaped so characters like '.' or '+' match only themselves.
        Rows with a missing domain are dropped.

        Args:
            platforms: Platform name fragments (e.g. Naver, Coupang).
                ``None`` or an empty collection leaves the frame unchanged.

        Returns:
            self, for chaining.
        """
        if platforms:
            pattern = self._literal_pattern(platforms)
            self.filtered_df = self.filtered_df[
                self.filtered_df['인터넷도메인'].str.contains(pattern, na=False, case=False)
            ]
        return self

    def filter_by_registration_date(self, start_date=None, end_date=None):
        """Keep rows whose '신고일자' lies within the inclusive date range.

        Bounds are compared directly against the column with ``>=`` / ``<=``,
        so they must be comparable with the column's stored values.
        NOTE(review): the column dtype (string, integer or datetime) is not
        visible here -- confirm callers pass bounds in the matching format.

        Args:
            start_date: Inclusive lower bound; skipped when falsy.
            end_date: Inclusive upper bound; skipped when falsy.

        Returns:
            self, for chaining.
        """
        if start_date:
            self.filtered_df = self.filtered_df[self.filtered_df['신고일자'] >= start_date]
        if end_date:
            self.filtered_df = self.filtered_df[self.filtered_df['신고일자'] <= end_date]
        return self

    def exclude_invalid_emails(self):
        """Drop rows whose '전자우편' is missing, empty, or masked.

        An e-mail counts as masked when it contains a '*' character.

        Returns:
            self, for chaining.
        """
        emails = self.filtered_df['전자우편']
        # regex=False makes '*' a plain character; the previous '\*' literal
        # relied on an invalid string escape sequence.
        is_masked = emails.str.contains('*', regex=False, na=False)
        self.filtered_df = self.filtered_df[
            emails.notna() & (emails != '') & ~is_masked
        ]
        return self

    def get_results(self):
        """Return the current working frame (the object itself, not a copy)."""
        return self.filtered_df

    def reset_filters(self):
        """Discard all applied filters by restoring a fresh copy of the baseline.

        Returns:
            self, for chaining.
        """
        self.filtered_df = self.df.copy()
        return self

    def get_summary(self):
        """Print and return row counts before and after filtering.

        Returns:
            dict with keys 'original_count', 'filtered_count' and
            'filter_ratio' (remaining rows as a percentage of the baseline;
            0.0 when the baseline frame is empty).
        """
        original_count = len(self.df)
        filtered_count = len(self.filtered_df)
        # Guard against an empty baseline, which would otherwise raise
        # ZeroDivisionError.
        filter_ratio = filtered_count / original_count * 100 if original_count else 0.0

        print(f"원본 데이터: {original_count:,}개")
        print(f"필터링 후: {filtered_count:,}개")
        print(f"필터링 비율: {filter_ratio:.1f}%")

        return {
            'original_count': original_count,
            'filtered_count': filtered_count,
            'filter_ratio': filter_ratio
        }

# Create the filter object over the loaded DataFrame; DataFilter copies the
# frame, so filtering through filter_obj does not modify df itself.
filter_obj = DataFilter(df)
print("필터링 시스템 준비 완료!")