In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

class PandasDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.features = dataframe.drop(columns=['y']).values  # 입력 데이터
        self.labels = dataframe['y'].values  # 레이블

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # 특정 샘플의 feature와 label 반환
        feature = torch.tensor(self.features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return feature, label

# 데이터 불러오기
train_data = pd.read_csv('train_B2C.csv')
test_data = pd.read_csv('test_B2C.csv')

# 필요 없는 열 제거
train_data = train_data.drop(columns=['train/test', 'cust_id', 'base_dt'])
test_data = test_data.drop(columns=['train/test', 'cust_id', 'base_dt'])

# 결측치 처리
rcnt_columns = ['rcnt_sub_0', 'rcnt_sub_1', 'rcnt_sub_2', 'rcnt_sub_3', 'rcnt_sub_4',
                'rcnt_buy_0', 'rcnt_buy_1', 'rcnt_buy_2', 'rcnt_buy_3', 'rcnt_buy_4']
for col in rcnt_columns:
    train_data[col].fillna('no item', inplace=True)
    test_data[col].fillna('no item', inplace=True)

for column in train_data.columns:
    if train_data[column].dtype == 'object':  # 문자열 데이터
        train_data[column] = train_data[column].fillna('Unknown')
    elif train_data[column].dtype in ['int64', 'float64']:  # 숫자 데이터
        train_data[column] = train_data[column].fillna(train_data[column].mean())

for column in test_data.columns:
    if test_data[column].dtype == 'object':  # 문자열 데이터
        test_data[column] = test_data[column].fillna('Unknown')
    elif test_data[column].dtype in ['int64', 'float64']:  # 숫자 데이터
        test_data[column] = test_data[column].fillna(test_data[column].mean())

# 카테고리형 및 수치형 피처 구분
categorical_features = [
    'gendr_nm', 'gen_div_nm', 'ctdo_nm', 'empmbr_entr_yn', 'lge_mbr_entr_yn', 'mbsp_app_estb_yn', 'mbsp_entr_yn', 'pref_stlm_kd',
    'thinq_entr_yn', 'new_prdc_pref_yn', 'orco_prdc_mny_buy_yn', 'orco_prdc_mny_hld_yn', 'hprop_clcn_voc_yn', 'lgebst_buy_hist_xstn_yn',
    'buy_hist_xstn_yn', 'most_infw_domn_nm', 'age_group',
    'rcnt_sub_0', 'rcnt_sub_1', 'rcnt_sub_2', 'rcnt_sub_3', 'rcnt_sub_4', 
    'rcnt_buy_0', 'rcnt_buy_1', 'rcnt_buy_2', 'rcnt_buy_3', 'rcnt_buy_4'
]
numeric_features = [
    'orco_prdc_buy_cnt', 'orco_prdc_hld_cnt', 'totl_stay_tm_sum', 
    'totl_pvw_cont_sum', 'inpc_page_knd_cont_sum'
]

# # 카테고리형 변수 인코딩
# for feat in categorical_features:
#     lbe = LabelEncoder()
#     train_data[feat] = lbe.fit_transform(train_data[feat])
#     test_data[feat] = lbe.fit_transform(test_data[feat])

# # 수치형 변수 스케일링
# scaler = StandardScaler()
# train_data[numeric_features] = scaler.fit_transform(train_data[numeric_features])
# test_data[numeric_features] = scaler.fit_transform(test_data[numeric_features])

# train_data, val_data = train_test_split(train_data, test_size=0.2)

# # Dataset 객체 생성
# train_dataset = PandasDataset(train_data)
# val_dataset = PandasDataset(val_data)
# test_dataset = PandasDataset(test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[col].fillna('no item', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna('no item', inplace=True)


In [None]:
columns_of_interest = [
    'rcnt_sub_0', 'rcnt_sub_1', 'rcnt_sub_2', 'rcnt_sub_3', 'rcnt_sub_4',
    'rcnt_buy_0', 'rcnt_buy_1', 'rcnt_buy_2', 'rcnt_buy_3', 'rcnt_buy_4'
]

mask = train_data[columns_of_interest].apply(lambda row: row.isin(['PC', 'TV']).any(), axis=1)

data_with_pc_tv = train_data[mask]
data_without_pc_tv = train_data[~mask]

print(len(data_with_pc_tv))
print(len(data_without_pc_tv))

31 모든 아이템: {'두피관리기', '와인냉장고', '김치냉장고', '제습기', 'PC', 'TV', '스타일러', '프라엘', '안마의자', '세탁기', '홈브루', '냉동고', '공청기', '식물재배기', '슈케어', 'no item', '워시타워', '정수기냉장고', '전기레인지', '식세기', '공기청정기', '냉장고', '청소기', '건조기', '휴대폰', '정수기', 'RAC에어컨', '식기세척기', '에어컨', '노트북', '로봇청소기'}
274618
1722587
