In [1]:
# Assigning features and label variables


In [147]:
# Import LabelEncoder
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Label encoding
def encode_labels(*args, to_numpy=False):
    """Label-encode each positional argument independently.

    One ``preprocessing.LabelEncoder`` instance is re-fitted on every argument
    in turn, so the integer codes of one argument are unrelated to those of
    another.

    Args:
        *args: Label sequences; each one is passed to ``fit_transform``.
        to_numpy: When true, every encoded result is additionally passed
            through ``np.array`` before being returned.

    Returns:
        list: One encoded result per argument, in argument order.
    """
    encoder = preprocessing.LabelEncoder()
    results = []
    for labels in args:
        codes = encoder.fit_transform(labels)
        results.append(np.array(codes) if to_numpy else codes)
    return results
# Missing-value handling
class MissingValueHandler:
    """Helpers for inspecting and imputing missing values in a DataFrame.

    The handler stores a reference to the frame it is given (no copy is made),
    so ``summarize`` reflects changes later made to that same frame object.
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def fill_missing(self, column_data, method):
        """Return ``column_data`` with its missing values filled.

        Args:
            column_data: The pandas Series to impute. It is not modified.
            method: One of ``'mean'``, ``'median'``, ``'mode'`` or ``'zero'``.

        Returns:
            A new Series with missing values replaced. With ``'mode'`` on a
            column that has no non-missing values there is nothing to fill
            with, so an unchanged copy is returned.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method == 'mean':
            return column_data.fillna(column_data.mean())
        elif method == 'median':
            return column_data.fillna(column_data.median())
        elif method == 'mode':
            modes = column_data.mode()
            if modes.empty:
                # mode() yields no rows when every value is missing;
                # indexing it would raise, so hand the data back untouched.
                return column_data.copy()
            return column_data.fillna(modes.iloc[0])
        elif method == 'zero':
            return column_data.fillna(0)
        else:
            raise ValueError("지원하지 않는 방법입니다. 'mean', 'median', 'mode', 또는 'zero' 중에서 선택하세요.")

    def drop(self):
        """Return a new frame without the rows that contain a missing value."""
        return self.df.dropna()

    def drop_loc(self):
        """Return a new frame without the columns that contain a missing value."""
        return self.df.dropna(axis=1)

    def get_dataframe(self):
        """Return the wrapped DataFrame itself."""
        return self.df

    def summarize(self, preprocessing=False):
        """Print a short missing-value report for the wrapped frame.

        Args:
            preprocessing: ``False`` prints the "before" banner followed by
                the column dtypes; ``True`` prints the "after" banner only.
                In both cases the per-column missing-value counts follow.
        """
        if preprocessing:
            print("--- After Fill Missing ---")
        else:
            print("--- Before Fill Missing ---")
            # What follows is the dtype listing, so label it as such.
            print("Data type of each column:")
            print(self.df.dtypes)

        # Printed exactly once for both modes.
        print("\nNumber of missing values in each column:")
        print(self.df.isnull().sum())
        print("\n" + "-" * 30)

# Categorical encoder
class CategoricalEncoder:
    """Replace the object-dtype columns of a DataFrame with integer label codes.

    The frame passed in is stored by reference and modified in place;
    ``encode_labels`` returns that same object.
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def encode_labels(self):
        """Label-encode every object-dtype column of the wrapped frame.

        Each selected column is encoded by the module-level ``encode_labels``
        function and the resulting codes overwrite the column.

        Returns:
            The wrapped DataFrame (the same object, now holding the codes).
        """
        categorical_cols = self.df.select_dtypes(include=['object']).columns
        if categorical_cols.empty:
            # Nothing to encode; leave the frame untouched.
            return self.df

        # Inside the method body this name resolves to the module-level
        # function, not to this method.
        encoded_arrays = encode_labels(
            *[self.df[col] for col in categorical_cols], to_numpy=True
        )

        # Assign each code array positionally, column by column. Going through
        # an intermediate DataFrame with a fresh 0..n-1 index would make pandas
        # align on index labels and corrupt frames with a non-default index;
        # whole-column assignment also lets the column take the integer dtype
        # of the codes.
        for col, codes in zip(categorical_cols, encoded_arrays):
            self.df[col] = codes

        return self.df


In [148]:
# Load the Titanic dataset through seaborn
df = sns.load_dataset('titanic')
# Take an explicit copy: the assignments below write into df_filter, and
# writing into a column slice of `df` triggers pandas' SettingWithCopyWarning.
df_filter = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].copy()

# Wrap the filtered frame; the handler keeps a reference to df_filter,
# so the report after imputation sees the filled values.
MVS = MissingValueHandler(df_filter)

# Report before imputation
MVS.summarize(preprocessing=False)

# Impute: mean for the numeric `age`, most frequent value for `embarked`
df_filter.loc[:, 'age'] = MVS.fill_missing(df_filter['age'], method='mean')
df_filter.loc[:, 'embarked'] = MVS.fill_missing(df_filter['embarked'], method='mode')

# Report after imputation
MVS.summarize(preprocessing=True)

--- Before Fill Missing ---
Sum of each numeric column:
survived      int64
pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object

Number of missing values in each column:
survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

------------------------------
--- After Fill Missing ---

Number of missing values in each column:

Number of missing values in each column:
survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

------------------------------


In [150]:
# Label-encode the object-dtype columns of df_filter.
# NOTE: encode_labels() writes into, and returns, the very frame the encoder
# was constructed with, so df_encoded and df_filter are the same object.
encoder = CategoricalEncoder(df_filter)
df_encoded = encoder.encode_labels()
# Bare last expression so the notebook renders the frame as the cell output
df_encoded 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.000000,1,0,7.2500,2
1,1,1,0,38.000000,1,0,71.2833,0
2,1,3,0,26.000000,0,0,7.9250,2
3,1,1,0,35.000000,1,0,53.1000,2
4,0,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,2
887,1,1,0,19.000000,0,0,30.0000,2
888,0,3,0,29.699118,1,2,23.4500,2
889,1,1,1,26.000000,0,0,30.0000,0


In [None]:


# Target only: a single-column frame holding `survived`
df_survived_only = df_encoded[['survived']]

# Features only: every column except the target, so the label does not leak
# into the model inputs
df_without_survived = df_encoded.drop(columns=['survived'])

# Cast to float so that label-encoded columns still stored with an object
# dtype become plain numeric input for the classifier
X = df_without_survived.astype(float)
y = df_survived_only['survived']  # 1-D label vector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=109)

model = GaussianNB()
model.fit(X_train, y_train)

# Predict on the held-out rows; the model must be given the same feature
# columns it was trained on
predicted = model.predict(X_test)
print("Predicted Value:", predicted[:10])  # 0: did not survive, 1: survived
print("Accuracy:", model.score(X_test, y_test))