In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSV
# files read later in this notebook are reachable under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### seed값 고정하기


In [None]:
def seed_everything(seed):
    """Seed the global Python and NumPy random number generators.

    The seed is also exported as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in ``PYTHONHASHSEED``.
    """
    # NOTE(review): hash randomization is presumably fixed at interpreter
    # start-up, so this export likely only matters for child processes - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once for the whole notebook.
seed_everything(42)

### Dataset 읽어오기

1. 중복되지 않는 열은 'id' column임

In [None]:
# Training data.
df_train = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/BITAmin/LG/lg_aimers.csv"
)
# Test data (the rows of the submission file).
df_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/BITAmin/LG/lg_aimers_submission.csv"
)

# Show (rows, columns) of the train and test frames as the cell output.
tuple(frame.shape for frame in (df_train, df_test))

((59299, 29), (5271, 30))

### Data 전처리

In [None]:
# == Drop columns whose missing-value rate exceeds the threshold ==
# Threshold is a percentage; a column is dropped only when strictly above it.
NA_DROP_THRESHOLD_PCT = 90

# Missing-value percentage of every training column, computed in one pass
# (same arithmetic as a per-column isnull().sum() / len(df) * 100).
na_pct_by_col = df_train.isnull().sum() / len(df_train) * 100
df_train_drop_col = na_pct_by_col[na_pct_by_col > NA_DROP_THRESHOLD_PCT].index.tolist()

for col in df_train_drop_col:
    print(f"삭제된 column : {col}")

# Drop every flagged column in a single, non-inplace call so that a failure
# cannot leave the two frames partially modified.
df_train = df_train.drop(columns=df_train_drop_col)
# errors="ignore": a flagged column that the test frame does not have needs no
# removal, so it must not abort the cell with a KeyError.
df_test = df_test.drop(columns=df_train_drop_col, errors="ignore")

# == Remove duplicated rows (training frame only) ==
df_train = df_train.drop_duplicates()

print(f"\n중복 처리 난 뒤의 train shape : {df_train.shape}")



삭제된 column : id_strategic_ver
삭제된 column : it_strategic_ver
삭제된 column : idit_strategic_ver
삭제된 column : business_subarea
중복 처리 난 뒤의 train shape : (55779, 25)


### bant_submit

[Encoding]
* one_hot_encoding

In [None]:
# Frequency of each distinct value in the bant_submit column.
df_train.loc[:, 'bant_submit'].value_counts()

### customer_country

* 국가별로 나누기

* 확인 결과, 빈도가 낮은 값 중에도 실제로는 상위 20개 국가에 속하는 경우가 많음. 다만 이를 코드로 정리하기가 너무 어려워 일단 보류

* 이 컬럼은 확실히 one-hot-encoding 적용

In [None]:
# == Check missing values ==
# Percentage of rows in df_train whose customer_country is missing.
print(
    '결측치 ratio : ',
    df_train['customer_country'].isna().sum() / df_train.shape[0] * 100,
)

# The missing ratio is about 1.7%, so imputing with the most frequent value
# looks acceptable.

결측치 ratio :  1.7461768766023056


In [None]:
# Work on a one-column frame holding only the non-missing customer_country values.
cus_cty = df_train['customer_country'].to_frame().dropna()

# == Keep only the text after the last '/' ==
# Values without a '/' are left unchanged; this prepares the column for the
# most-frequent-value lookup.
cus_cty['customer_country'] = cus_cty['customer_country'].map(
    lambda country: country.rsplit('/', 1)[-1]
)

In [None]:
# Most frequent country among the non-missing values (India in the recorded run).
cus_cty_mode = (
    cus_cty.loc[:, 'customer_country']
    .value_counts()
    .idxmax()
)
cus_cty_mode

'India'

In [None]:
# Impute missing countries with the most frequent value, then keep only the
# text after the last '/' (values without '/' stay as they are).
df_train['customer_country'] = (
    df_train['customer_country']
    .fillna(cus_cty_mode)
    .map(lambda country: country.rsplit('/', 1)[-1])
)

# Inspect the resulting distribution.
df_train.loc[:, 'customer_country'].value_counts()

India                             17326
Brazil                             8174
United States                      3058
Mexico                             2511
Philippines                        2436
                                  ...  
7673 HEMPSTON CIR                     1
5555                                  1
PO Box 112292CarrolltonTX75011        1
1808 Lithgow RdCelinaTX75009          1
Manaus                                1
Name: customer_country, Length: 582, dtype: int64

In [None]:
# Lift the row-display limit so the full value_counts listing is printed.
# This is a global pandas option, so it stays in effect for later cells too.
pd.options.display.max_rows = None
df_train.loc[:, 'customer_country'].value_counts()

In [None]:
# Occurrences of every distinct customer_country value.
cus_cty_val_counts = df_train['customer_country'].value_counts()

# Values seen fewer than `threshold` times are collapsed into 'others'.
threshold = 20
df_train['customer_country'] = df_train['customer_country'].mask(
    df_train['customer_country'].map(cus_cty_val_counts) < threshold,
    'others',
)

# Distribution after bucketing.
df_train.loc[:, 'customer_country'].value_counts()

### business_unit

* 결측치 없음

* one-hot-encoding

* CM은 1건뿐이므로 해당 행 삭제

In [None]:
# Distribution before filtering. A bare expression that is not the last one in
# a cell is not displayed, so print it explicitly to keep the "before" view.
print(df_train['business_unit'].value_counts())

# == Remove CM rows ==
# Boolean mask marking the rows whose business_unit is 'CM'.
bus_condition = df_train['business_unit'].eq('CM')
df_train = df_train.loc[~bus_condition]

# Distribution after filtering (shown as the cell output).
df_train['business_unit'].value_counts()

ID          24863
AS          22649
IT           7979
Solution      287
Name: business_unit, dtype: int64