In [32]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
%matplotlib inline

# Load the raw tables: the training set, and the scoring set stored in submission.csv.
df_train = pd.read_csv(filepath_or_buffer='./train.csv')
df_test = pd.read_csv(filepath_or_buffer='./submission.csv')

In [33]:
# Columns kept for this analysis: four numeric features followed by the label.
col = [
    # Number of times previously converted.
    "historical_existing_cnt",
    # Sales conversion rate based on business unit and region.
    "com_reg_ver_win_rate",
    # Success ratio by vertical among all leads.
    "ver_win_rate_x",
    # Ratio of converted samples per business unit for a given vertical level 1.
    "ver_win_ratio_per_bu",
    # Target label.
    "is_converted",
]

In [34]:
# Restrict both splits to the columns listed in `col` (all rows, same column order).
df_train = df_train.loc[:, col]
df_test = df_test.loc[:, col]

In [35]:
# Count the training rows whose label `is_converted` is True (actual labels, not
# predictions). The vectorised Series.sum() avoids iterating the Series in Python.
num_true = (df_train['is_converted'] == 1).sum()
print(f'Number of Total Train length: {len(df_train)}')
print(f"Number of True: {num_true}")
print(f'Number of Total Test length: {len(df_test)}')

Number of Total Train length: 59299
Number of True: 4850
Number of Total Test length: 5271


In [36]:
# Summarise the training frame: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   historical_existing_cnt  13756 non-null  float64
 1   com_reg_ver_win_rate     14568 non-null  float64
 2   ver_win_rate_x           18417 non-null  float64
 3   ver_win_ratio_per_bu     15304 non-null  float64
 4   is_converted             59299 non-null  bool   
dtypes: bool(1), float64(4)
memory usage: 1.9 MB


# historical_existing_cnt  

In [37]:
# Replace missing counts with 0 in both splits; the cast keeps the column float64.
for frame in (df_train, df_test):
    frame['historical_existing_cnt'] = (
        frame['historical_existing_cnt'].fillna(0).astype('float64')
    )

# com_reg_ver_win_rate

In [39]:
# Replace missing win rates with 0 in both splits; the cast keeps the column float64.
for frame in (df_train, df_test):
    frame['com_reg_ver_win_rate'] = (
        frame['com_reg_ver_win_rate'].fillna(0).astype('float64')
    )

In [40]:
# Display the training column after the zero-fill in the previous cell.
df_train['com_reg_ver_win_rate']

0        0.066667
1        0.066667
2        0.088889
3        0.088889
4        0.088889
           ...   
59294    0.000000
59295    0.040000
59296    0.040000
59297    0.040000
59298    0.000000
Name: com_reg_ver_win_rate, Length: 59299, dtype: float64

# ver_win_rate_x

In [41]:
# Display the training column as loaded; its missing values are filled in the next cell.
df_train['ver_win_rate_x']

0        0.003079
1        0.003079
2        0.003079
3        0.003079
4        0.003079
           ...   
59294    0.000026
59295    0.000026
59296    0.000026
59297    0.000026
59298    0.000026
Name: ver_win_rate_x, Length: 59299, dtype: float64

In [42]:
# Replace missing vertical win rates with 0 in both splits; the cast keeps float64.
for frame in (df_train, df_test):
    frame['ver_win_rate_x'] = (
        frame['ver_win_rate_x'].fillna(0).astype('float64')
    )

# ver_win_ratio_per_bu

In [43]:
# Display the training column as loaded; its missing values are filled in the next cell.
df_train['ver_win_ratio_per_bu']

0        0.026846
1        0.026846
2        0.026846
3        0.026846
4        0.026846
           ...   
59294    0.028777
59295    0.028777
59296    0.028777
59297    0.028777
59298         NaN
Name: ver_win_ratio_per_bu, Length: 59299, dtype: float64

In [44]:
# Replace missing per-business-unit ratios with 0 in both splits; the cast keeps float64.
for frame in (df_train, df_test):
    frame['ver_win_ratio_per_bu'] = (
        frame['ver_win_ratio_per_bu'].fillna(0).astype('float64')
    )

In [45]:
# Verify that no column has missing values left in either split.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   historical_existing_cnt  59299 non-null  float64
 1   com_reg_ver_win_rate     59299 non-null  float64
 2   ver_win_rate_x           59299 non-null  float64
 3   ver_win_ratio_per_bu     59299 non-null  float64
 4   is_converted             59299 non-null  bool   
dtypes: bool(1), float64(4)
memory usage: 1.9 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   historical_existing_cnt  5271 non-null   float64
 1   com_reg_ver_win_rate     5271 non-null   float64
 2   ver_win_rate_x           5271 non-null   float64
 3   ver_win_ratio_per_bu     5271 non-null   float64
 4   is_converted             

In [46]:
# Feature-only versions of both splits (label column removed) for export.
PMI_df_train = df_train.drop(columns='is_converted')
PMI_df_test = df_test.drop(columns='is_converted')

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing; on a fresh checkout the export would fail otherwise.
output_dir = Path('./preprocessing_data')
output_dir.mkdir(parents=True, exist_ok=True)

# The row index is still written as the first column, as before.
PMI_df_train.to_csv(output_dir / 'PMI(성과측정정보)_train.csv')
PMI_df_test.to_csv(output_dir / 'PMI(성과측정정보)_test.csv')

In [9]:
# Encode every object-dtype column as the mean of `is_converted` per category
# (computed on the training frame), then keep only float64/bool columns.

# Work on copies so the encoding columns are added to new frames.
df_train_transformed = df_train.copy()
df_test_transformed = df_test.copy()

# Columns with 'object' dtype are the ones that get encoded.
object_columns = df_train_transformed.select_dtypes(include=['object']).columns

# Per-column lookup: category value -> mean of is_converted in the training frame.
conversion_rate_dict = {}

# NOTE(review): the rates are computed on the full training frame and mapped back
# onto the same rows, so each training row's own label contributes to its feature
# value (target leakage). Consider out-of-fold encoding -- TODO confirm.
for column in object_columns:
    # Mean of is_converted for each category of this column.
    conversion_rate = df_train_transformed.groupby(column)['is_converted'].mean()
    conversion_rate_dict[column] = conversion_rate

    new_column_name = f"{column}_conversion_rate"

    # Map each row's category to its training conversion rate.
    df_train_transformed[new_column_name] = df_train_transformed[column].map(conversion_rate)
    test_rates = df_test_transformed[column].map(conversion_rate)

    # Test categories that were not seen in training map to NaN; fill them with the
    # mean of the mapped test values. The result is assigned back explicitly:
    # calling fillna(inplace=True) on a column selection is a chained in-place
    # update, which leaves the frame unchanged under pandas Copy-on-Write.
    df_test_transformed[new_column_name] = test_rates.fillna(test_rates.mean())

# Keep only the float64 and bool columns (drops the original object columns).
df_train = df_train_transformed.select_dtypes(include=['float64', 'bool'])
df_test = df_test_transformed.select_dtypes(include=['float64', 'bool'])

# df_train and df_test now hold the encoded features.

In [10]:
# Display the training frame produced by the encoding cell.
# NOTE(review): df_train.head() would keep the stored output smaller.
df_train

Unnamed: 0,is_converted,lead_owner_conversion_rate,expected_timeline_conversion_rate
0,True,0.046620,0.101449
1,True,0.750000,0.101449
2,True,0.325000,0.101449
3,True,0.379310,0.101449
4,True,0.789474,0.101449
...,...,...,...
59294,False,0.000000,0.076204
59295,False,0.000000,0.071753
59296,False,0.000000,0.101449
59297,False,0.000000,0.041019


In [11]:
from pycaret.classification import *

# Keyword arguments for the PyCaret experiment. The commented entries are options
# that are currently switched off.
setup_options = {
    'data': df_train,
    'target': 'is_converted',
    'session_id': 123,
    'use_gpu': True,
    # 'normalize': True,                     # normalise the data
    # 'normalize_method': 'minmax',          # normalisation method
    # 'feature_selection': True,             # feature selection
    # 'remove_multicollinearity': True,      # remove multicollinearity
    # 'categorical_imputation': 'lightgbm',  # original note said "fill with the mode"
    # 'imputation_type': 'iterative',        # use iterative imputation
    # 'categorical_features': col[:-1],
    'verbose': True,
}
clf = setup(**setup_options)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


Unnamed: 0,Description,Value
0,Session id,123
1,Target,is_converted
2,Target type,Binary
3,Original data shape,"(59299, 3)"
4,Transformed data shape,"(59299, 3)"
5,Transformed train set shape,"(41509, 3)"
6,Transformed test set shape,"(17790, 3)"
7,Numeric features,2
8,Rows with missing values,52.0%
9,Preprocess,True


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


In [12]:
# Compare candidate models with cross-validation, sorted by F1; the result of the
# comparison is stored in best_model.
best_model = compare_models(sort='F1', cross_validation=True)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9338,0.9356,0.511,0.6149,0.558,0.5226,0.5253,0.015
lr,Logistic Regression,0.9388,0.9356,0.4015,0.7299,0.5174,0.4877,0.513,0.03
nb,Naive Bayes,0.9258,0.904,0.4701,0.5544,0.5085,0.4686,0.4707,0.012
qda,Quadratic Discriminant Analysis,0.9257,0.9038,0.4704,0.5538,0.5084,0.4685,0.4705,0.013
lightgbm,Light Gradient Boosting Machine,0.9387,0.9378,0.3832,0.7433,0.5052,0.4761,0.5062,0.111
rf,Random Forest Classifier,0.9366,0.919,0.3935,0.7025,0.5035,0.4726,0.4958,0.321
ridge,Ridge Classifier,0.9394,0.0,0.3711,0.7689,0.5001,0.472,0.508,0.018
gbc,Gradient Boosting Classifier,0.939,0.9392,0.3711,0.7628,0.4987,0.4704,0.5055,0.43
knn,K Neighbors Classifier,0.9318,0.848,0.413,0.6263,0.497,0.4621,0.474,0.159
et,Extra Trees Classifier,0.9369,0.9071,0.3785,0.7175,0.4946,0.4644,0.492,0.263
