In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import re
from sklearn.decomposition import PCA

print(np.__version__)

1.26.3


여태 전처리 총 정리 코드

유니크 값 많은 상위 값 기준으로 예외처리

expected timeline w2v

In [59]:
df_train = pd.read_csv("/kaggle/input/b2b-customer/train.csv") # 학습용 데이터
df_test = pd.read_csv("/kaggle/input/b2b-customer/submission (1).csv") # 테스트 데이터(제출파일의 데이터)

In [60]:
# Solution, CM 제거

df_train = df_train[~df_train['business_unit'].isin(['Solution', 'CM'])]
df_train['business_unit'].unique()

array(['AS', 'ID', 'IT'], dtype=object)

In [61]:
import re
# '_' 제거
def remove_underbar(text):
    text = re.sub(r'[_]', ' ', text)
    return text
# '-' 제거
def remove_hyphen(text):
    text = re.sub(r'[-]', ' ', text)
    return text
# ' /' or '/ ' -> '/'
def remove_left_space(text):
    text = re.sub(r'\s/', '/', text)
    return text
def remove_right_space(text):
    text = re.sub(r'/\s', '/', text)
    return text
# '//' -> '/'
def remove_double_slash(text):
  text = re.sub(r'//', '/', text)
  return text

def text_cleaning(text):
    if type(text) != str:
        return text
    text = remove_underbar(text)
    text = remove_hyphen(text)
    text = remove_left_space(text)
    text = remove_right_space(text)
    text = remove_double_slash(text)
    return text

In [62]:
object_features = [f for f in df_train.columns.tolist() if df_train[f].dtype == object]
num_features = [f for f in df_train.columns.tolist() if f not in object_features and f != 'is_converted']

In [63]:
# upper -> lower
# (other, etc., others, other_) replace
for col in object_features:
    df_train[col] = df_train[col].str.lower()
    df_train[col] = df_train[col].replace({'other':'others', 'other_':'others', 'etc.':'others', 'etc':'others', 'etc_':'others'})
    df_train[col] = df_train[col].apply(text_cleaning)

    df_test[col] = df_test[col].str.lower()
    df_test[col] = df_test[col].replace({'other':'others', 'other_':'others', 'etc.':'others', 'etc':'others', 'etc_':'others'})
    df_test[col] = df_test[col].apply(text_cleaning)

# 특정 칼럼에 대해상위 몇 개의 값만 남기고 모두 예외처리

* business_unit
* customer_type
* customer_job
* inquiry_type
* business_area

모두 train 데이터를 기준으로 상위 값을 체크했음

In [64]:
df_train.loc[df_train['lead_desc_length'] >= 200, 'lead_desc_length'] = 1
df_train.loc[df_train['lead_desc_length'] < 200, 'lead_desc_length'] = 0

df_test.loc[df_test['lead_desc_length'] >= 200, 'lead_desc_length'] = 1
df_test.loc[df_test['lead_desc_length'] < 200, 'lead_desc_length'] = 0

In [65]:
# # 구간화
# df_train['lead_desc_length'] = pd.cut(df_train['lead_desc_length'],
#                                       bins=[-float('inf'), 200, float('inf')], 
#                                       labels=[0, 1], right=False)

# df_train['lead_desc_length'].value_counts()

In [66]:
# 상위 8 값만
customer_type_top = df_train['customer_type'].value_counts().index[:8]

# 나머지는 'else' 값으로
df_train.loc[~df_train['customer_type'].isin(customer_type_top), 'customer_type'] = 'else'

# 상위 8 값만
# customer_type_top = df_test['customer_type'].value_counts().index[:8]

# 나머지는 'else' 값으로
df_test.loc[~df_test['customer_type'].isin(customer_type_top), 'customer_type'] = 'else'

In [67]:
# # 확인
# print(df_train['customer_type'].value_counts())
# print()
# print('결측치 수: ', df_train['customer_type'].isnull().sum())

In [68]:
# 상위 10개 값만
customer_job_top = df_train['customer_job'].value_counts().index[:10]

# 나머지는 'else' 값으로
df_train.loc[~df_train['customer_job'].isin(customer_job_top), 'customer_job'] = 'else'

# 상위 10개 값만
# customer_job_top = df_test['customer_job'].value_counts().index[:10]

# 나머지는 'else' 값으로
df_test.loc[~df_test['customer_job'].isin(customer_job_top), 'customer_job'] = 'else'

In [69]:
# 상위 10개 값만
inquiry_type_top = df_train['inquiry_type'].value_counts().index[:10]

# 나머지는 'else' 값으로
df_train.loc[~df_train['inquiry_type'].isin(inquiry_type_top), 'inquiry_type'] = 'else'

# 상위 10개 값만
# inquiry_type_top = df_test['inquiry_type'].value_counts().index[:10]

# 나머지는 'else' 값으로
df_test.loc[~df_test['inquiry_type'].isin(inquiry_type_top), 'inquiry_type'] = 'else'

In [70]:
# 상위 10개 값만
business_area_top = df_train['business_area'].value_counts().index[:10]

# 나머지는 'else' 값으로
df_train.loc[~df_train['business_area'].isin(business_area_top), 'business_area'] = 'else'

# 상위 10개 값만
# inquiry_type_top = df_test['business_area'].value_counts().index[:10]

# 나머지는 'else' 값으로
df_test.loc[~df_test['business_area'].isin(business_area_top), 'business_area'] = 'else'

# product_category 매핑 + 상위 값 외 예외처리

In [71]:
# product_category의 외국어 매핑

df_train['product_category'] = df_train['product_category'].replace({'one:quick series':'lg one:quick series', 'ar condicionado residencial':'residential air conditioner', 'aire acondicionado residencial':'residential air conditioner',
                                   'تكييف وتبريد': 'vrf', 'نظام التدفق المتغيرvrf': 'vrf', 'مبرد (تشيلر)':'chiller','חימום':'heating', 'حلول التدفئة':'heating','פיצול מרובה':'multi-split','آخر':'etc.','אחר':'multi-split',
                                   'led 顯示屏':'led signage','oled 顯示屏':'oled signage','互動式顯示屏':'interactive signage','標準顯示屏':'standard signage','特別顯示屏':'special signage','酒店電視':'hotel tv','軟體':'software solution',
                                   '醫院電視':'hospital tv','高亮度顯示屏':'high brightness signage',"điều hòa trung tâm multi":'multi-split',"ฯลฯ":'etc.',"điều hòa cục bộ":'single-split',"điều hòa gia dụng":'residential air conditioner',
                                   "เครื่องปรับอากาศเผื่อที่อยู่อาศัย":'residential air conditioner',"khác":'etc.',"điều hòa trung tâm chiller":'chiller',"điều hòa trung tâm vrf":'vrf',"tv":'standard signage',
                                  "tv signage":'standard signage',"high brightness":'high brightness signage',"multi-split (plusieurs pièces)":'multi-split',"autre":'etc.',"climatiseur résidentiel":'residential air conditioner',
                                   "cac":'residential air conditioner',"rac/cac":'residential air conditioner',"systèmes de débit à réfrigérant variable (drv)":'vrf',
                                   "grzewanie (pompy ciepła)":'heating',"isıtma":'heating',"lainnya":'etc.',"soğutucu":'chiller',"vb.":'etc.',"ac rumah":'residential air conditioner',
                                    'מזגנים למקום מגורים':'residential air conditioner','تكييفات':'residential air conditioner'
                                   })

df_test['product_category'] = df_test['product_category'].replace({'one:quick series':'lg one:quick series', 'ar condicionado residencial':'residential air conditioner', 'aire acondicionado residencial':'residential air conditioner',
                                   'تكييف وتبريد': 'vrf', 'نظام التدفق المتغيرvrf': 'vrf', 'مبرد (تشيلر)':'chiller','חימום':'heating', 'حلول التدفئة':'heating','פיצול מרובה':'multi-split','آخر':'etc.','אחר':'multi-split',
                                   'led 顯示屏':'led signage','oled 顯示屏':'oled signage','互動式顯示屏':'interactive signage','標準顯示屏':'standard signage','特別顯示屏':'special signage','酒店電視':'hotel tv','軟體':'software solution',
                                   '醫院電視':'hospital tv','高亮度顯示屏':'high brightness signage',"điều hòa trung tâm multi":'multi-split',"ฯลฯ":'etc.',"điều hòa cục bộ":'single-split',"điều hòa gia dụng":'residential air conditioner',
                                   "เครื่องปรับอากาศเผื่อที่อยู่อาศัย":'residential air conditioner',"khác":'etc.',"điều hòa trung tâm chiller":'chiller',"điều hòa trung tâm vrf":'vrf',"tv":'standard signage',
                                  "tv signage":'standard signage',"high brightness":'high brightness signage',"multi-split (plusieurs pièces)":'multi-split',"autre":'etc.',"climatiseur résidentiel":'residential air conditioner',
                                   "cac":'residential air conditioner',"rac/cac":'residential air conditioner',"systèmes de débit à réfrigérant variable (drv)":'vrf',
                                   "grzewanie (pompy ciepła)":'heating',"isıtma":'heating',"lainnya":'etc.',"soğutucu":'chiller',"vb.":'etc.',"ac rumah":'residential air conditioner',
                                    'מזגנים למקום מגורים':'residential air conditioner','تكييفات':'residential air conditioner'
                                   })

In [73]:
df_train = df_train.reset_index()

In [74]:
# 'monitor signage,monior/monitor tv,system ac,vacuum cleaner,,, ' -> 'monitor'
# 비슷한 것들을 합치고 너무 비율이 작은 것들은 'the others' 로 묶었습니다

df_train['product_category'] = df_train['product_category'].replace({'commercial tv tv':'commercial tv', 'ur640s':'ur640'})
df_test['product_category'] = df_test['product_category'].replace({'commercial tv tv':'commercial tv', 'ur640s':'ur640'})


for i in range(len(df_train)):
    if not isinstance(df_train.loc[i,'product_category'], str):
        continue
    if 'vrf' in df_train.loc[i,'product_category']:
        df_train.loc[i,'product_category'] = 'vrf'
    elif 'multi-split' in df_train.loc[i,'product_category']:
        df_train.loc[i,'product_category'] = 'multi-split'
    elif 'single-split' in df_train.loc[i,'product_category']:
        df_train.loc[i,'product_category'] = 'single-split'
    elif 'chiller' in df_train.loc[i,'product_category']:
        df_train.loc[i,'product_category'] = 'chiller'
    elif 'monitor' in df_train.loc[i,'product_category']:
        df_train.loc[i,'product_category'] = 'monitor'
        
        
for i in range(len(df_test)):
    if not isinstance(df_test.loc[i,'product_category'], str):
        continue
    if 'vrf' in df_test.loc[i,'product_category']:
        df_test.loc[i,'product_category'] = 'vrf'
    elif 'multi-split' in df_test.loc[i,'product_category']:
        df_test.loc[i,'product_category'] = 'multi-split'
    elif 'single-split' in df_test.loc[i,'product_category']:
        df_test.loc[i,'product_category'] = 'single-split'
    elif 'chiller' in df_test.loc[i,'product_category']:
        df_test.loc[i,'product_category'] = 'chiller'
    elif 'monitor' in df_test.loc[i,'product_category']:
        df_test.loc[i,'product_category'] = 'monitor'

In [75]:
# 상위 10개 값만
product_category_top = df_train['product_category'].value_counts().index[:35]

# 나머지는 'else' 값으로
df_train.loc[~df_train['product_category'].isin(product_category_top), 'product_category'] = 'the others'

df_test.loc[~df_test['product_category'].isin(product_category_top), 'product_category'] = 'the others'

In [76]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59001 entries, 0 to 59000
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    59001 non-null  int64  
 1   bant_submit              59001 non-null  float64
 2   customer_country         58024 non-null  object 
 3   business_unit            59001 non-null  object 
 4   com_reg_ver_win_rate     14567 non-null  float64
 5   customer_idx             59001 non-null  int64  
 6   customer_type            59001 non-null  object 
 7   enterprise               59001 non-null  object 
 8   historical_existing_cnt  13756 non-null  float64
 9   id_strategic_ver         3444 non-null   float64
 10  it_strategic_ver         1121 non-null   float64
 11  idit_strategic_ver       4565 non-null   float64
 12  customer_job             59001 non-null  object 
 13  lead_desc_length         59001 non-null  int64  
 14  inquiry_type          

# 원 핫 + 라벨인코딩

In [77]:
del_col = ['customer_country', 'product_subcategory', 'product_modelname',
          'customer_country.1', 'business_subarea', 'historical_existing_cnt']

# expected_timeline

In [78]:
modify = ['id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver']

# expected_timeline

In [79]:
one_hot_col = ['business_unit', 'enterprise', 'response_corporate']

In [80]:
label_col = ['customer_idx', 'lead_owner'] # 이미 돼있음

In [81]:
df_train.drop('index', axis=1, inplace=True)
df_test.drop('id', axis=1, inplace=True)
df_train.drop(del_col, axis=1, inplace=True)
df_test.drop(del_col, axis=1, inplace=True)

In [82]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)

for col in one_hot_col:
                                                            # .values 가능
    ohe_train_cols = pd.DataFrame(ohe.fit_transform(np.array(df_train[col]).reshape(-1, 1)),
                 columns = [col + '_' + sub for sub in ohe.categories_[0]])
    df_train = pd.concat([df_train.drop(columns = col), ohe_train_cols], axis=1)
    
    ohe_test_cols = pd.DataFrame(ohe.transform(np.array(df_test[col]).reshape(-1, 1)),
                 columns = [col + '_' + sub for sub in ohe.categories_[0]])
    df_test = pd.concat([df_test.drop(columns = col), ohe_test_cols], axis=1)
    

In [83]:
object_features = [f for f in df_train.columns.tolist() if df_train[f].dtype == object]

In [84]:
object_features.remove('expected_timeline')

In [85]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df_tmp = pd.concat([df_train, df_test], axis=0)
for col in object_features:
                    
    df_tmp[col] = le.fit_transform(df_tmp[col])
    df_train[col] = df_tmp[col][:len(df_train)]
    df_test[col] = df_tmp[col][len(df_train)+1:]



In [86]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59001 entries, 0 to 59000
Data columns (total 77 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   bant_submit               59001 non-null  float64
 1   com_reg_ver_win_rate      14567 non-null  float64
 2   customer_idx              59001 non-null  int64  
 3   customer_type             59001 non-null  int64  
 4   id_strategic_ver          3444 non-null   float64
 5   it_strategic_ver          1121 non-null   float64
 6   idit_strategic_ver        4565 non-null   float64
 7   customer_job              59001 non-null  int64  
 8   lead_desc_length          59001 non-null  int64  
 9   inquiry_type              59001 non-null  int64  
 10  product_category          59001 non-null  int64  
 11  customer_position         59001 non-null  int64  
 12  expected_timeline         28211 non-null  object 
 13  ver_cus                   59001 non-null  int64  
 14  ver_pr

In [87]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 77 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   bant_submit               5271 non-null   float64
 1   com_reg_ver_win_rate      1788 non-null   float64
 2   customer_idx              5271 non-null   int64  
 3   customer_type             5270 non-null   float64
 4   id_strategic_ver          593 non-null    float64
 5   it_strategic_ver          53 non-null     float64
 6   idit_strategic_ver        646 non-null    float64
 7   customer_job              5270 non-null   float64
 8   lead_desc_length          5271 non-null   int64  
 9   inquiry_type              5270 non-null   float64
 10  product_category          5270 non-null   float64
 11  customer_position         5270 non-null   float64
 12  expected_timeline         2863 non-null   object 
 13  ver_cus                   5271 non-null   int64  
 14  ver_pro 

In [88]:
df_train

Unnamed: 0,bant_submit,com_reg_ver_win_rate,customer_idx,customer_type,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,...,response_corporate_lgesp,response_corporate_lgesw,response_corporate_lgeth,response_corporate_lgetk,response_corporate_lgett,response_corporate_lgeuk,response_corporate_lgeur,response_corporate_lgeus,response_corporate_lgevh,response_corporate_lgeyk
0,1.00,0.066667,32160,3,,,,9,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.00,0.066667,23122,3,,,,4,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.00,0.088889,1755,3,,,,5,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.00,0.088889,4919,3,,,,4,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.00,0.088889,17126,8,,,,2,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58996,1.00,,33747,3,,,,5,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58997,1.00,,33747,3,,,,5,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58998,0.75,0.040000,35420,8,,,,4,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58999,0.75,0.040000,19249,8,,,,5,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
nan_cols = ['com_reg_ver_win_rate', 'id_strategic_ver', 'it_strategic_ver',
            'idit_strategic_ver', 'ver_win_rate_x', 'ver_win_ratio_per_bu']

for col in nan_cols:
    df_train.loc[df_train[col].isna(), col] = 0
    df_test.loc[df_test[col].isna(), col] = 0


# Word2vec ('expected timeline)

In [90]:
df_train['expected_timeline']

0         less than 3 months
1         less than 3 months
2         less than 3 months
3         less than 3 months
4         less than 3 months
                ...         
58996    3 months ~ 6 months
58997    3 months ~ 6 months
58998      9 months ~ 1 year
58999     less than 3 months
59000       more than a year
Name: expected_timeline, Length: 59001, dtype: object

In [91]:
# 전처리 함수 선언
def preprocessing(text):
    # 개행문자 제거
    text = re.sub('\\\\n', ' ', str(text))
    # 영문, 숫자만 남기고 모두 제거
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', str(text))
    return text
sentences_train = df_train['expected_timeline'].apply(preprocessing)
sentences_test= df_test['expected_timeline'].apply(preprocessing)

In [92]:
for i in range(len(sentences_train)):
    sentences_train.iloc[i] = list(sentences_train.iloc[i].split())
for i in range(len(sentences_test)):
    sentences_test.iloc[i] = list(sentences_test.iloc[i].split())

In [93]:
sentences_train

0        [less, than, 3, months]
1        [less, than, 3, months]
2        [less, than, 3, months]
3        [less, than, 3, months]
4        [less, than, 3, months]
                  ...           
58996     [3, months, 6, months]
58997     [3, months, 6, months]
58998       [9, months, 1, year]
58999    [less, than, 3, months]
59000      [more, than, a, year]
Name: expected_timeline, Length: 59001, dtype: object

In [94]:
import gensim
w2v_model = gensim.models.Word2Vec(sentences_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=1)

In [95]:
%pip install numpy==1.22.4

Note: you may need to restart the kernel to use updated packages.


In [96]:
words = set(w2v_model.wv.index_to_key)
train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                      for ls in sentences_train], dtype=object)
test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                      for ls in sentences_test], dtype=object)

In [97]:
train_vect_avg = []
for v in train_vect:
    if v.size:
        train_vect_avg.append(v.mean(axis=0))
    else:
        train_vect_avg.append(np.zeros(100, dtype=float))

test_vect_avg = []
for v in test_vect:
    if v.size:
        test_vect_avg.append(v.mean(axis=0))
    else:
        test_vect_avg.append(np.zeros(100, dtype=float))

In [98]:
from sklearn.decomposition import PCA

num_components = 50
pca = PCA(n_components=num_components)
pca_result_train = pca.fit_transform(train_vect_avg)
pca_result_test = pca.fit_transform(test_vect_avg)

In [99]:
expected_timeline_cols = [f'expected_timeline{i}' for i in range(num_components)]
expected_timeline_train = pd.DataFrame(pca_result_train, columns=expected_timeline_cols)
expected_timeline_test = pd.DataFrame(pca_result_test, columns=expected_timeline_cols)

In [100]:
df_train.drop('expected_timeline', axis=1, inplace=True)
df_test.drop('expected_timeline', axis=1, inplace=True)

In [101]:
df_train = pd.concat([df_train, expected_timeline_train], axis=1)
df_test = pd.concat([df_test, expected_timeline_test], axis=1)

In [102]:
df_train.to_csv('train_preprocess.csv', index=False)
df_test.to_csv('submission_preprocess.csv', index=False)

# 모델 학습/검증

In [103]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    stratify=df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [104]:
model = DecisionTreeClassifier()

In [105]:
model.fit(x_train, y_train)

In [106]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [107]:
pred = model.predict(x_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  768   201]
 [  198 10634]]

정확도: 0.9662
정밀도: 0.7950
재현율: 0.7926
F1: 0.7938


In [108]:
df_train

Unnamed: 0,bant_submit,com_reg_ver_win_rate,customer_idx,customer_type,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,...,expected_timeline40,expected_timeline41,expected_timeline42,expected_timeline43,expected_timeline44,expected_timeline45,expected_timeline46,expected_timeline47,expected_timeline48,expected_timeline49
0,1.00,0.066667,32160,3,0.0,0.0,0.0,9,0,3,...,3.164900e-07,3.043594e-07,-8.943626e-08,3.144355e-07,6.061622e-07,3.220422e-08,-6.546520e-08,-4.830832e-07,3.493563e-07,4.292071e-07
1,1.00,0.066667,23122,3,0.0,0.0,0.0,4,0,3,...,3.164906e-07,3.043601e-07,-8.943748e-08,3.144347e-07,6.061625e-07,3.220473e-08,-6.546320e-08,-4.830826e-07,3.493547e-07,4.292073e-07
2,1.00,0.088889,1755,3,0.0,0.0,0.0,5,0,2,...,3.164903e-07,3.043608e-07,-8.943711e-08,3.144346e-07,6.061631e-07,3.220519e-08,-6.546276e-08,-4.830823e-07,3.493553e-07,4.292065e-07
3,1.00,0.088889,4919,3,0.0,0.0,0.0,4,0,3,...,3.164906e-07,3.043601e-07,-8.943685e-08,3.144345e-07,6.061630e-07,3.220498e-08,-6.546244e-08,-4.830829e-07,3.493549e-07,4.292071e-07
4,1.00,0.088889,17126,8,0.0,0.0,0.0,2,0,3,...,3.164906e-07,3.043600e-07,-8.943712e-08,3.144346e-07,6.061629e-07,3.220504e-08,-6.546264e-08,-4.830829e-07,3.493549e-07,4.292070e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58996,1.00,0.000000,33747,3,0.0,0.0,0.0,5,0,2,...,-7.678994e-07,-1.147775e-06,-3.708853e-07,-1.626695e-08,-9.429775e-07,-1.713044e-06,1.166401e-06,1.183131e-06,-8.040167e-07,-1.758758e-06
58997,1.00,0.000000,33747,3,0.0,0.0,0.0,5,0,2,...,-7.678994e-07,-1.147775e-06,-3.708853e-07,-1.626695e-08,-9.429775e-07,-1.713044e-06,1.166401e-06,1.183131e-06,-8.040167e-07,-1.758758e-06
58998,0.75,0.040000,35420,8,0.0,0.0,0.0,4,0,0,...,-2.158623e-06,3.127939e-06,1.053751e-06,3.682969e-06,9.448735e-06,6.024517e-06,-3.066002e-06,-8.459886e-06,3.950318e-06,6.779321e-06
58999,0.75,0.040000,19249,8,0.0,0.0,0.0,5,0,0,...,3.164906e-07,3.043601e-07,-8.943717e-08,3.144346e-07,6.061628e-07,3.220501e-08,-6.546254e-08,-4.830828e-07,3.493548e-07,4.292070e-07


In [109]:
df_test

Unnamed: 0,bant_submit,com_reg_ver_win_rate,customer_idx,customer_type,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,lead_desc_length,inquiry_type,...,expected_timeline40,expected_timeline41,expected_timeline42,expected_timeline43,expected_timeline44,expected_timeline45,expected_timeline46,expected_timeline47,expected_timeline48,expected_timeline49
0,0.00,0.073248,47466,,0.0,0.0,0.0,,0,,...,4.379186e-16,5.243558e-16,-9.589458e-16,-4.707791e-16,4.475868e-16,-7.255220e-17,-2.210806e-16,-2.738386e-16,-1.326248e-16,-1.528264e-16
1,0.25,0.000000,5405,3.0,0.0,0.0,0.0,4.0,0,3.0,...,-2.092640e-16,8.791192e-17,1.293815e-16,-4.281686e-16,5.849078e-16,6.661296e-17,4.638688e-16,-2.593921e-16,4.269631e-17,-2.372970e-16
2,1.00,0.000000,13597,8.0,0.0,0.0,0.0,6.0,0,3.0,...,7.627422e-17,3.130029e-16,-3.422158e-16,1.892514e-16,-1.243738e-16,1.379405e-16,-3.792776e-16,7.379998e-16,-8.314423e-17,-5.079735e-16
3,0.50,0.118644,17204,2.0,0.0,0.0,0.0,10.0,0,3.0,...,-3.662636e-17,6.657173e-17,-3.839985e-16,1.672836e-16,1.465604e-16,-1.484965e-16,-1.487133e-16,-3.443948e-16,-1.350986e-16,2.710708e-17
4,1.00,0.074949,2329,3.0,1.0,0.0,1.0,5.0,0,3.0,...,3.193247e-16,-2.173355e-16,-4.492742e-16,3.630048e-16,-4.544584e-16,3.332150e-17,3.538377e-16,-1.156820e-15,-2.192752e-16,1.269841e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,0.50,0.000000,40292,2.0,0.0,0.0,0.0,10.0,0,3.0,...,-6.753327e-17,2.885285e-16,1.209913e-16,-1.226058e-16,5.393576e-16,-7.630161e-16,-3.368123e-16,-9.025349e-17,-8.399899e-16,3.482202e-16
5267,0.25,0.000000,47466,2.0,0.0,0.0,0.0,4.0,0,3.0,...,-1.599542e-16,1.782724e-16,9.733785e-17,-1.377177e-16,5.517793e-16,-7.455231e-16,-2.972051e-16,-9.249480e-17,-8.454649e-16,3.576611e-16
5268,0.75,0.000000,46227,8.0,0.0,0.0,0.0,4.0,0,0.0,...,2.612864e-15,2.060250e-15,1.985029e-15,-4.087738e-16,8.589568e-16,6.137268e-16,5.819730e-16,3.705421e-16,-2.305194e-16,6.001036e-16
5269,0.00,0.000000,45667,3.0,0.0,0.0,0.0,4.0,0,0.0,...,-8.481968e-16,-5.532412e-16,-4.468886e-16,8.927039e-16,8.985525e-16,-7.244708e-16,-2.501117e-16,-1.535974e-16,-1.316737e-15,-3.853709e-16


In [110]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted"], axis=1)

In [113]:
df_test[col].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
5266    False
5267    False
5268    False
5269    False
5270    False
Name: ver_win_ratio_per_bu, Length: 5271, dtype: bool

In [116]:
test_pred = model.predict(x_test.fillna(0)) # test의 1번만 NaN값이 조금 있음
sum(test_pred) # True로 예측된 개수

1021

In [126]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("/kaggle/input/b2b-customer/submission (1).csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

===================================