In [1]:
#!pip install seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time

#!pip install geopy
#!pip install googletrans==3.1.0a0
from googletrans import Translator
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import WordPunctTokenizer

#import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
#!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util

In [2]:
train = pd.read_csv('train.csv')
submission = pd.read_csv('submission.csv')

In [4]:
def translate_func(text, attempt = 1, max_attempt = 5):
    translator = Translator()
    try:
        return translator.translate(text, dest = "en").text  # 번역
    except:          # 에러의 경우
        if attempt <= max_attempt:   # 5번 이하로 다시 시도
            return translate_func(text, attempt = attempt + 1)
        raise

# Customer_Job

In [7]:
original = len(set(train.customer_job))

In [8]:
# 번역
job_lst = set(train.customer_job)
job_lst_trans = [translate_func(i) if not pd.isna(i) else i for i in job_lst] 
job_dict = pd.DataFrame([job_lst, job_lst_trans]).T
job_dict.columns = ["job_lst", "job_lst_trans"]
job_dict = job_dict.set_index("job_lst").T.to_dict()
train.customer_job = [job_dict[i]["job_lst_trans"] if not pd.isna(i) else i for i in train.customer_job]

job_lst = set(submission.customer_job)
job_lst_trans = [translate_func(i) if not pd.isna(i) else i for i in job_lst] 
job_dict = pd.DataFrame([job_lst, job_lst_trans]).T
job_dict.columns = ["job_lst", "job_lst_trans"]
job_dict = job_dict.set_index("job_lst").T.to_dict()
submission.customer_job = [job_dict[i]["job_lst_trans"] if not pd.isna(i) else i for i in submission.customer_job]

In [12]:
original_trans = len(set(train.customer_job))

In [9]:
# 방법 1. TfidfVectorizer와 코사인 유사도 이용
# 글자만 고려
customer_set = sorted(list(set(train.customer_job.dropna())))
le_train = len(customer_set)

# 특수문자 제거 및 토큰화
customer_re = [re.sub(r"[.,~!@#$%^&*\(\)_+=\-~|\\\"\';:\?/\<\>\[\]\{\}]", " ", customer) for customer in customer_set]
customer_re = [customer.strip().lower() for customer in customer_re]
customer_token = [WordPunctTokenizer().tokenize(i) for i in customer_re]

# 불용어 사전
stop = stopwords.words('english')
stop.remove("it") # IT 때문에 삭제

# 불용어 제거
customer_stop = []
for i in customer_token:
    for j in i:
        sent = i
        if j in stop: # 불용어 사전에 있으면 제거
            sent.remove(j)
    customer_stop.append(sent)
customer_stop = [" ".join(i) for i in customer_stop] # 다시 결합

# 문장 벡터화
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(customer_stop)

In [13]:
# 방법 1-1. 유사하다고 생각되는 모든 값을 그룹화
# 장점 : 카테고리수를 많이 줄일 수 있음, 단점 : 오차가 큼
# 알고리즘
# n*n으로 구성된 유사도 matrix를 순차적으로 돌아가며 확인한다.
# 유사도 기준(standard)를 넘으면 cos_similar에 key를 i번째 값, value를 j 번째 값으로 넣고, customer_stop의 j번째 값을 i번째 값으로 변경


# job들을 각각 하나씩 비교해 코사인 유사도가 standard를 넘는 것을 추출 및 그룹화 되는 첫번째 값으로 전부 변경
standard = 0.6
cos = cosine_similarity(tfidf_matrix, tfidf_matrix) > standard # 유사도가 0.6 초과인 경우만 True
cos_similar = dict()
c = customer_stop.copy() # 테스트를 위한 copy
for i in range(len(cos)):       # i는 1부터 n까지 loop
    for j in range(i+1, len(cos[i])): # 대각을 기준으로 위쪽만 고려(위와 아래는 똑같으니 아래쪽은 생략)
        if c[i] == "" or c[j] == "":    # 빈값이면 그대로 추출
            pass
        elif cos[i][j] and c[i] in cos_similar.keys():  # True 즉, 유사도가 0.6 초과이고, 이미 cos_similar에 있는 경우
            cos_similar[c[j]] = cos_similar[c[i]]  # 디렉터리에 넣음
            c[j] = c[i]            # 값 자체를 변경
        elif cos[i][j]:            # 유사도가 0.6 초과인 경우
            cos_similar[c[j]] = c[i]  
            c[j] = c[i]


# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_job_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_job] # country 기준 conversion으로 변경
len_1 = len(set(customer_job_t)); len_1

294

In [14]:
# manage라는 단어를 포함한 단어들이 어떻게 변경되었는지 확인
manage = [i for i in customer_set if "manage" in i]
manage_trans_1 = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in manage]

In [15]:
# 어떤 값이 변화되었는지 확인하기 위한 코드
# key 값이 변화된 값, value가 변화되기 전의 값으로 구성
def test_func(cos_similar):
    test = dict()
    for i in cos_similar.keys():
        if cos_similar[i] not in test.keys():
            test[cos_similar[i]] = [i]
        else:
            test[cos_similar[i]] += [i]
    return test
test_func(cos_similar)

{'v project manager': ['project manager',
  'designer project manager',
  'digital project manager',
  'general manager project manager',
  'manager',
  'owner project manager',
  'producer project manager',
  'program project manager',
  'project manager estimator',
  'project manager principal',
  'project manager designer',
  'project sales manager',
  'r project manager'],
 'account management': ['management'],
 'admin': ['admin assistant', 'it admin'],
 'administrative': ['administrative assistant'],
 'architect': ['architect consultant',
  'architect owner',
  'project architect',
  'solutions architect',
  'consultant',
  'owner'],
 'art design': ['architect',
  'art e design',
  'design',
  'design install',
  'design engineer',
  'design purchaser',
  'design install training support',
  'engineering design install',
  'hardware design engineer',
  'senior design engineer'],
 'hardware': ['art design', 'hardware selection'],
 'artist lead equipment selection': ['equipment sele

In [16]:
# 방법 1-2. 유사하다고 생각되는 모든 값을 그룹화
# 장점 : 조금더 깐깐하게 그룹화해서 오차가 적음
# 단점 : 카테고리 수가 많이 줄어들진 않음
# 알고리즘 : 유사도가 높은 것부터 차례로 cos_similar에 넣음

# 유사도 기준
standard = 0.6
cos = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 대각행렬은 nan -> 고려하지 않기 위해
cos = np.array(cos)
np.fill_diagonal(cos, np.nan)

cos_similar = dict()
cos = pd.DataFrame(cos)
c = customer_stop.copy()
while cos.max().max() > standard: # 전부 유사도가 0.6 이하가 될때까지
    i = cos.max(axis = 1).idxmax()  # 최댓값을 가진 행 추출
    j = cos.iloc[i,:].idxmax()     # 최댓값을 가진 열 추출
    
    if customer_stop[i] == "" or customer_stop[j] == "":  # 둘 중 하나라도 빈값이면 수행 x
        pass
    elif c[i] in cos_similar.keys():  # 이미 딕셔너리에 있는 경우
        cos_similar[c[j]] = cos_similar[c[i]]
    else:
        cos_similar[c[j]] = c[i]
    
    cos.iloc[i, j] = 0  # 처리된 단어간의 유사도는 0으로 설정
    cos.iloc[j, i] = 0

c = [cos_similar[i] if i in cos_similar.keys() else i for i in c] # 저장된 딕셔너리에 맞춰 값 변경

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_job_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_job] # country 기준 conversion으로 변경
len_2 = len(set(customer_job_t)); len_2

313

In [17]:
manage_trans_2 = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in manage]

In [18]:
test_func(cos_similar)

{'art design': ['art design',
  'art e design',
  'design consultant',
  'design',
  'design install',
  'design install training support'],
 'business development': ['business development'],
 'creation design': ['creation design'],
 'director': ['engineering director',
  'director it',
  'it director',
  'operations manager',
  'it integrator',
  'purchasing director',
  'technical director',
  'project director',
  'it',
  'director operations',
  'director purchaser',
  'it specialist',
  'director engineering',
  'it administrator',
  'project administrator',
  'it support'],
 'human resources': ['human resources'],
 'information technology': ['information technology \u200b',
  'information technology',
  'it information technology'],
 'medical imaging specialist': ['medical imaging specialist',
  'specialist medical imaging'],
 'medical solution provider': ['medical solution provider',
  'medical solution provider \u200b',
  'solution provider'],
 'military protective services': [

In [19]:
# 방법 1-3. 길이가 적은 값으로 그룹화
# 장점 : 핵심 단어로 분류가 가능
# 단점 : 더 깐깐하게 그룹화
# 알고리즘
# 가장 유사도가 높은 두 단어를 추출
# 두 단어 중 길이가 짧은 것을 value, 길이가 긴 것을 key로 설정해 길이가 긴 것으로 짧은 것으로 변환할 수 있도록 함
# 길이가 짧은 단어의 유사도를 두고, 길이가 긴 단어의 유사도는 삭제 -> 그래야 길이가 긴 단어와 유사도가 높은 단어가 포함되지 않음
standard = 0.6
cos = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 대각행렬은 nan -> 고려하지 않기 위해
cos = np.array(cos)
np.fill_diagonal(cos, np.nan)

cos_similar = dict()
cos = pd.DataFrame(cos)
c = customer_stop.copy()

while cos.max().max() > standard: # 전부 유사도가 0.6 이하가 될때까지
    i = cos.max(axis = 1).idxmax()  # 최댓값을 가진 행 추출
    j = cos.iloc[i,:].idxmax()     # 최댓값을 가진 열 추출
    
    # 처리된 단어간의 유사도는 0으로 설정
    cos.iloc[i, j] = 0
    cos.iloc[j, i] = 0

    # 길이가 짧으면 idx, 길면 del_idx
    idx, del_idx = [i, j] if len(c[i]) <= len(c[j]) else [j, i]
    
    if c[idx] == "": # ""는 무조건 짧은 쪽일 것이고, 만약 idx가 ""이면 ""인 쪽을 삭제하기 위해 del_idx로 변경 -> ""는 it나 tech 등 상관없는 단어와 유사도가 높음
        idx, del_idx = del_idx, idx
    elif c[idx] in cos_similar.keys():  # 이미 딕셔너리에 있는 경우
        cos_similar[c[del_idx]] = cos_similar[c[idx]]
    else:
        cos_similar[c[del_idx]] = c[idx]
    
    # 길이가 긴 단어를 우선 삭제 -> it, it developer, tech developer 이라고 한다면 it라는 핵심 단어로 우선 그룹화 그러면 it와 관련있는 단어들이 그룹화
    # 이전 알고리즘은 it developer가 먼저 나오거나 유사도가 높으면 위 세개의 단어가 it developer로 묶일 수 있었음 -> 그래서 이 알고리즘이 더 깐깐
    # 단어 길이가 짧은 것은 핵심단어라는 뜻이므로 이를 기준으로 그룹화
    cos = cos.drop(del_idx).drop(del_idx, axis = 1).reset_index(drop = True) # 길이가 긴 단어의 유사도 행을 우선 삭제
    cos.columns = list(range(len(cos))) # index와 columns 정렬
    del c[del_idx]  # 단어 자체도 삭제
    
c = [cos_similar[i] if i in cos_similar.keys() else i for i in customer_stop]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_job_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_job] # country 기준 conversion으로 변경
len_3 = len(set(customer_job_t)); len_3

323

In [20]:
manage_trans_3 = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in manage]

In [21]:
test_func(cos_similar)

{'design': ['art design', 'design install'],
 'art design': ['art e design'],
 'business development': ['business development'],
 'creation design': ['creation design'],
 'director engineering': ['engineering director'],
 'human resources': ['human resources'],
 'information technology': ['information technology \u200b',
  'information technology',
  'it information technology'],
 'medical imaging specialist': ['medical imaging specialist',
  'specialist medical imaging'],
 'solution provider': ['medical solution provider'],
 'medical solution provider': ['medical solution provider \u200b'],
 'military protective services': ['military protective services'],
 'quality assurance': ['quality assurance'],
 'radiology professional': ['radiology professional'],
 'project manager': ['v project manager',
  'r project manager',
  'owner project manager',
  'program project manager'],
 'arts design': ['arts design'],
 'manager': ['project manager'],
 'tech': ['tech', 'it tech', 'tech service'],


In [24]:
# 방법 2. 사전학습모델 SentenceTransformer
# 단어의 숨겨진 의미까지 파악해 위 방법보다 유연
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
customer_set = sorted(list(set(train.customer_job.dropna())))
le_train = len(customer_set)

# 특수문자 제거 및 토큰화
customer_re = [re.sub(r"[.,~!@#$%^&*\(\)_+=\-~|\\\"\';:\?/\<\>\[\]\{\}]", " ", customer) for customer in customer_set]
customer_re = [customer.strip().lower() for customer in customer_re]
customer_token = [WordPunctTokenizer().tokenize(i) for i in customer_re]

# 불용어 사전
stop = stopwords.words('english')
stop.remove("it") # IT 때문에 삭제

# 불용어 제거
customer_stop = []
for i in customer_token:
    for j in i:
        sent = i
        if j in stop: # 불용어 사전에 있으면 제거
            sent.remove(j)
    customer_stop.append(sent)
customer_stop = [" ".join(i) for i in customer_stop] # 다시 결합

embeddings = model.encode(customer_stop, convert_to_tensor=True)

  torch.utils._pytree._register_pytree_node(


In [25]:
# 방법 2-1. 방법 1-1과 동일
standard = 0.6

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
cosine_scores = cosine_scores>standard
cos_similar = dict()
c = customer_stop.copy() # 테스트를 위한 copy
for i in range(len(cos)):
    for j in range(i+1, len(cosine_scores[i])):
        if c[i] == "" or c[j] == "":
            pass
        elif cosine_scores[i][j] and c[i] in cos_similar.keys():
            cos_similar[c[j]] = cos_similar[c[i]]
            c[j] = c[i]
        elif cosine_scores[i][j]:
            cos_similar[c[j]] = c[i]
            c[j] = c[i]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_job_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_job] # country 기준 conversion으로 변경
len_4 = len(set(customer_job_t)); len_4

257

In [26]:
manage_trans_4 = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in manage]

In [27]:
test_func(cos_similar)

{'v project manager': ['project manager',
  'construction manager',
  'designer project manager',
  'digital project manager',
  'general manager project manager',
  'it manager',
  'owner project manager',
  'producer project manager',
  'program project management',
  'program project manager',
  'project administrator',
  'project coordinator',
  'project director',
  'project engineer',
  'project head',
  'project manage',
  'project management',
  'project management program project management',
  'project manager estimator',
  'project manager principal',
  'project manager designer',
  'project sales manager',
  'projection manager',
  'r project manager',
  'site manager',
  'tv studio manager'],
 'account executive manager': ['account management',
  'office manager',
  'operations manager',
  'accounting',
  'asset management',
  'general management',
  'management',
  'product management',
  'accounts payable'],
 'it project lead': ['v project manager'],
 'admin': ['admin as

In [28]:
# 방법 2-2. 방법 1-2와 동일

standard = 0.6

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
cosine_scores = np.array(cosine_scores)
np.fill_diagonal(cosine_scores, np.nan)

cos_similar = dict()
cosine_scores = pd.DataFrame(cosine_scores)
c = customer_stop.copy()
while cosine_scores.max().max() > standard:
    i = cosine_scores.max(axis = 1).idxmax()
    j = cosine_scores.iloc[i,:].idxmax()
    
    
    if customer_stop[i] == "" or customer_stop[j] == "":
        pass
    elif customer_stop[i] in cos_similar.keys():
        cos_similar[c[j]] = cos_similar[c[i]]
    else:
        cos_similar[c[j]] = c[i]
    
    cosine_scores.iloc[i, j] = 0
    cosine_scores.iloc[j, i] = 0
    
c = [cos_similar[i] if i in cos_similar.keys() else i for i in customer_stop]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_job_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_job] # country 기준 conversion으로 변경
len_5 = len(set(customer_job_t)); len_5

251

In [29]:
manage_trans_5 = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in manage]

In [30]:
test_func(cos_similar)

{'art design': ['art design',
  'creation design',
  'program project management',
  'arts design',
  'designers',
  'art e design',
  'project management',
  'designer project manager',
  'design engineer',
  'designer producer',
  'purchaser it installer',
  'art installation',
  'design',
  'supplier installation',
  'engineering',
  'design installation company',
  'design purchaser',
  'solutions architect',
  'installation purchaser'],
 'clinical specialist': ['medical solution provider',
  'medical imaging specialist',
  'surgery professional'],
 'business development': ['research development', 'business development'],
 'v project manager': ['project manager',
  'project management program project management',
  'project lead',
  'project sales manager',
  'owner project manager',
  'digital project manager',
  'project coordinator',
  'manager',
  'site manager',
  'project manage',
  'it manager',
  'tv studio manager',
  'project team member',
  'studio manager',
  'resource 

In [31]:
# 방법 2-3. 방법 1-3과 동일
standard = 0.6

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
cosine_scores = np.array(cosine_scores)
np.fill_diagonal(cosine_scores, np.nan)

cos_similar = dict()
cosine_scores = pd.DataFrame(cosine_scores)
c = customer_stop.copy()
while cosine_scores.max().max() > standard:
    i = cosine_scores.max(axis = 1).idxmax()
    j = cosine_scores.iloc[i,:].idxmax()
    
    cosine_scores.iloc[i, j] = 0
    cosine_scores.iloc[j, i] = 0

    idx, del_idx = [i, j] if len(c[i]) <= len(c[j]) else [j, i]
    
    if c[idx] == "":
        idx, del_idx = del_idx, idx
    elif c[idx] in cos_similar.keys():
        cos_similar[c[del_idx]] = cos_similar[c[idx]]
    else:
        cos_similar[c[del_idx]] = c[idx]
    
    cosine_scores = cosine_scores.drop(del_idx).drop(del_idx, axis = 1).reset_index(drop = True)
    cosine_scores.columns = list(range(len(cosine_scores)))
    del c[del_idx]
    
c = [cos_similar[i] if i in cos_similar.keys() else i for i in customer_stop]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_job_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_job] # country 기준 conversion으로 변경
len_6 = len(set(customer_job_t)); len_6

317

In [32]:
manage_trans_6 = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in manage]

In [33]:
test_func(cos_similar)

{'design': ['art design', 'design build', 'designer', 'design purchaser'],
 'art design': ['creation design',
  'arts design',
  'art e design',
  'art installation'],
 'medical solution provider': ['medical solution provider',
  'medical solution provider \u200b'],
 'program project manager': ['program project management'],
 'research': ['research development', 'product research', 'research install'],
 'business development': ['business development'],
 'project head': ['project manager',
  'project director',
  'project designer',
  'project lead'],
 'tech': ['tech',
  'information technology',
  'technical',
  'it tech',
  'tech service',
  'energy'],
 'community social services': ['community social services'],
 'director': ['director it',
  'it director',
  'director finance',
  'creative director',
  'director lodging'],
 'estimator': ['estimator'],
 'healthcare services': ['healthcare services', 'healthcare professionals'],
 'installer': ['installer',
  'installation purchaser',
 

In [34]:
# manage란 단어를 포함한 단어들이 방법에 따라 어떻게 분류되었는지
manage = pd.DataFrame([manage, manage_trans_1, manage_trans_2, manage_trans_3, manage_trans_4, manage_trans_5, manage_trans_6]).T
manage.columns = ["original","방법1","방법2","방법3","방법4","방법5","방법6"]
manage

Unnamed: 0,original,방법1,방법2,방법3,방법4,방법5,방법6
0,a/v project manager,v project manager,v project manager,project manager,v project manager,v project manager,project manager
1,account executive/manager,account executive manager,account executive manager,account executive manager,account executive manager,account executive manager,account management
2,account management,account management,account management,management,account executive manager,account executive manager,accounting
3,asset management,asset management,asset management,asset management,account executive manager,account executive manager,asset management
4,by project manager,v project manager,v project manager,manager,v project manager,v project manager,project head
5,comanager,comanager,comanager,comanager,comanager,comanager,comanager
6,construction manager,construction manager,construction manager,construction manager,v project manager,architect consultant,facility manager
7,designer/ project manager,designer,designer,project designer,architect,art design,project manager
8,digital project manager,v project manager,v project manager,digital project manager,architect,v project manager,project manager
9,facility manager,facility administrator,facility administrator,facility manager,facilitator,facilitator,studio manager


In [None]:
# 방법 3. https://zephyrus1111.tistory.com/191 -> SequenceMatcher 단어 유사도 => 비슷한 것 같음
# 방법 4. 텍스트 군집화 -> K-means
# https://romg2.github.io/mlguide/03_%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D-%EC%99%84%EB%B2%BD%EA%B0%80%EC%9D%B4%EB%93%9C-08.-%ED%85%8D%EC%8A%A4%ED%8A%B8%EB%B6%84%EC%84%9D-%EB%AC%B8%EC%84%9C-%EA%B5%B0%EC%A7%91%ED%99%94/
# 위 방법이 더 좋지 않을까?

In [37]:
print(f"original 카테고리 수 : {original}")
print(f"번역 후 카테고리 수 : {original_trans}")
print(f"방법 1-1 카테고리 수 : {len_1}")
print(f"방법 1-2 카테고리 수 : {len_2}")
print(f"방법 1-3 카테고리 수 : {len_3}")
print(f"방법 2-1 카테고리 수 : {len_4}")
print(f"방법 2-2 카테고리 수 : {len_5}")
print(f"방법 2-3 카테고리 수 : {len_6}")

original 카테고리 수 : 561
번역 후 카테고리 수 : 527
방법 1-1 카테고리 수 : 294
방법 1-2 카테고리 수 : 313
방법 1-3 카테고리 수 : 323
방법 2-1 카테고리 수 : 257
방법 2-2 카테고리 수 : 251
방법 2-3 카테고리 수 : 317


# Customer_Position

In [39]:
original = len(set(train.customer_position))

In [40]:
# 번역
position_lst = set(train.customer_position)
position_lst_trans = [translate_func(i) if not pd.isna(i) else i for i in position_lst] 
position_dict = pd.DataFrame([position_lst, position_lst_trans]).T
position_dict.columns = ["position_lst", "position_lst_trans"]
position_dict = position_dict.set_index("position_lst").T.to_dict()
train.customer_position = [position_dict[i]["position_lst_trans"] if not pd.isna(i) else i for i in train.customer_position]

position_lst = set(submission.customer_position)
position_lst_trans = [translate_func(i) if not pd.isna(i) else i for i in position_lst] 
position_dict = pd.DataFrame([position_lst, position_lst_trans]).T
position_dict.columns = ["position_lst", "position_lst_trans"]
position_dict = position_dict.set_index("position_lst").T.to_dict()
submission.customer_position = [position_dict[i]["position_lst_trans"] if not pd.isna(i) else i for i in submission.customer_position]

In [41]:
original_trans = len(set(train.customer_position))

In [42]:
# 방법 1. TfidfVectorizer와 코사인 유사도 이용
# 글자만 고려
customer_set = sorted(list(set(train.customer_position.dropna())))
le_train = len(customer_set)

# 특수문자 제거 및 토큰화
customer_re = [re.sub(r"[.,~!@#$%^&*\(\)_+=\-~|\\\"\';:\?/\<\>\[\]\{\}]", " ", customer) for customer in customer_set]
customer_re = [customer.strip().lower() for customer in customer_re]
customer_token = [WordPunctTokenizer().tokenize(i) for i in customer_re]

# 불용어 사전
stop = stopwords.words('english')

# 불용어 제거
customer_stop = []
for i in customer_token:
    for j in i:
        sent = i
        if j in stop: # 불용어 사전에 있으면 제거
            sent.remove(j)
    customer_stop.append(sent)
customer_stop = [" ".join(i) for i in customer_stop] # 다시 결합

# 문장 벡터화
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(customer_stop)

In [43]:
# 방법 1-1. 유사하다고 생각되는 모든 값을 그룹화
# 장점 : 카테고리수를 많이 줄일 수 있음, 단점 : 오차가 큼
# 알고리즘
# n*n으로 구성된 유사도 matrix를 순차적으로 돌아가며 확인한다.
# 유사도 기준(standard)를 넘으면 cos_similar에 key를 i번째 값, value를 j 번째 값으로 넣고, customer_stop의 j번째 값을 i번째 값으로 변경


# job들을 각각 하나씩 비교해 코사인 유사도가 standard를 넘는 것을 추출 및 그룹화 되는 첫번째 값으로 전부 변경
standard = 0.6
cos = cosine_similarity(tfidf_matrix, tfidf_matrix) > standard
cos_similar = dict()
c = customer_stop.copy() # 테스트를 위한 copy
for i in range(len(cos)):
    for j in range(i+1, len(cos[i])):
        if c[i] == "" or c[j] == "":
            pass
        elif cos[i][j] and c[i] in cos_similar.keys():
            cos_similar[c[j]] = cos_similar[c[i]]
            c[j] = c[i]
        elif cos[i][j]:
            cos_similar[c[j]] = c[i]
            c[j] = c[i]


# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_position_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_position] # country 기준 conversion으로 변경
len_1 = len(set(customer_position_t)); len_1

94

In [44]:
# 어떤 값이 변화되었는지 확인하기 위한 코드
# key 값이 변화된 값, value가 변화되기 전의 값으로 구성
def test_func(cos_similar):
    test = dict()
    for i in cos_similar.keys():
        if cos_similar[i] not in test.keys():
            test[cos_similar[i]] = [i]
        else:
            test[cos_similar[i]] += [i]
    return test
test_func(cos_similar)

{'vice president': ['president', 'vice president'],
 'architect consultant': ['consultant'],
 'you prof': ['prof'],
 'assistant professor': ['assistant professor english', 'professor'],
 'associate professor': ['associate professor electronics engg',
  'assistant professor',
  'professor mathematics'],
 'business development': ['business development sales'],
 'business partner': ['partner'],
 'management': ['product management'],
 'ceo founder': ['founder'],
 'co founder': ['ceo founder'],
 'commercial consultant': ['architect consultant'],
 'commercial end user': ['end user'],
 'decision influencer': ['decision influencer'],
 'decision maker': ['decision maker'],
 'director': ['principal director'],
 'education': ['education professional'],
 'general manager': ['manager'],
 'math physics teacher': ['physics teacher'],
 'physics mathematics teacher': ['math physics teacher', 'teacher'],
 'science teacher': ['physics mathematics teacher']}

In [45]:
# 방법 1-2. 유사하다고 생각되는 모든 값을 그룹화

standard = 0.6
cos = cosine_similarity(tfidf_matrix, tfidf_matrix)

cos = np.array(cos)
np.fill_diagonal(cos, np.nan)

cos_similar = dict()
cos = pd.DataFrame(cos)
c = customer_stop.copy()
while cos.max().max() > standard:
    i = cos.max(axis = 1).idxmax()
    j = cos.iloc[i,:].idxmax()
    
    if customer_stop[i] == "" or customer_stop[j] == "":
        pass
    elif c[i] in cos_similar.keys():
        cos_similar[c[j]] = cos_similar[c[i]]
    else:
        cos_similar[c[j]] = c[i]
    
    cos.iloc[i, j] = 0
    cos.iloc[j, i] = 0

c = [cos_similar[i] if i in cos_similar.keys() else i for i in c]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_position_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_position] # country 기준 conversion으로 변경
len_2 = len(set(customer_position_t)); len_2

94

In [46]:
test_func(cos_similar)

{'vice president': ['vice president', 'president'],
 'decision influencer': ['decision influencer'],
 'decision maker': ['decision maker'],
 'commercial end user': ['end user'],
 'business development': ['business development sales'],
 'assistant professor': ['assistant professor english',
  'professor',
  'professor mathematics'],
 'business partner': ['partner'],
 'math physics teacher': ['physics teacher'],
 'education': ['education professional'],
 'architect consultant': ['consultant'],
 'you prof': ['prof'],
 'management': ['product management'],
 'general manager': ['manager'],
 'science teacher': ['teacher'],
 'director': ['principal director'],
 'co founder': ['founder'],
 'associate professor': ['associate professor electronics engg']}

In [47]:
# 방법 1-3. 길이가 적은 값으로 그룹화
# 장점 : 핵심 단어로 분류가 가능
# 단점 : 더 깐깐하게 그룹화
# 알고리즘
# 가장 유사도가 높은 두 단어를 추출
# 두 단어 중 길이가 짧은 것을 value, 길이가 긴 것을 key로 설정해 길이가 긴 것으로 짧은 것으로 변환할 수 있도록 함
# 길이가 짧은 단어의 유사도를 두고, 길이가 긴 단어의 유사도는 삭제 -> 그래야 길이가 긴 단어와 유사도가 높은 단어가 포함되지 않음
standard = 0.6
cos = cosine_similarity(tfidf_matrix, tfidf_matrix)

cos = np.array(cos)
np.fill_diagonal(cos, np.nan)

cos_similar = dict()
cos = pd.DataFrame(cos)
c = customer_stop.copy()
while cos.max().max() > standard:
    i = cos.max(axis = 1).idxmax()
    j = cos.iloc[i,:].idxmax()
    
    cos.iloc[i, j] = 0
    cos.iloc[j, i] = 0

    idx, del_idx = [i, j] if len(c[i]) <= len(c[j]) else [j, i]
    
    if c[idx] == "":
        idx, del_idx = del_idx, idx
    elif c[idx] in cos_similar.keys():
        cos_similar[c[del_idx]] = cos_similar[c[idx]]
    else:
        cos_similar[c[del_idx]] = c[idx]
    
    cos = cos.drop(del_idx).drop(del_idx, axis = 1).reset_index(drop = True)
    cos.columns = list(range(len(cos)))
    del c[del_idx]
    
c = [cos_similar[i] if i in cos_similar.keys() else i for i in customer_stop]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_position_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_position] # country 기준 conversion으로 변경
len_3 = len(set(customer_position_t)); len_3


92

In [48]:
test_func(cos_similar)

{'president': ['vice president'],
 'decision influencer': ['decision influencer'],
 'decision maker': ['decision maker'],
 'end user': ['commercial end user'],
 'business development': ['business development sales'],
 'assistant professor': ['assistant professor english'],
 'partner': ['business partner'],
 'physics teacher': ['physics mathematics teacher', 'math physics teacher'],
 'education': ['education professional'],
 'consultant': ['commercial consultant', 'architect consultant'],
 'prof': ['you prof'],
 'management': ['product management'],
 'manager': ['general manager'],
 'teacher': ['physics teacher', 'science teacher'],
 'director': ['principal director'],
 'professor': ['associate professor',
  'assistant professor',
  'professor mathematics'],
 'founder': ['ceo founder', 'co founder']}

In [49]:
# 방법 2. 사전학습모델 SentenceTransformer
# 단어의 숨겨진 의미까지 파악해 위 방법보다 유연
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
customer_set = sorted(list(set(train.customer_position.dropna())))
le_train = len(customer_set)

# 특수문자 제거 및 토큰화
customer_re = [re.sub(r"[.,~!@#$%^&*\(\)_+=\-~|\\\"\';:\?/\<\>\[\]\{\}]", " ", customer) for customer in customer_set]
customer_re = [customer.strip().lower() for customer in customer_re]
customer_token = [WordPunctTokenizer().tokenize(i) for i in customer_re]

# 불용어 사전
stop = stopwords.words('english')

# 불용어 제거
customer_stop = []
for i in customer_token:
    for j in i:
        sent = i
        if j in stop: # 불용어 사전에 있으면 제거
            sent.remove(j)
    customer_stop.append(sent)
customer_stop = [" ".join(i) for i in customer_stop] # 다시 결합

embeddings = model.encode(customer_stop, convert_to_tensor=True)

In [53]:
# 방법 2-1. 방법 1-1과 동일
standard = 0.6

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
cosine_scores = cosine_scores>standard
cos_similar = dict()
c = customer_stop.copy() # 테스트를 위한 copy
for i in range(len(cos)):
    for j in range(i+1, len(cosine_scores[i])):
        if c[i] == "" or c[j] == "":
            pass
        elif cosine_scores[i][j] and c[i] in cos_similar.keys():
            cos_similar[c[j]] = cos_similar[c[i]]
            c[j] = c[i]
        elif cosine_scores[i][j]:
            cos_similar[c[j]] = c[i]
            c[j] = c[i]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_position_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_position] # country 기준 conversion으로 변경
len_4 = len(set(customer_position_t)); len_4


78

In [54]:
test_func(cos_similar)

{'vice president': ['president', 'vice president'],
 'academic specialist': ['medical imaging specialist', 'surgery professional'],
 'architect consultant': ['architecture consult'],
 'assistant professor': ['assistant professor english',
  'associate professor',
  'assistant professor'],
 'business development': ['business development sales'],
 'management': ['manager', 'product management'],
 'ceo founder': ['co founder', 'founder'],
 'chemistry teacher': ['math physics teacher',
  'physics mathematics teacher',
  'physics teacher',
  'science teacher',
  'teacher'],
 'chief executive officer': ['leadership executive office owner'],
 'commercial end user': ['end user'],
 'consultant': ['consulting'],
 'decision influencer': ['decision maker', 'decision influencer'],
 'education': ['education professional',
  'educator',
  'education',
  'professional trainer',
  'maths lecturer',
  'guest faculty',
  'professor mathematics',
  'senior lecturer',
  'professor'],
 'prof': ['chemistry t

In [51]:
# 방법 2-2. 방법 1-2와 동일

standard = 0.6

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
cosine_scores = np.array(cosine_scores)
np.fill_diagonal(cosine_scores, np.nan)

cos_similar = dict()
cosine_scores = pd.DataFrame(cosine_scores)
c = customer_stop.copy()
while cosine_scores.max().max() > standard:
    i = cosine_scores.max(axis = 1).idxmax()
    j = cosine_scores.iloc[i,:].idxmax()
    
    
    if customer_stop[i] == "" or customer_stop[j] == "":
        pass
    elif customer_stop[i] in cos_similar.keys():
        cos_similar[c[j]] = cos_similar[c[i]]
    else:
        cos_similar[c[j]] = c[i]
    
    cosine_scores.iloc[i, j] = 0
    cosine_scores.iloc[j, i] = 0
    
c = [cos_similar[i] if i in cos_similar.keys() else i for i in customer_stop]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_position_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_position] # country 기준 conversion으로 변경
len_5 = len(set(customer_position_t)); len_5


79

In [52]:
test_func(cos_similar)

{'decision influencer': ['decision maker', 'decision influencer'],
 'vice president': ['vice president', 'president'],
 'math physics teacher': ['physics teacher',
  'professor',
  'maths lecturer',
  'professor mathematics',
  'physics faculty',
  'senior lecturer'],
 'entry level': ['entrylevel'],
 'chemistry teacher': ['physics mathematics teacher', 'math physics teacher'],
 'owner': ['homeowner'],
 'assistant professor': ['associate professor', 'assistant professor english'],
 'business development': ['business development sales'],
 'exhibition': ['exhibitiontv'],
 'prof': ['science teacher'],
 'general manager': ['manager'],
 'consultant': ['consulting'],
 'commercial end user': ['end user'],
 'architect consultant': ['architecture consult'],
 'education': ['education professional',
  'professional trainer',
  'educator',
  'teacher',
  'teacher middle school coordinator'],
 'management': ['product management'],
 'ceo founder': ['co founder', 'founder'],
 'employee': ['trainee'],


In [55]:
# 방법 2-3. 방법 1-3과 동일
standard = 0.6

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
cosine_scores = np.array(cosine_scores)
np.fill_diagonal(cosine_scores, np.nan)

cos_similar = dict()
cosine_scores = pd.DataFrame(cosine_scores)
c = customer_stop.copy()
while cosine_scores.max().max() > standard:
    i = cosine_scores.max(axis = 1).idxmax()
    j = cosine_scores.iloc[i,:].idxmax()
    
    cosine_scores.iloc[i, j] = 0
    cosine_scores.iloc[j, i] = 0

    idx, del_idx = [i, j] if len(c[i]) <= len(c[j]) else [j, i]
    
    if c[idx] == "":
        idx, del_idx = del_idx, idx
    elif c[idx] in cos_similar.keys():
        cos_similar[c[del_idx]] = cos_similar[c[idx]]
    else:
        cos_similar[c[del_idx]] = c[idx]
    
    cosine_scores = cosine_scores.drop(del_idx).drop(del_idx, axis = 1).reset_index(drop = True)
    cosine_scores.columns = list(range(len(cosine_scores)))
    del c[del_idx]
    
c = [cos_similar[i] if i in cos_similar.keys() else i for i in customer_stop]

# train의 customer_job 변경
customer_df_train = pd.DataFrame([customer_set, c]).T  # DataFrame 생성
customer_df_train.columns = ["customer_set", "c"]            # colunm명 생성
customer_dict_train = customer_df_train.set_index("customer_set").T.to_dict()   # dict형으로 변환
customer_position_t = [customer_dict_train[i]["c"] if not pd.isna(i) and customer_dict_train[i]["c"] != "" else i for i in train.customer_position] # country 기준 conversion으로 변경
len_6 = len(set(customer_position_t)); len_6


83

In [56]:
test_func(cos_similar)

{'decision maker': ['decision maker', 'decision influencer'],
 'president': ['vice president'],
 'physics teacher': ['physics mathematics teacher',
  'math physics teacher',
  'science teacher'],
 'entrylevel': ['entry level'],
 'owner': ['homeowner'],
 'assistant professor': ['associate professor', 'assistant professor english'],
 'business development': ['business development sales'],
 'exhibition': ['exhibitiontv'],
 'manager': ['management', 'general manager'],
 'physics faculty': ['physics teacher'],
 'prof': ['professor'],
 'consultant': ['consulting'],
 'end user': ['commercial end user'],
 'guest faculty': ['physics faculty'],
 'architect consultant': ['architecture consult'],
 'educator': ['education professional', 'education'],
 'co founder': ['ceo founder'],
 'maths lecturer': ['senior lecturer', 'professor mathematics'],
 'trainee': ['employee'],
 'teacher': ['chemistry teacher',
  'educator',
  'teacher middle school coordinator'],
 'founder': ['co founder'],
 'surgery pro

In [59]:
print(f"original 카테고리 수 : {original}")
print(f"번역 후 카테고리 수 : {original_trans}")
print(f"방법 1-1 카테고리 수 : {len_1}")
print(f"방법 1-2 카테고리 수 : {len_2}")
print(f"방법 1-3 카테고리 수 : {len_3}")
print(f"방법 2-1 카테고리 수 : {len_4}")
print(f"방법 2-2 카테고리 수 : {len_5}")
print(f"방법 2-3 카테고리 수 : {len_6}")

original 카테고리 수 : 117
번역 후 카테고리 수 : 114
방법 1-1 카테고리 수 : 94
방법 1-2 카테고리 수 : 94
방법 1-3 카테고리 수 : 92
방법 2-1 카테고리 수 : 78
방법 2-2 카테고리 수 : 79
방법 2-3 카테고리 수 : 83
