***필요한 함수들***

In [1]:
# -- 전처리 함수 -- 
import re
import pandas as pd
import string
from emoji import core

def remove_emojis(text):
    """Return ``text`` with every emoji removed.

    Delegates to ``emoji.core.replace_emoji`` with an empty replacement string.
    """
    emoji_free = core.replace_emoji(text, replace='')
    return emoji_free

# Built once: maps every ASCII punctuation character to a single space.
# The function below runs once per product name, so rebuilding this table on
# each call was pure overhead.
_PUNCT_TO_SPACE = str.maketrans({punct: ' ' for punct in string.punctuation})


def remove_punct(text: str) -> str:
    """Lowercase ``text`` and replace each ASCII punctuation mark with a space.

    Only the characters in ``string.punctuation`` are affected; every other
    character (including Hangul) is kept. Whitespace is not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with punctuation turned into spaces.
    """
    return text.lower().translate(_PUNCT_TO_SPACE)

def remove_english_and_numbers(text: str) -> str:
    """Strip English letters, digits and lone Hangul consonants from ``text``.

    Removes ASCII letters of either case, the digits 0-9 and characters in the
    ㄱ-ㅎ range (e.g. "ㅋㅋ"), then trims the ends and collapses every run of
    two or more whitespace characters into one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # A-Z is included so the function also works on text that has not been
    # lowercased beforehand. Raw strings avoid the invalid "\s" escape.
    text = re.sub(r'[a-zA-Zㄱ-ㅎ0-9]', '', text).strip()
    return re.sub(r'\s{2,}', ' ', text)

def remove_numbers(text: str) -> str:
    """Strip digits and lone Hangul consonants from ``text``.

    Removes the digits 0-9 and characters in the ㄱ-ㅎ range (e.g. "ㅋㅋ"),
    then trims the ends and collapses every run of two or more whitespace
    characters into one space. English letters are kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings: "\s" in a normal string literal is an invalid escape.
    text = re.sub(r'[ㄱ-ㅎ0-9]', '', text).strip()
    return re.sub(r'\s{2,}', ' ', text)

def preprocess_product_name(product_name: str, keep_english: bool = True) -> str:
    """Clean one product name for tokenizing and training.

    Punctuation is replaced and the text lowercased first; digits and lone
    Hangul consonants are then removed.

    Args:
        product_name: Raw product name.
        keep_english: When True (the default, matching the previous
            behaviour) English letters are kept; when False they are removed
            as well.

    Returns:
        The cleaned product name.
    """
    product_name = remove_punct(product_name)
    if keep_english:
        # -- version that keeps English --
        return remove_numbers(product_name)
    # -- version that drops English --
    return remove_english_and_numbers(product_name)

# -- Load every crawled CSV and combine them into one DataFrame --
def concat_bungae_files(num_files: int = 9):
    """Read ``./bungae_df_{idx}_fashion.csv`` for idx in 0..num_files-1 and stack them.

    Args:
        num_files: How many numbered CSV files to read. Defaults to 9, the
            count that used to be hard-coded.

    Returns:
        A DataFrame with the columns ``product_id``, ``product_name`` and
        ``cat_id`` (``cat_id`` as a digit string) and a fresh 0..n-1 index.
    """
    columns = ['product_id', 'product_name', 'cat_id']
    frames = []
    for idx in range(num_files):
        df = pd.read_csv(f'./bungae_df_{idx}_fashion.csv')
        # Rows with a missing value in ANY column of the file are dropped,
        # not only those missing one of the three kept columns.
        df = df.dropna(axis=0)
        # Going through int drops a float ".0" suffix before stringifying.
        df['cat_id'] = df['cat_id'].astype(int).astype(str)
        # -- keep only the needed columns --
        frames.append(df[columns])

    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat at the end instead of re-copying the growing frame each loop.
    base_df = pd.concat(frames, ignore_index=True)

    # -- optionally save base_df to Drive --
    # base_df.to_csv('/content/drive/MyDrive/bungae_base_df_fashion.csv', index=False)

    return base_df

def pull_and_preprocess_base_df():
    """Load the combined product table and clean its ``product_name`` column.

    Each name goes through ``preprocess_product_name`` and then
    ``remove_emojis``.

    Returns:
        The DataFrame from ``concat_bungae_files`` with cleaned names.
    """
    frame = concat_bungae_files()
    cleaned_names = frame['product_name'].apply(preprocess_product_name)
    frame['product_name'] = cleaned_names.apply(remove_emojis)
    return frame

def bring_fashion_cat_csv():
    """Load the table of Bungaejangter category ids and category names.

    Reads ``./final_category.csv``, discards its ``Unnamed: 0`` column and
    converts ``cat_id`` to strings.

    Returns:
        The category DataFrame.
    """
    raw = pd.read_csv('./final_category.csv')
    categories = raw.drop(columns='Unnamed: 0')
    categories['cat_id'] = categories['cat_id'].astype(str)
    return categories

def get_cat_name(cat_id:str) -> str:
    """Return the cleaned name of a category id.

    A 6-character id (mid category) is looked up in the ``cat2`` column and a
    9-character id (sub category) in the ``cat3`` column of the table returned
    by ``bring_fashion_cat_csv``. The name is passed through ``remove_punct``.

    Args:
        cat_id: Category id as a string of 6 or 9 characters.

    Returns:
        The lowercased category name with punctuation replaced by spaces.

    Raises:
        ValueError: if ``cat_id`` is neither 6 nor 9 characters long
            (previously this silently returned ``None``).
        IndexError: if ``cat_id`` does not occur in the category table.
    """
    name_column_by_length = {6: 'cat2', 9: 'cat3'}  # mid category / sub category
    name_column = name_column_by_length.get(len(cat_id))
    if name_column is None:
        raise ValueError(
            f'cat_id must be 6 or 9 characters long, got {cat_id!r}'
        )

    final_cat = bring_fashion_cat_csv()
    names = final_cat.loc[final_cat['cat_id'] == cat_id, name_column]
    if names.empty:
        # Same exception type as before, now with a readable message.
        raise IndexError(f'cat_id {cat_id!r} not found in the category table')
    return remove_punct(names.iloc[0])

# -- Keep only the mid categories that have sub categories --
def make_category_dict_with_subs() -> dict:
    """Map each mid category id to its list of sub category ids.

    Reads ``./bgzt_fashion_category_nums.json``, a two-level mapping of
    main -> mid -> list of subs. Mid categories whose list is exactly
    ``[None]`` (no sub categories) are left out.

    Returns:
        A plain dict ``{mid_id: [sub_id, ...]}``.
    """
    # Local import so the helper does not depend on a later cell having run.
    import json

    with open('./bgzt_fashion_category_nums.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    category_dict_with_subs = {}
    for mids in data.values():
        for mid, subs in mids.items():
            if subs != [None]:  # only when sub categories exist
                category_dict_with_subs[mid] = subs
    return category_dict_with_subs


# -- Collect the morpheme keywords of the sub categories inside each mid category --
def make_subcat_morphs_by_midcat(cat_dict:dict) -> dict:
    """Collect, per mid category, the morphemes of its sub category names.

    For every sub category the morphemes of its name that do not also occur
    in the mid category name are collected. If none remain, all morphemes of
    the sub category name are used instead.

    Args:
        cat_dict: ``{mid_id: [sub_id, ...]}`` as produced by
            ``make_category_dict_with_subs``.

    Returns:
        ``{mid_id: [morpheme, ...]}``; mid categories with an empty sub list
        get no entry.
    """
    mecab = Mecab()
    d = defaultdict(list)

    for mid, subs in cat_dict.items():
        if not subs:
            continue
        # The mid name does not change per sub, so look it up once.
        mid_morph = set(mecab.morphs(get_cat_name(mid)))
        for sub in subs:
            # get_cat_name already strips punctuation; no second pass needed.
            sub_morph = mecab.morphs(get_cat_name(sub))
            # Order-preserving de-duplication keeps the result deterministic.
            unique_sub_morph = [
                morph for morph in dict.fromkeys(sub_morph)
                if morph not in mid_morph
            ]
            d[mid] += unique_sub_morph if unique_sub_morph else sub_morph

    return dict(d)

# -- Chain make_category_dict_with_subs and make_subcat_morphs_by_midcat --
def get_dict_from_midcat_with_subs():
    """Return the sub category morpheme keywords keyed by mid category id.

    Feeds the result of ``make_category_dict_with_subs`` straight into
    ``make_subcat_morphs_by_midcat``.
    """
    return make_subcat_morphs_by_midcat(make_category_dict_with_subs())



In [2]:
from konlpy.tag import Mecab
from gensim.models.fasttext import FastText
import json
from collections import defaultdict
import gc


def get_df_by_fasttext_test_version3(fashion_df, cat_id:str, max_rows:int=400, random_state=None):
    """Select the rows of one category whose names mention category-related words.

    Steps:
      1. Take the rows of ``fashion_df`` whose ``cat_id`` equals ``cat_id``.
      2. Train a FastText model on the noun tokens of their product names.
      3. Query the 30 words most similar to the category-name morphemes.
      4. For 9-character ids, drop rows that mention a keyword of a sibling
         sub category.
      5. Keep the rows whose name contains at least one of the similar words.

    Args:
        fashion_df: DataFrame with ``cat_id`` and ``product_name`` columns.
        cat_id: Category id (6 or 9 characters).
        max_rows: Upper bound on returned rows; larger results are randomly
            sampled down. Defaults to 400.
        random_state: Passed to ``DataFrame.sample`` for reproducible
            sampling. Defaults to None (non-deterministic, as before).

    Returns:
        The filtered (and possibly sampled) DataFrame.
    """
    # -- rows of the requested category --
    df = fashion_df[fashion_df['cat_id'] == cat_id].reset_index(drop=True)

    # -- tokenizing: keep only tokens whose POS tag starts with 'N' (nouns) --
    mecab = Mecab()
    tokens = []
    for product_name in df['product_name']:
        token = [word for word, tag in mecab.pos(product_name) if tag[0] == 'N']
        if len(token) > 1:
            tokens.append(token)

    # -- model training --
    # The corpus is deliberately NOT passed to the constructor: doing so
    # already builds the vocabulary and trains once, and that work was then
    # thrown away by the explicit build_vocab/train calls.
    model1 = FastText(vector_size=100, window=10, min_count=5, workers=4, sg=0)
    model1.build_vocab(tokens)
    model1.train(tokens, total_examples=model1.corpus_count, epochs=10)

    # -- query words: morphemes of the category name --
    cat_name = get_cat_name(cat_id)
    print(cat_name)
    cat_name_morphs = list(dict.fromkeys(mecab.morphs(cat_name)))
    if len(cat_id) == 9:
        # For a sub category, prefer the morphemes that distinguish it from
        # its mid category. For a 6-character id the "mid" name is the name
        # itself, so subtracting it always left nothing and the query failed.
        mid_name_morphs = set(mecab.morphs(get_cat_name(cat_id[:6])))
        distinct_morphs = [m for m in cat_name_morphs if m not in mid_name_morphs]
        if distinct_morphs:
            cat_name_morphs = distinct_morphs

    # -- get most similar words --
    results = model1.wv.most_similar(positive=cat_name_morphs, topn=30)
    # Escape so a token can never be interpreted as regex syntax.
    keywords = '|'.join(re.escape(word) for word, _ in results)

    # -- opposite rows --
    # Only for 9-character ids: delete rows that mention a keyword of another
    # sub category of the same mid category.
    if len(cat_id) == 9:
        keyword_list = get_dict_from_midcat_with_subs()[cat_id[:6]]
        opposites = sorted(set(keyword_list) - set(cat_name_morphs))
        # An empty pattern matches every string, which would delete all rows.
        if opposites:
            opposite_keywords = '|'.join(re.escape(word) for word in opposites)
            df = df[~df['product_name'].str.contains(opposite_keywords)].copy()

    # -- get df --
    df_filtered = df[df['product_name'].str.contains(keywords)]
    df_filtered = df_filtered.reset_index(drop=True)

    if len(df_filtered) > max_rows:
        df_filtered = df_filtered.sample(max_rows, random_state=random_state)

    # Release the model before the next category is processed.
    del model1
    gc.collect()

    return df_filtered

def reduce_data():
    """Build the reduced training table, one filtered chunk per category.

    Loads and cleans the base table, runs
    ``get_df_by_fasttext_test_version3`` for every distinct ``cat_id`` and
    stacks the results. A category whose processing fails is skipped.

    Returns:
        The concatenated DataFrame with a fresh index, or an empty DataFrame
        when no category succeeded.
    """
    fashion_df = pull_and_preprocess_base_df()

    frames = []
    for cat_id in fashion_df['cat_id'].unique():
        try:
            frames.append(get_df_by_fasttext_test_version3(fashion_df, cat_id))
        except Exception as exc:
            # Still best effort, but report what was skipped and let
            # KeyboardInterrupt / SystemExit propagate.
            print(f'skipping cat_id {cat_id}: {exc!r}')

    if not frames:
        return pd.DataFrame()
    # Single concat instead of re-copying the growing frame every iteration.
    return pd.concat(frames, ignore_index=True)

def get_and_preprocess_final_df():
    """Return the result of ``reduce_data`` with its index reset to 0..n-1."""
    return reduce_data().reset_index(drop=True)

In [None]:
# -- Version that also keeps English text --
# -- Not everyone writes product names in Korean only, --
# -- so English is expected to carry extra product information (e.g. size) --
final_df = get_and_preprocess_final_df()
final_df.head()

In [None]:
# -- Version that also contains Korean --
final_df2 = reduce_data()
final_df2.head()

***소분류, 중분류로만 학습시켰을 때***

In [6]:
# -- 데이터 셋 스플릿 -- 
from sklearn.model_selection import train_test_split

def split_train_test_data(df, stratify=False):
    """Split ``df`` 80/20 into shuffled train and test DataFrames.

    Args:
        df: DataFrame containing a ``cat_id`` column.
        stratify: If True, keep the ``cat_id`` class proportions equal in both
            splits. Defaults to False, a plain shuffled split.

    Returns:
        ``(train_df, test_df)``, each with a fresh 0..n-1 index.

    Raises:
        KeyError: if ``df`` has no ``cat_id`` column.
    """
    labels = df['cat_id']
    # -- train, test split --
    # Split the frame itself by position. Splitting a column and re-selecting
    # rows with ``df.loc[index]`` duplicates rows when the index is not unique.
    train_df, test_df = train_test_split(df,
                                         random_state=5,
                                         test_size=.2,
                                         stratify=labels if stratify else None)

    # -- resetting index --
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, test_df



In [None]:
# NOTE(review): train_df / test_df are not created in the visible cells before
# this one; presumably they come from split_train_test_data(final_df). Confirm.

# -- write the training split as a fastText input file --
train_df['label'] = '__label__' + train_df['cat_id']
train_df = train_df.drop(['cat_id', 'product_id'], axis=1)
# header=False: otherwise the column-name row becomes the first line of the
# fastText input and is read as text.
train_df.to_csv('train_fasttext.txt', sep='\t', index=False, header=False)

# -- write the test split as a fastText input file --
test_df['label'] = '__label__' + test_df['cat_id']
test_df = test_df.drop(['cat_id', 'product_id'], axis=1)
test_df.to_csv('test_fasttext.txt', sep='\t', index=False, header=False)

In [None]:
import fasttext

# The loss is left at fastText's default; per the author's note this scored
# better than loss='hs'.
model = fasttext.train_supervised(
    input='./train_fasttext.txt',
    wordNgrams=3,
    epoch=25,
    lr=0.4,
)

model.test('./test_fasttext.txt')

In [None]:
# Spot-check the trained model on a single product name.
model.predict('안토니모라토 티셔츠')

# 대분류

***대분류로 먼저 학습시켜보기***

In [None]:
# -- 데이터 셋 스플릿 -- 
from sklearn.model_selection import train_test_split

def split_train_test_data(df, stratify=False):
    """Split ``df`` 80/20 into shuffled train and test DataFrames.

    Args:
        df: DataFrame containing a ``main_cat_id`` column.
        stratify: If True, keep the ``main_cat_id`` class proportions equal in
            both splits. Defaults to False, a plain shuffled split.

    Returns:
        ``(train_df, test_df)``, each with a fresh 0..n-1 index.

    Raises:
        KeyError: if ``df`` has no ``main_cat_id`` column.
    """
    labels = df['main_cat_id']
    # -- train, test split --
    # Split the frame itself by position. Splitting a column and re-selecting
    # rows with ``df.loc[index]`` duplicates rows when the index is not unique.
    train_df, test_df = train_test_split(df,
                                         random_state=5,
                                         test_size=.2,
                                         stratify=labels if stratify else None)

    # -- resetting index --
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, test_df


In [None]:
# The first three characters of cat_id identify the main category.
# Vectorised string slice instead of a per-row Python lambda.
final_df['main_cat_id'] = final_df['cat_id'].str[:3]
sample_df = final_df.copy()

In [None]:
train_df, test_df = split_train_test_data(sample_df)

# -- write the training split as a fastText input file --
train_df['label'] = '__label__' + train_df['main_cat_id']
train_df = train_df.drop(['cat_id', 'product_id', 'main_cat_id'], axis=1)
# header=False: otherwise the column-name row becomes the first line of the
# fastText input and is read as text.
train_df.to_csv('train_fasttext_maincat.txt', sep='\t', index=False, header=False)

# -- write the test split as a fastText input file --
test_df['label'] = '__label__' + test_df['main_cat_id']
test_df = test_df.drop(['cat_id', 'product_id', 'main_cat_id'], axis=1)
test_df.to_csv('test_fasttext_maincat.txt', sep='\t', index=False, header=False)

In [None]:
import fasttext

# Author's notes: loss='hs' allows faster training, but results were better
# without it.
model = fasttext.train_supervised(
    input='./train_fasttext_maincat.txt',
    wordNgrams=2,
    epoch=25,
    lr=0.34,
    loss='ova',
)

model.test('./test_fasttext_maincat.txt')


In [None]:
# Display the test split.
test_df

In [None]:
# Spot-check the trained model on a single product name.
model.predict('미쏘플리츠원피스')

***데이터가 더 많은 전체 데이터에 대해서 학습***

**FastText**

In [10]:
# -- 데이터 셋 스플릿 -- 
from sklearn.model_selection import train_test_split

def split_train_test_data(df, stratify=False):
    """Split ``df`` 80/20 into shuffled train and test DataFrames.

    Args:
        df: DataFrame containing a ``main_cat_id`` column.
        stratify: If True, keep the ``main_cat_id`` class proportions equal in
            both splits. Defaults to False, a plain shuffled split.

    Returns:
        ``(train_df, test_df)``, each with a fresh 0..n-1 index.

    Raises:
        KeyError: if ``df`` has no ``main_cat_id`` column.
    """
    labels = df['main_cat_id']
    # -- train, test split --
    # Split the frame itself by position. Splitting a column and re-selecting
    # rows with ``df.loc[index]`` duplicates rows when the index is not unique.
    train_df, test_df = train_test_split(df,
                                         random_state=5,
                                         test_size=.2,
                                         stratify=labels if stratify else None)

    # -- resetting index --
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, test_df



In [3]:
# Load and clean the full product table, then preview the first rows.
base_df = pull_and_preprocess_base_df()
base_df.head()

  df = pd.read_csv(f'./bungae_df_{idx}_fashion.csv')
  df = pd.read_csv(f'./bungae_df_{idx}_fashion.csv')
  df = pd.read_csv(f'./bungae_df_{idx}_fashion.csv')


Unnamed: 0,product_id,product_name,cat_id
0,201926367,여성야상패딩,310090050
1,173779753,여성파라점퍼스xl,310090050
2,213485472,경량 패딩 새상품 가니 점퍼 스타일,310090050
3,219480633,휠라 롱패딩 아디다스 패딩 양털 점퍼 코트 일괄,310090050
4,202566238,무료배송 날씬해보이는 거위솜털 여성패딩점퍼,310090050


In [11]:
# -- Train on the whole base_df itself --
# The first three characters of cat_id identify the main category.
# Vectorised string slice instead of a per-row Python lambda.
base_df['main_cat_id'] = base_df['cat_id'].str[:3]
base_df

Unnamed: 0,product_id,product_name,cat_id,main_cat_id
0,201926367,여성야상패딩,310090050,310
1,173779753,여성파라점퍼스xl,310090050,310
2,213485472,경량 패딩 새상품 가니 점퍼 스타일,310090050,310
3,219480633,휠라 롱패딩 아디다스 패딩 양털 점퍼 코트 일괄,310090050,310
4,202566238,무료배송 날씬해보이는 거위솜털 여성패딩점퍼,310090050,310
...,...,...,...,...
1391171,164097143,전통매듭목걸이 목걸이 전통목걸이 쥬얼리 매듭,400999,400
1391172,174926618,인조모 가발 붙임머리 긴머리,400999,400
1391173,219603119,마리떼 헤어클립 구해요,400999,400
1391174,224250961,새제품 호피 헤어밴드,400999,400


In [12]:
train_df, test_df = split_train_test_data(base_df)

# -- write the training split as a fastText input file --
train_df['label'] = '__label__' + train_df['main_cat_id']
train_df = train_df.drop(['cat_id', 'product_id', 'main_cat_id'], axis=1)
# header=False: otherwise the column-name row becomes the first line of the
# fastText input and is read as text.
train_df.to_csv('train_fasttext_maincat.txt', sep='\t', index=False, header=False)

# -- write the test split as a fastText input file --
test_df['label'] = '__label__' + test_df['main_cat_id']
test_df = test_df.drop(['cat_id', 'product_id', 'main_cat_id'], axis=1)
test_df.to_csv('test_fasttext_maincat.txt', sep='\t', index=False, header=False)

In [13]:
import fasttext

# Author's notes: loss='hs' allows faster training, but results were better
# without it.
model = fasttext.train_supervised(
    input='./train_fasttext_maincat.txt',
    wordNgrams=2,
    epoch=25,
    lr=0.34,
    loss='ova',
)

model.test('./test_fasttext_maincat.txt')


Read 6M words
Number of words:  280507
Number of labels: 8
Progress: 100.0% words/sec/thread:  614657 lr:  0.000000 avg.loss:  0.275470 ETA:   0h 0m 0s


(278234, 0.8800254462071494, 0.8800254462071494)

In [None]:
# Smaller model: quantize with retraining on the training file, then re-evaluate.
model.quantize(input='./train_fasttext_maincat.txt', retrain=True)
model.test('./test_fasttext_maincat.txt')

In [None]:
# Drop the reference to the fastText model before the TF-IDF experiment.
del model

**TF-IDF**

In [25]:
# Working copy for the TF-IDF experiment without the id columns;
# drop() already returns a new frame, so base_df itself is untouched.
tf_base_df = base_df.drop(columns=['cat_id', 'product_id'])

In [26]:
# -- for TF-IDF --
def split_train_test_data(df):
    """Split names and main-category labels 80/20 with a fixed seed.

    Returns:
        ``(train_x, test_x, train_y, test_y)`` where the x parts are slices of
        ``df['product_name']`` and the y parts are arrays taken from
        ``df['main_cat_id']``.
    """
    features = df['product_name']
    targets = df['main_cat_id'].values
    # -- train, test split --
    return tuple(train_test_split(features, targets, random_state=5, test_size=.2))

x_train, x_test, y_train, y_test = split_train_test_data(tf_base_df)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram + bigram TF-IDF features.
# NOTE(review): an integer max_df is an absolute document count, so any term
# occurring in more than 300 product names is discarded. Confirm this is
# intended rather than a proportion such as 0.3.
tfidf_vect = TfidfVectorizer(ngram_range=(1,2), max_df=300, min_df=5)
# -- x_train vectorization (fits the vocabulary on the training names only) --
X_train_cnt_vect = tfidf_vect.fit_transform(x_train)
# -- x_test vectorization (reuses the fitted vocabulary) --
X_test_cnt_vect = tfidf_vect.transform(x_test)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a liblinear logistic regression on the TF-IDF features and report accuracy.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
accuracy = accuracy_score(y_test, pred)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.778


In [37]:
from sklearn.metrics import precision_score, recall_score

# Support-weighted precision and recall of the logistic-regression predictions.
# NOTE(review): the recorded output shows a scikit-learn ``_warn_prf`` warning
# here; presumably at least one class is never predicted. Confirm.
weighted_precision = precision_score(y_test, pred, average='weighted')
print('precision_score : ', weighted_precision)
weighted_recall = recall_score(y_test, pred, average='weighted')
print('recall_score : ', weighted_recall)


  _warn_prf(average, modifier, msg_start, len(result))


precision_score :  0.7993397180117504
recall_score :  0.777900774881755


# 중분류

In [46]:
# -- 데이터 셋 스플릿 -- 
from sklearn.model_selection import train_test_split

def split_train_test_data(df, stratify=False):
    """Split ``df`` 80/20 into shuffled train and test DataFrames.

    Args:
        df: DataFrame containing a ``mid_cat_id`` column.
        stratify: If True, keep the ``mid_cat_id`` class proportions equal in
            both splits. Defaults to False, a plain shuffled split.

    Returns:
        ``(train_df, test_df)``, each with a fresh 0..n-1 index.

    Raises:
        KeyError: if ``df`` has no ``mid_cat_id`` column.
    """
    labels = df['mid_cat_id']
    # -- train, test split --
    # Split the frame itself by position. Splitting a column and re-selecting
    # rows with ``df.loc[index]`` duplicates rows when the index is not unique.
    train_df, test_df = train_test_split(df,
                                         random_state=5,
                                         test_size=.2,
                                         stratify=labels if stratify else None)

    # -- resetting index --
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, test_df



In [39]:
# Work on a copy so base_df is not modified by the columns added afterwards.
sample_df = base_df.copy()

In [43]:
# The first six characters of cat_id identify the mid category.
# Vectorised string slice instead of a per-row Python lambda.
sample_df['mid_cat_id'] = sample_df['cat_id'].str[:6]
sample_df.head()

Unnamed: 0,product_id,product_name,cat_id,main_cat_id,mid_cat_id
0,201926367,여성야상패딩,310090050,310,310090
1,173779753,여성파라점퍼스xl,310090050,310,310090
2,213485472,경량 패딩 새상품 가니 점퍼 스타일,310090050,310,310090
3,219480633,휠라 롱패딩 아디다스 패딩 양털 점퍼 코트 일괄,310090050,310,310090
4,202566238,무료배송 날씬해보이는 거위솜털 여성패딩점퍼,310090050,310,310090


In [45]:
# -- Women's clothing > Padding/Jumper --
# Selects every row of main category '310' (presumably women's clothing;
# confirm against final_category.csv).
women = sample_df[sample_df['main_cat_id'] == '310']
women

Unnamed: 0,product_id,product_name,cat_id,main_cat_id,mid_cat_id
0,201926367,여성야상패딩,310090050,310,310090
1,173779753,여성파라점퍼스xl,310090050,310,310090
2,213485472,경량 패딩 새상품 가니 점퍼 스타일,310090050,310,310090
3,219480633,휠라 롱패딩 아디다스 패딩 양털 점퍼 코트 일괄,310090050,310,310090
4,202566238,무료배송 날씬해보이는 거위솜털 여성패딩점퍼,310090050,310,310090
...,...,...,...,...,...
1300641,199376528,롱 페티코트 종,310220,310,310220
1300642,221066031,코스프레 의상,310220,310,310220
1300643,165984924,옷 처분,310220,310,310220
1300644,224319538,섹시잠옷 프리사이즈 새상품,310220,310,310220


In [50]:
women_train_df, women_test_df = split_train_test_data(women)

# -- write the training split as a fastText input file --
women_train_df['label'] = '__label__' + women_train_df['mid_cat_id']
women_train_df = women_train_df.drop(['cat_id', 'product_id', 'main_cat_id', 'mid_cat_id'], axis=1)
# header=False: otherwise the column-name row becomes the first line of the
# fastText input and is read as text.
women_train_df.to_csv('women_train_fasttext_maincat.txt', sep='\t', index=False, header=False)

# -- write the test split as a fastText input file --
women_test_df['label'] = '__label__' + women_test_df['mid_cat_id']
women_test_df = women_test_df.drop(['cat_id', 'product_id', 'main_cat_id', 'mid_cat_id'], axis=1)
women_test_df.to_csv('women_test_fasttext_maincat.txt', sep='\t', index=False, header=False)

In [51]:
import fasttext

# Author's notes: loss='hs' allows faster training, but results were better
# without it.
model = fasttext.train_supervised(
    input='./women_train_fasttext_maincat.txt',
    wordNgrams=2,
    epoch=25,
    lr=0.34,
    loss='ova',
)

model.test('./women_test_fasttext_maincat.txt')


Read 2M words
Number of words:  132455
Number of labels: 22
Progress: 100.0% words/sec/thread:  365802 lr:  0.000000 avg.loss:  0.448905 ETA:   0h 0m 0s


(115126, 0.784748883831628, 0.784748883831628)

# 소분류

In [61]:
# -- 데이터 셋 스플릿 -- 
from sklearn.model_selection import train_test_split

def split_train_test_data(df, stratify=False):
    """Split ``df`` 80/20 into shuffled train and test DataFrames.

    Args:
        df: DataFrame containing a ``cat_id`` column.
        stratify: If True, keep the ``cat_id`` class proportions equal in both
            splits. Defaults to False, a plain shuffled split.

    Returns:
        ``(train_df, test_df)``, each with a fresh 0..n-1 index.

    Raises:
        KeyError: if ``df`` has no ``cat_id`` column.
    """
    labels = df['cat_id']
    # -- train, test split --
    # Split the frame itself by position. Splitting a column and re-selecting
    # rows with ``df.loc[index]`` duplicates rows when the index is not unique.
    train_df, test_df = train_test_split(df,
                                         random_state=5,
                                         test_size=.2,
                                         stratify=labels if stratify else None)

    # -- resetting index --
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    return train_df, test_df



In [62]:
# Rows of mid category '310090' (presumably women's padding/jumper; confirm
# against final_category.csv).
women_padded = sample_df[sample_df['mid_cat_id'] == '310090']
women_padded

Unnamed: 0,product_id,product_name,cat_id,main_cat_id,mid_cat_id
0,201926367,여성야상패딩,310090050,310,310090
1,173779753,여성파라점퍼스xl,310090050,310,310090
2,213485472,경량 패딩 새상품 가니 점퍼 스타일,310090050,310,310090
3,219480633,휠라 롱패딩 아디다스 패딩 양털 점퍼 코트 일괄,310090050,310,310090
4,202566238,무료배송 날씬해보이는 거위솜털 여성패딩점퍼,310090050,310,310090
...,...,...,...,...,...
1241140,113235462,s 나이키 여성 오리털 패딩 점퍼,310090999,310,310090
1241141,177989474,여성m 나이키 패딩 점퍼,310090999,310,310090
1241142,175582587,무이드 퍼 조끼,310090999,310,310090
1241143,113563556,매긴 오리털패딩,310090999,310,310090


In [63]:
women_padded_train_df, women_padded_test_df = split_train_test_data(women_padded)

# -- write the training split as a fastText input file --
women_padded_train_df['label'] = '__label__' + women_padded_train_df['cat_id']
women_padded_train_df = women_padded_train_df.drop(['cat_id', 'product_id', 'main_cat_id', 'mid_cat_id'], axis=1)
# header=False: otherwise the column-name row becomes the first line of the
# fastText input and is read as text.
women_padded_train_df.to_csv('women_padded_train_fasttext_maincat.txt', sep='\t', index=False, header=False)

# -- write the test split as a fastText input file --
women_padded_test_df['label'] = '__label__' + women_padded_test_df['cat_id']
women_padded_test_df = women_padded_test_df.drop(['cat_id', 'product_id', 'main_cat_id', 'mid_cat_id'], axis=1)
women_padded_test_df.to_csv('women_padded_test_fasttext_maincat.txt', sep='\t', index=False, header=False)

In [66]:
import fasttext

# Author's notes: loss='hs' allows faster training, but results were better
# without it.
model = fasttext.train_supervised(
    input='./women_padded_train_fasttext_maincat.txt',
    wordNgrams=2,
    epoch=25,
    lr=0.34,
    loss='ova',
)

model.test('./women_padded_test_fasttext_maincat.txt')


Read 0M words
Number of words:  17253
Number of labels: 4
Progress: 100.0% words/sec/thread:  688237 lr:  0.000000 avg.loss:  0.625597 ETA:   0h 0m 0s


(7929, 0.6210114768571068, 0.6210114768571068)