# Fasttext

- w2v를 기본으로 하면서 부분단어들을 임베딩하면서 학습하는 기법
- 원래의 단어를 부분단어의 벡터들로 표현함
- w2v처럼 단어들의 동시 등장 정보를 보존
- https://ratsgo.github.io/from%20frequency%20to%20semantics/2017/07/06/fasttext/

In [1]:
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 16.2MB/s eta 0:00:01[K     |█████████▌                      | 20kB 11.7MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 6.8MB/s eta 0:00:01[K     |███████████████████             | 40kB 3.1MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 3.8MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 4.1MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 2.5MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3038376 sha256=d7180480f4aebb077962dca977e3abba7e74b1819e89632f5dafaf5ef091b478
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c15

In [2]:
import fasttext
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold

In [3]:
base_path = '/content/drive/MyDrive/데이콘 - 소셜 작가 분류/open'
train = pd.read_csv(os.path.join(base_path,'train.csv'))
test = pd.read_csv(os.path.join(base_path,'test_x.csv'))
sub = pd.read_csv(os.path.join(base_path,'sample_submission.csv'), index_col = 0)

In [None]:
# line하나가 예측하고자 하는 하나의 글 혹은 문서가 될 수 있도록
file = open(os.path.join(base_path,'fasttexttrain.txt'),'w+')
for i in train.index:
    line = '__label__' + str(train['author'][i])+' '+train['text'][i]
    file.write(line+"\n")

In [4]:
# 지도학습 모델
# text_clf_model = fasttext.train_supervised(os.path.join(base_path,'fasttexttrain.txt'), epoch=70, minCount=2, maxn=10, verbose=2)
text_clf_model = fasttext.train_supervised(os.path.join(base_path,'fasttexttrain.txt'), epoch=70, minCount=2, maxn=15, verbose=2, lr = 0.1, )

In [None]:
# 비지도학습 모델
# text_clf_model = fasttext.train_unsupervised(os.path.join(base_path,'fasttexttrain.txt'),model = 'skipgram', epoch=70, minCount=2, maxn=10, verbose=0)

In [5]:
for i in test.index:
    lable, proba = text_clf_model.predict(test['text'][i], k=5)
    for la, pr in zip(lable, proba):
        if '__label__0' == la:
            sub.loc[i, '0'] = pr
        elif '__label__1' == la:
            sub.loc[i, '1'] = pr
        elif '__label__2' == la:
            sub.loc[i, '2'] = pr
        elif '__label__3' == la:
            sub.loc[i, '3'] = pr
        elif '__label__4' == la:
            sub.loc[i, '4'] = pr
    # submission.loc[i, '0'] = proba[lable.loc('__label__0')]
    # submission.loc[i, '1'] = proba[4]
    # submission.loc[i, '2'] = proba[2]
    # submission.loc[i, '3'] = proba[0]
    # submission.loc[i, '4'] = proba[3]

sub.to_csv(os.path.join(base_path,'submission_fasttext_ver1_epoch70_lr0.1_maxn15.csv'))
print('end')


end


# fasttext cv

In [None]:
fold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
sub2 = sub.copy()

In [None]:
for n_fold, (train_idx, valid_idx) in enumerate(fold.split(train['text'], train['author'])):
    train_x, train_y = train['text'].iloc[train_idx], train['author'].iloc[train_idx]
    valid_x, valid_y = train['text'].iloc[valid_idx], train['author'].iloc[valid_idx]

    train_x.index = range(len(train_x))
    train_y.index = range(len(train_y))

    file = open(os.path.join(base_path,f'fasttexttrain_cv{n_fold}.txt'),'w+')
    for i in train_x.index:
        line = '__label__' + str(train_y[i])+' '+train_x[i]
        file.write(line+"\n")


    text_clf_model = fasttext.train_supervised(
          os.path.join(base_path,f'fasttexttrain_cv{n_fold}.txt'),
          epoch=60,
          minCount=2, 
          maxn=10, 
          verbose=0
          )
    
    for i in test.index:
        lable, proba = text_clf_model.predict(test['text'][i], k=5)
        for la, pr in zip(lable, proba):
            if '__label__0' == la:
                sub.loc[i, '0'] = pr
            elif '__label__1' == la:
                sub.loc[i, '1'] = pr
            elif '__label__2' == la:
                sub.loc[i, '2'] = pr
            elif '__label__3' == la:
                sub.loc[i, '3'] = pr
            elif '__label__4' == la:
                sub.loc[i, '4'] = pr

    sub2['0'] += sub['0'] / fold.n_splits
    sub2['1'] += sub['1'] / fold.n_splits
    sub2['2'] += sub['2'] / fold.n_splits
    sub2['3'] += sub['3'] / fold.n_splits
    sub2['4'] += sub['4'] / fold.n_splits
    print(n_fold)

sub2.to_csv(os.path.join(base_path,'submission_fasttext_cv5_ver1.csv'))
print('end')

0
1
2
3
4
end


- fast text cv성능 0.31
- fast text 단일 epoch 70성능 0.305