In [1]:
# Install SentencePiece with the %pip magic so the package lands in the
# environment of the running kernel (a `!pip` shell call may hit another one).
%pip install sentencepiece



In [2]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv


In [3]:
# Download the IMDb review CSV into the working directory, load it with
# pandas, and display the `review` column.
imdb_url = "https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv"
imdb_csv = "IMDb_Reviews.csv"
urllib.request.urlretrieve(imdb_url, filename=imdb_csv)
train_df = pd.read_csv(imdb_csv)
train_df['review']


Unnamed: 0,review
0,My family and I normally do not watch local mo...
1,"Believe it or not, this was at one time the wo..."
2,"After some internet surfing, I found the ""Home..."
3,One of the most unheralded great works of anim...
4,"It was the Sixties, and anyone with long hair ..."
...,...
49995,the people who came up with this are SICK AND ...
49996,"The script is so so laughable... this in turn,..."
49997,"""So there's this bride, you see, and she gets ..."
49998,Your mind will not be satisfied by this nobud...


In [4]:
# Print the number of reviews (rows) in the IMDb frame.
n_reviews = len(train_df)
print('리뷰 개수 :', n_reviews)


리뷰 개수 : 50000


In [6]:
# Dump every review into a single UTF-8 text file, one review per
# newline-joined entry; this file is the input to SentencePiece training.
with open('imdb_review.txt', mode='w', encoding='utf8') as corpus_file:
    corpus_text = '\n'.join(train_df['review'])
    corpus_file.write(corpus_text)


In [8]:
# Train a BPE SentencePiece model on the IMDb corpus. With the `imdb` prefix
# the trainer produces the `imdb.model` / `imdb.vocab` files used below.
imdb_train_flags = ' '.join([
    '--input=imdb_review.txt',
    '--model_prefix=imdb',
    '--vocab_size=5000',
    '--model_type=bpe',
    '--max_sentence_length=9999',
])
spm.SentencePieceTrainer.Train(imdb_train_flags)


In [9]:
# Read the trained vocabulary file: tab-separated, no header row.
# QUOTE_NONE makes pandas treat quote characters as ordinary text.
imdb_vocab_path = 'imdb.vocab'
vocab_list = pd.read_csv(
    imdb_vocab_path,
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)
# Show 10 randomly chosen rows (unseeded, so the rows differ between runs).
vocab_list.sample(n=10)


Unnamed: 0,0,1
2858,ites,-2855
3204,▁avail,-3201
885,▁3,-882
340,ves,-337
4971,L,-4968
3114,▁suspect,-3111
769,ead,-766
4792,▁owner,-4789
4230,▁thin,-4227
2278,▁mil,-2275


In [10]:
# Number of rows in the vocabulary table.
vocab_list.shape[0]


5000

In [11]:
# Create a processor and load the trained IMDb model into it.
# The result of load() is the cell's displayed value.
vocab_file = "imdb.model"
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)


True

In [12]:
# For each sample sentence, show the raw text, its subword pieces, and the
# matching integer ids, followed by a blank separator line.
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]
for sentence in lines:
  print(sentence)
  pieces = sp.encode_as_pieces(sentence)
  print(pieces)
  ids = sp.encode_as_ids(sentence)
  print(ids, end='\n\n')


I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 624, 4950, 4926, 139, 170, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]



In [13]:
# Number of pieces known to the loaded model.
sp.get_piece_size()


5000

In [14]:
# Look up the piece string stored under id 430.
sp.id_to_piece(430)


'▁character'

In [15]:
# Reverse lookup: the id assigned to a given piece string.
sp.piece_to_id('▁character')


430

In [16]:
# Decode ids back into text. Ids are specific to one trained model, so take
# them from the loaded model instead of hard-coding a list: a list that does
# not match this model's vocabulary decodes to unrelated text.
sp.DecodeIds(sp.encode_as_ids('I have waited a long time for someone to film'))


'I have waited a long time for someone to film'

In [19]:
# Join a list of subword pieces back into plain text.
pieces = ['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
sp.decode_pieces(pieces)

'I have waited a long time for someone to film'

In [20]:
# encode() with out_type=str prints pieces; with out_type=int it prints ids.
sample_text = 'I have waited a long time for someone to film'
for piece_type in (str, int):
    print(sp.encode(sample_text, out_type=piece_type))


['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]


In [21]:
import pandas as pd
import sentencepiece as spm
import urllib.request
import csv


In [22]:
# Download the NSMC ratings file, load it as a tab-separated table, and
# preview the first five rows.
nsmc_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt"
nsmc_path = "ratings.txt"
urllib.request.urlretrieve(nsmc_url, filename=nsmc_path)
naver_df = pd.read_table(nsmc_path)
naver_df.head(5)


Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [23]:
# Print the number of reviews (rows) in the Naver frame.
n_reviews = len(naver_df)
print('리뷰 개수 :', n_reviews)


리뷰 개수 : 200000


In [24]:
# True if any cell anywhere in the frame is null.
has_missing = naver_df.isnull().values.any()
print(has_missing)


True


In [26]:
# Remove every row that contains at least one null value ...
naver_df = naver_df.dropna(how='any')
# ... then confirm that no nulls remain.
still_missing = naver_df.isnull().values.any()
print(still_missing)


False


In [27]:
# Print the number of reviews left after the null rows were dropped.
n_reviews = len(naver_df)
print('리뷰 개수 :', n_reviews)


리뷰 개수 : 199992


In [28]:
# Dump every review text into a single UTF-8 file, newline-joined; this file
# is the input to SentencePiece training.
with open('naver_review.txt', mode='w', encoding='utf8') as corpus_file:
    corpus_text = '\n'.join(naver_df['document'])
    corpus_file.write(corpus_text)


In [29]:
# Train a BPE SentencePiece model on the Naver corpus. With the `naver` prefix
# the trainer produces the `naver.model` / `naver.vocab` files used below.
naver_train_flags = ' '.join([
    '--input=naver_review.txt',
    '--model_prefix=naver',
    '--vocab_size=5000',
    '--model_type=bpe',
    '--max_sentence_length=9999',
])
spm.SentencePieceTrainer.Train(naver_train_flags)


In [30]:
# Read the trained vocabulary file: tab-separated, no header row.
# QUOTE_NONE makes pandas treat quote characters as ordinary text.
naver_vocab_path = 'naver.vocab'
vocab_list = pd.read_csv(
    naver_vocab_path,
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)
# Show the first 10 rows.
vocab_list.head(10)


Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,영화,-1
5,▁영화,-2
6,▁이,-3
7,▁아,-4
8,...,-5
9,ᄏᄏ,-6


In [31]:
# Show 10 randomly chosen rows (unseeded, so the rows differ between runs).
vocab_list.sample(n=10)


Unnamed: 0,0,1
502,었음,-499
1041,▁포스터,-1038
4776,쾅,-4773
3495,결,-3492
911,▁시나리오,-908
32,....,-29
3678,훈,-3675
2699,▁개판,-2696
2557,▁폐,-2554
4962,썪,-4959


In [32]:
# Number of rows in the vocabulary table.
vocab_list.shape[0]


5000

In [33]:
# Replace the processor with one that has the trained Naver model loaded.
# The result of load() is the cell's displayed value.
vocab_file = "naver.model"
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)


True

In [34]:
# For each sample sentence, show the raw text, its subword pieces, and the
# matching integer ids, followed by a blank separator line.
lines = [
  "뭐 이딴 것도 영화냐.",
  "진짜 최고의 영화입니다 ㅋㅋ",
]
for sentence in lines:
  print(sentence)
  pieces = sp.encode_as_pieces(sentence)
  print(pieces)
  ids = sp.encode_as_ids(sentence)
  print(ids, end='\n\n')


뭐 이딴 것도 영화냐.
['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']
[136, 970, 1299, 2593, 3276]

진짜 최고의 영화입니다 ㅋㅋ
['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 204, 825, 121]



In [35]:
# Number of pieces known to the loaded model.
sp.get_piece_size()


5000

In [36]:
# Look up the piece string stored under id 4.
sp.id_to_piece(4)


'영화'

In [37]:
# Reverse lookup: the id assigned to a given piece string.
sp.piece_to_id('영화')


4

In [38]:
# Decode ids back into text. Ids are specific to one trained model, so take
# them from the loaded model instead of hard-coding a list: a list that does
# not match this model's vocabulary decodes to unrelated text.
sp.DecodeIds(sp.encode_as_ids('진짜 최고의 영화입니다 ㅋㅋ'))


'진짜 최고의 영화입니다 ᄏᄏ'

In [39]:
# Join a list of subword pieces back into plain text.
pieces = ['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
sp.decode_pieces(pieces)


'진짜 최고의 영화입니다 ᄏᄏ'

In [40]:
# encode() with out_type=str prints pieces; with out_type=int it prints ids.
sample_text = '진짜 최고의 영화입니다 ㅋㅋ'
for piece_type in (str, int):
    print(sp.encode(sample_text, out_type=piece_type))


['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 204, 825, 121]
