In [None]:
# Fetch the KoBERT source repository into the current working directory.
!git clone https://github.com/SKTBrain/KoBERT.git

In [None]:
# Switch into the cloned repository and install the packages it lists.
%cd /content/KoBERT/
!pip install -r requirements.txt

In [None]:
# Authenticate the Colab user, then mount Google Drive at /content/gdrive.
from google.colab import auth
auth.authenticate_user()
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, notebook

from torch.nn import init
import gc
import unicodedata
import re

import requests
import pprint
import json

import sys

In [None]:
class BERTDataset(Dataset):
    """Dataset that holds exactly one BERT-transformed input.

    ``data`` is clipped to at most ``max_len`` elements (characters, when it
    is a string), wrapped in a one-element list and passed through
    ``nlp.data.BERTSentenceTransform``. The transformed result is the only
    item of the dataset.

    Args:
        data: Input to encode; must support ``len`` and slicing.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Used both as the clip length for ``data`` and as the
            transform's ``max_seq_length``.
        pad: Forwarded to the transform's ``pad`` argument.
        pair: Forwarded to the transform's ``pair`` argument.
    """

    def __init__(self, data, bert_tokenizer, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Inputs longer than max_len are cut before the transform sees them.
        clipped = data if len(data) <= max_len else data[:max_len]
        self.sentences = [to_features([clipped])]

    def __getitem__(self, i):
        return self.sentences[i]

    def __len__(self):
        return len(self.sentences)

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    The pooled output returned by ``bert`` is optionally passed through
    dropout and then through one linear layer that yields ``num_classes``
    scores per input.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and
            must return a 2-tuple whose second element is the pooled output.
        hidden_size: Feature width of the pooled output.
        num_classes: Number of scores produced per input.
        dr_rate: Dropout probability. A falsy value (the default ``None``)
            disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=11,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` is 1 for the first ``valid_length[i]`` positions and 0 for
        the rest.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier scores for a batch.

        Args:
            token_ids: Token id tensor fed to the encoder as ``input_ids``.
            valid_length: Per-row count of non-padding tokens.
            segment_ids: Segment ids, cast to long and fed to the encoder as
                ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled output so the head also works when dropout is
        # disabled; previously `out` was only bound inside the dropout branch.
        out = pooler
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)

In [None]:
def GetMediaCategory(text):
    """Classify ``text`` with ``modelbest`` and return labels and top scores.

    The text is NFC-normalised and reduced to its runs of Hangul syllables
    (joined by single spaces). If nothing remains, the placeholder '기타' is
    classified instead. ``None`` is treated like an empty string so callers
    can pass a missing field (for example a video without tags).

    Relies on the notebook globals ``tok``, ``max_len``, ``batch_size``,
    ``num_workers``, ``device``, ``modelbest``, ``threshold`` and
    ``categorylist``.

    Args:
        text: String to classify, or None.

    Returns:
        ``(labels, scores)``: for every model output row, the entry of
        ``categorylist`` at the arg-max position when the maximum score
        exceeds ``threshold`` (otherwise '기타'), and that maximum score as a
        Python float.
    """
    text = unicodedata.normalize('NFC', text or '')
    text = ' '.join(re.findall('[가-힣]+', text))
    if len(text) == 0:
        text = '기타'

    data = BERTDataset(text, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data, batch_size=batch_size, num_workers=num_workers)
    gc.collect()
    wholeout = []
    wholevalue = []

    # Inference only: switch dropout off and skip autograd bookkeeping so the
    # scores are deterministic. Note that eval() changes the mode of the
    # shared global model.
    modelbest.eval()
    with torch.no_grad():
        for token_ids, valid_length, segment_ids in notebook.tqdm(test_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = modelbest(token_ids, valid_length, segment_ids)
            print('out: ',out)
            for outi in out:
                top_score = outi.max().item()
                wholevalue.append(top_score)
                if top_score > threshold:
                    wholeout.append(categorylist[int(outi.argmax())])
                else:
                    wholeout.append('기타')

    return wholeout, wholevalue

In [None]:
# Device selection. Inference currently runs on the CPU.
# To run on a GPU instead (the server has five, addressable as cuda:0 to
# cuda:4), replace the assignment below with:
# device = torch.device("cuda:0")
device = torch.device('cpu')

gc.collect()

# Load the KoBERT model with its vocabulary, then build the tokenizer that
# GetMediaCategory passes to BERTDataset.
bertmodel, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

/content/KoBERT/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]
using cached model. /content/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
# Parameters. max_len, batch_size and num_workers are read by
# GetMediaCategory; the remaining values are not referenced by the inference
# code visible in this notebook.
max_len = 512
batch_size = 6
warmup_ratio = 0.1
num_epochs = 20
max_grad_norm = 1
log_interval = 20
learning_rate =  5e-6  # other values noted here: 5e-5, 2e-5
num_workers = 2
n_splits = 5
model_name = 'kobertbest_512.pt'

# Label for each output position of the classifier.
categorylist = ["화장품","패션","요리음식","여행아웃도어","인테리어","엔터테인먼트","육아","아이티","자동차","헬스/피트니스","반려동물"]
# The top score must exceed this for a category to be reported; otherwise
# GetMediaCategory answers '기타'.
threshold = 5.26

# NOTE(review): torch.load unpickles the whole model object, which can run
# arbitrary code; only load checkpoints from a trusted source.
modelbest = torch.load("/content/gdrive/MyDrive/Colab Notebooks/" + model_name, map_location=device)
modelbest.to(device)
# Put the model in evaluation mode so dropout is disabled during inference.
modelbest.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [None]:
# Install the packages used to fetch data from a YouTube URL
! pip install --upgrade google-api-python-client
! pip install --upgrade google-auth-oauthlib google-auth-httplib2
! pip install oauth2client
! pip install youtube-dl
! pip install git+https://github.com/Cupcakus/pafy
! pip install requests
!pip install youtube-transcript-api # for windows

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-api-python-client
  Downloading google_api_python_client-2.64.0-py2.py3-none-any.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 4.8 MB/s 
Collecting google-auth-httplib2>=0.1.0
  Downloading google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)
Installing collected packages: google-auth-httplib2, google-api-python-client
  Attempting uninstall: google-auth-httplib2
    Found existing installation: google-auth-httplib2 0.0.4
    Uninstalling google-auth-httplib2-0.0.4:
      Successfully uninstalled google-auth-httplib2-0.0.4
  Attempting uninstall: google-api-python-client
    Found existing installation: google-api-python-client 1.12.11
    Uninstalling google-api-python-client-1.12.11:
      Successfully uninstalled google-api-python-client-1.12.11
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are 

In [None]:
import pafy
import requests
import nltk
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.tools import argparser
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
!pip install soynlp # Korean tokenization
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git # spacing package
!pip install git+https://github.com/ssut/py-hanspell.git # spell checking
!pip install konlpy
from konlpy.tag import Okt
# Shared Okt instance used by the GetYoutubeData* functions for POS tagging.
okt = Okt()
from collections import Counter
from soynlp.tokenizer import LTokenizer
from nltk import FreqDist
from pykospacing import Spacing
from hanspell import spell_checker
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
import urllib.request
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting soynlp
  Downloading soynlp-0.0.493-py3-none-any.whl (416 kB)
[K     |████████████████████████████████| 416 kB 5.2 MB/s 
Installing collected packages: soynlp
Successfully installed soynlp-0.0.493
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /tmp/pip-req-build-hvv4sk_m
  Running command git clone -q https://github.com/haven-jeon/PyKoSpacing.git /tmp/pip-req-build-hvv4sk_m
Collecting tensorflow==2.7.2
  Downloading https://us-python.pkg.dev/colab-wheels/public/tensorflow/tensorflow-2.7.2%2Bzzzcolab20220516114640-cp37-cp37m-linux_x86_64.whl (671.4 MB)
[K     |████████████████████████████████| 671.4 MB 1.8 kB/s 
Collecting argparse>=1.4.0
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting t

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /tmp/pip-req-build-_vwy35nx
  Running command git clone -q https://github.com/ssut/py-hanspell.git /tmp/pip-req-build-_vwy35nx
Building wheels for collected packages: py-hanspell
  Building wheel for py-hanspell (setup.py) ... [?25l[?25hdone
  Created wheel for py-hanspell: filename=py_hanspell-1.1-py3-none-any.whl size=4868 sha256=c41a3cae60f1d2fe64705528eead887bac9ea75fa4a8d6c01c26a6452bc98081
  Stored in directory: /tmp/pip-ephem-wheel-cache-p9ex2hid/wheels/ab/f5/7b/d4124bb329c905301baed80e2ae45aa14e824f62ebc3ec2cc4
Successfully built py-hanspell
Installing collected packages: py-hanspell
Successfully installed py-hanspell-1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-n

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


여기까지 건들지 말것

In [None]:
def GetYoutubeData1(youtube_url):
  """Collect title text, tags and caption nouns for one YouTube video.

  Args:
    youtube_url: Video URL, passed to ``pafy.new``.

  Returns:
    A 3-tuple ``(title, tag_data, caption_nouns)``:
      * title: the video title, one space, then the snippet description.
      * tag_data: the snippet tags joined by spaces, or None when the
        snippet has no 'tags' entry.
      * caption_nouns: distinct nouns found in the Korean transcript, most
        frequent first, joined by spaces; '' when no transcript was fetched.

  Raises:
    requests.HTTPError: the YouTube Data API answered with an error status.
    ValueError: the API response contains no item for the video id.
  """
  import os  # local import: only needed for the API-key override below

  # NOTE(review): this key is committed in the notebook; it should be rotated
  # and supplied through the YOUTUBE_API_KEY environment variable instead.
  DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "AIzaSyC1yBL6YbPZj5nwrtDa0tlXa6-7A3Ur5B8")

  # Substrings that disqualify a noun. Matching is by containment, so a noun
  # is dropped when it contains any of these anywhere.
  STOP_SUBSTRINGS = ('은', '는', '이', '가', '을', '를', '와', '과', '도', '에', '에서', '의', '거', '또', '것', '그리고')

  video = pafy.new(youtube_url)
  ID = video.videoid
  title = video.title

  response = requests.get(
      "https://www.googleapis.com/youtube/v3/videos",
      params={"part": "snippet", "key": DEVELOPER_KEY, "id": ID},
      timeout=10)
  response.raise_for_status()
  items = response.json().get("items") or []
  if not items:
    raise ValueError("YouTube API returned no video for id " + ID)
  snippet = items[0]['snippet']

  # Title and description are handed back as one piece of text.
  title += ' '
  title += snippet['description']

  tag_data = None
  if 'tags' in snippet:
    tag_data = ' '.join(snippet['tags'])
  else:
    print("no tag")

  # Korean transcript; a video without one simply contributes no nouns.
  # NOTE(review): segments are concatenated without a separator, so words at
  # segment boundaries are glued together - confirm this is intended.
  caption = ''
  try:
    srt = YouTubeTranscriptApi.get_transcript(ID,languages=['ko'])
    caption = ''.join(entry['text'] for entry in srt)
  except Exception:
    print('no caption')

  # POS-tag every whitespace-separated word and keep the nouns that contain
  # none of the stop substrings.
  noun_list = [
      token
      for word in caption.split(' ') if word
      for token, tag in okt.pos(word)
      if tag == 'Noun' and not any(stop in token for stop in STOP_SUBSTRINGS)
  ]
  count = Counter(noun_list)

  return title, tag_data, ' '.join(word for word, _ in count.most_common())

다 합친거

In [None]:
def GetYoutubeData2(youtube_url):
  """Return the nouns found in a video's caption, title text and tags.

  The Korean transcript, the title plus description, and the tags are split
  into words, POS-tagged with ``okt`` and reduced to nouns.

  Args:
    youtube_url: Video URL, passed to ``pafy.new``.

  Returns:
    The distinct nouns, most frequent first, joined by single spaces.

  Raises:
    requests.HTTPError: the YouTube Data API answered with an error status.
    ValueError: the API response contains no item for the video id.
  """
  import os  # local import: only needed for the API-key override below

  # NOTE(review): this key is committed in the notebook; it should be rotated
  # and supplied through the YOUTUBE_API_KEY environment variable instead.
  DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "AIzaSyC1yBL6YbPZj5nwrtDa0tlXa6-7A3Ur5B8")

  # Substrings that disqualify a noun. Matching is by containment, so a noun
  # is dropped when it contains any of these anywhere.
  STOP_SUBSTRINGS = ('은', '는', '이', '가', '을', '를', '와', '과', '도', '에', '에서', '의', '거', '또', '것', '그리고')

  video = pafy.new(youtube_url)
  ID = video.videoid
  title = video.title

  response = requests.get(
      "https://www.googleapis.com/youtube/v3/videos",
      params={"part": "snippet", "key": DEVELOPER_KEY, "id": ID},
      timeout=10)
  response.raise_for_status()
  items = response.json().get("items") or []
  if not items:
    raise ValueError("YouTube API returned no video for id " + ID)
  snippet = items[0]['snippet']

  title += ' '
  title += snippet['description']

  tag_data = None
  if 'tags' in snippet:
    tag_data = ' '.join(snippet['tags'])
  else:
    print("no tag")

  # Korean transcript; a video without one contributes no caption words.
  caption = ''
  try:
    srt = YouTubeTranscriptApi.get_transcript(ID,languages=['ko'])
    caption = ''.join(entry['text'] for entry in srt)
  except Exception:
    print('no caption')

  # Extend with whole words. `list += str` would add the text one character
  # at a time, and a missing tag list (None) must be skipped, not iterated.
  word_list = caption.split(' ')
  word_list += title.split()
  if tag_data:
    word_list += tag_data.split()

  # POS-tag every word and keep the nouns that contain none of the stop
  # substrings.
  noun_list = [
      token
      for word in word_list if word
      for token, tag in okt.pos(word)
      if tag == 'Noun' and not any(stop in token for stop in STOP_SUBSTRINGS)
  ]
  count = Counter(noun_list)

  return ' '.join(word for word, _ in count.most_common())

In [None]:
#categorylist = ["화장품","패션","요리음식","여행아웃도어","인테리어","엔터테인먼트","육아","아이티","자동차","헬스/피트니스","반려동물"]
def Check_Data(url):
  """Classify a video from four different text sources and print each result.

  Fetches the video data with ``GetYoutubeData1`` and ``GetYoutubeData2``,
  prints the title text, then runs ``GetMediaCategory`` on, in order: the
  title plus description, the tags, the caption nouns, and the nouns from
  everything combined. For each it prints the first predicted label and the
  list of top scores.

  Args:
    url: YouTube video URL.
  """
  title, tag, subtitle = GetYoutubeData1(url)
  combined = GetYoutubeData2(url)
  print('제목과 해시태그 추출: ',title)

  # A video without tags yields None; classify an empty string instead so the
  # tag step reports the fallback label rather than raising.
  candidates = (
      title,       # original title + description
      tag or '',   # original tags
      subtitle,    # nouns from the caption
      combined,    # nouns from caption, title and tags together
  )
  for text in candidates:
    classlist, valuelist = GetMediaCategory(text)
    print('모델에 넣은 결과: ', classlist[0],valuelist)
    print('\n')

In [None]:
Check_Data("https://www.youtube.com/watch?v=wJsTLv-o5G8&ab_channel=%ED%9A%8C%EC%82%AC%EC%9B%90A") # enter the video URL here

제목과 해시태그 추출:  2층짜리 올리브영 통째로 빌려 쇼핑하기｜보타닉힐보 광고 #유료광고 #보타닉힐보 #시카장벽크림
이 영상은 보타닉힐보 유료광고를 포함하고 있습니다.

1/12일 (화) 저녁 10시!!
회사원A와 권혁수가 함께하는 올라이브   http://bit.ly/35qT1bU
                  
❤ 시카판테놀블레미쉬 크림 기획 + 앰플 (선착순 증정)
60,000원 → 19,900원 ( 74% OFF)

❤ 판테놀크림미스트 + 마스크팩 7일  (선착순증정)
46,500원 → 10,800원 (74% OFF)

딱 한시간동안만 이렇게 판매한다고 하니, 많은 관심부탁드려요 :)


  0%|          | 0/1 [00:00<?, ?it/s]

out:  tensor([[ 6.1959, -0.1960,  0.3074, -0.7863, -0.2906, -0.5272, -0.2420, -0.6507,
         -0.9006, -0.2793, -0.2370]], grad_fn=<AddmmBackward0>)
모델에 넣은 결과:  화장품 [6.195860385894775]




  0%|          | 0/1 [00:00<?, ?it/s]

out:  tensor([[ 6.1457,  0.3281,  0.1817, -0.7382, -0.2401, -0.5820, -0.3017, -0.6743,
         -0.9986, -0.6665, -0.2012]], grad_fn=<AddmmBackward0>)
모델에 넣은 결과:  화장품 [6.145674228668213]




  0%|          | 0/1 [00:00<?, ?it/s]

out:  tensor([[ 6.2188, -0.1927,  0.1185, -0.7249, -0.3155, -0.5064, -0.2924, -0.5903,
         -0.8115, -0.2886, -0.2114]], grad_fn=<AddmmBackward0>)
모델에 넣은 결과:  화장품 [6.218812942504883]




  0%|          | 0/1 [00:00<?, ?it/s]

out:  tensor([[ 6.2193, -0.1611,  0.1242, -0.7226, -0.3267, -0.5050, -0.2938, -0.6030,
         -0.8116, -0.3164, -0.2106]], grad_fn=<AddmmBackward0>)
모델에 넣은 결과:  화장품 [6.219254970550537]


