# 데이터 생성/전처리

In [None]:
from pytube import YouTube
from pytube.innertube import _default_clients

# Override the client versions that pytube reports to YouTube.
# NOTE(review): presumably a workaround for requests being rejected with the
# versions bundled in pytube — confirm against the installed pytube release.
_client_versions = {
    "ANDROID": "19.08.35",
    "IOS": "19.08.35",
    "ANDROID_EMBED": "19.08.35",
    "IOS_EMBED": "19.08.35",
    "IOS_MUSIC": "6.41",
}
for _client_name, _client_version in _client_versions.items():
    _default_clients[_client_name]["context"]["client"]["clientVersion"] = _client_version

# ANDROID_MUSIC reuses the ANDROID_CREATOR client definition as-is.
_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]

# Pick the first audio-only stream of the video and save it in the working
# directory under the file name the transcription and splitting cells read.
yt = YouTube('https://www.youtube.com/watch?v=pCnG7O_mbCs')
audio_streams = yt.streams.filter(only_audio=True)
first_audio = audio_streams.first()
first_audio.download(output_path='.', filename='youtube_output_6.mp3')

In [None]:
import torch
# Ask PyTorch whether a CUDA device is usable; as the cell's last expression
# the returned boolean is what the notebook displays.
torch.cuda.is_available()

In [None]:
import torch
import whisper

# Load Whisper's 'small' checkpoint on the GPU when PyTorch can see one and
# fall back to the CPU otherwise, so the cell does not fail on a machine
# without CUDA (the same rule the inference section uses for `device`).
whisper_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = whisper.load_model('small', device=whisper_device)

# Transcribe the downloaded audio. The following cells read the 'text' and
# 'segments' entries of `result`.
result = model.transcribe('youtube_output_6.mp3')

In [None]:
# Display the 'text' entry of the transcription result.
result['text']

In [None]:
# Print each transcribed segment as "[start --> end] text", with the times
# rounded to two decimals.
# The subscripts use double quotes on purpose: reusing the f-string's own
# quote character inside a replacement field is a SyntaxError before
# Python 3.12.
for temp in result['segments']:
    print(f'[{temp["start"]:.2f} --> {temp["end"]:.2f}] {temp["text"]}')

In [61]:
import os
import subprocess

# Cut the downloaded audio into one WAV clip per transcribed segment.
# ffmpeg does not create missing output directories, so make sure it exists.
os.makedirs('voice_data', exist_ok=True)

for index, temp in enumerate(result['segments']):
    # Argument-list form: no shell is involved, so nothing needs quoting and
    # the cell also runs outside IPython.
    completed = subprocess.run(
        [
            'ffmpeg', '-y',
            '-i', 'youtube_output_6.mp3',
            '-ss', str(temp['start']),
            '-to', str(temp['end']),
            '-hide_banner', '-loglevel', 'error',
            f'voice_data/output_6_{index+1}.wav',
        ],
        capture_output=True,
        text=True,
    )
    # Keep processing the remaining segments, but say which clip failed and
    # why instead of leaving an anonymous error line in the output.
    if completed.returncode != 0:
        print(f'ffmpeg failed for segment {index+1}: {completed.stderr.strip()}')

In [62]:
import os

# Append this video's transcript to 'textdata.txt', keeping whatever records
# the file already holds. Each record is "<clip id>|<text>|<text>": the
# stripped segment text is written twice.
# NOTE(review): presumably the two text columns are the raw/normalized fields
# expected by the trainer — confirm against the dataset formatter in use.
# NOTE(review): running this cell again appends the same records again.
try:
    with open('textdata.txt', 'r', encoding='utf-8') as f:
        existing_data = f.readlines()
except FileNotFoundError:
    # First run: there is nothing to preserve yet.
    existing_data = []

# A file whose last record lacks a trailing newline would otherwise get the
# first new record glued onto that line.
if existing_data and not existing_data[-1].endswith('\n'):
    existing_data[-1] += '\n'

new_data = []
for index, temp in enumerate(result['segments']):
    transcript = temp['text'].strip()
    new_data.append(f'output_6_{index+1}|{transcript}|{transcript}\n')

merged_data = existing_data + new_data

with open('textdata.txt', 'w', encoding='utf-8') as f:
    f.writelines(merged_data)

In [None]:
from IPython.display import Audio
# Render an inline audio player for one clip.
# NOTE(review): the clip is named output_4_26, not output_6_*, so it is
# presumably left over from processing a different video — confirm it exists.
Audio('voice_data/output_4_26.wav')

# 모델 추론

In [1]:
import torch
from TTS.api import TTS
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Locations of the fine-tuned checkpoint and its configuration file.
model_path = 'tts_model/tts_model.pth'
config_path = 'tts_model/config.json'
# Build the high-level TTS wrapper from the local checkpoint and config.
tts = TTS(model_path=model_path, config_path=config_path)

In [None]:
# Move the TTS wrapper to the device selected earlier ('cuda' or 'cpu').
tts.to(device)

In [None]:
# Print the `speakers` attribute of the loaded TTS wrapper.
print(tts.speakers)

In [29]:
# Sentence to synthesize and the WAV file the result is written to.
text = '지금 시간 얼마나 됐니?'
output_path = './t.wav'

In [None]:
# Synthesize `text` with language code 'ko' and write the audio to
# `output_path`; sentence splitting is enabled.
tts.tts_to_file(text=text, file_path=output_path, language='ko', split_sentences=True)

In [1]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

In [2]:
# Vocabulary file, checkpoint and config passed to the XTTS loading calls.
tokenizer_path = 'XTTS_v2.0_original_model_files/vocab.json'
model_path = 'tts_model/tts_model.pth'
config_path = 'tts_model/config.json'

In [3]:
# Create a config object and fill it from the JSON file at `config_path`.
config = XttsConfig()
config.load_json(config_path)
# Instantiate the model from the config. This rebinds `model`, which the
# data-preparation section used for the Whisper model.
model = Xtts.init_from_config(config)
# Load the checkpoint and vocabulary; DeepSpeed is explicitly disabled.
model.load_checkpoint(config, checkpoint_path=model_path, vocab_path=tokenizer_path, use_deepspeed=False)
# Move the model to the GPU. As the cell's last expression, its return value
# is what the notebook prints (the module listing shown below the cell).
model.cuda()

Xtts(
  (gpt): GPT(
    (conditioning_encoder): ConditioningEncoder(
      (init): Conv1d(80, 1024, kernel_size=(1,), stride=(1,))
      (attn): Sequential(
        (0): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Identity()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (1): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Identity()
          (proj_out): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
        )
        (2): AttentionBlock(
          (norm): GroupNorm32(32, 1024, eps=1e-05, affine=True)
          (qkv): Conv1d(1024, 3072, kernel_size=(1,), stride=(1,))
          (attention): QKVAttention()
          (x_proj): Ide

In [4]:
import os
# Compute the two conditioning values the inference call takes from a single
# reference clip of the target voice.
# NOTE(review): the clip is read from 'voice_data/wavs/', while the splitting
# cell writes to 'voice_data/' — presumably the clips were moved; confirm.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=['voice_data/wavs/output_6_195.wav'])

In [148]:
# Synthesize one passage. Positional arguments, in order: the text, the
# language code, then the two conditioning values computed from the
# reference clip. `out["wav"]` is read by the saving cell.
out = model.inference(
    '우리는 도쿄 디즈니랜드에 도착했습니다. 이번에는 몰입 환경에 들어가 마법 같은 테마파크 속에서 환상적인 체험을 시작해볼까요?',
    'ko',
    gpt_cond_latent,
    speaker_embedding,
    # Sampling temperature handed to the model.
    temperature=0.2,
)

In [149]:
# Convert the generated samples to a tensor, add a leading dimension with
# unsqueeze(0) and write them as a WAV file at a 24000 Hz sample rate.
# NOTE(review): assumes out["wav"] is 1-D and that 24000 matches the model's
# output rate — confirm.
torchaudio.save('fix_audio_file/도쿄디즈니랜드.wav', torch.tensor(out["wav"]).unsqueeze(0), 24000)

In [160]:
from pydub import AudioSegment

def combine_wav(file1, file2, output_file):
    """Join two WAV files end to end and write the result as a WAV file.

    Args:
        file1: Path of the WAV file that comes first.
        file2: Path of the WAV file appended after ``file1``.
        output_file: Path the concatenated audio is exported to.
    """
    # Load both inputs in order; a failure on the first stops before the second.
    leading, trailing = (AudioSegment.from_wav(path) for path in (file1, file2))

    # `+` on two segments yields the first followed by the second.
    (leading + trailing).export(output_file, format="wav")

# Input file paths and the output file path.
file1 = 'fix_audio_file/일본_0.wav'
file2 = 'fix_audio_file/일본_1.wav'
output_file = 'fix_audio_file/일본.wav'

# Join the two parts into a single WAV file.
combine_wav(file1, file2, output_file)
