# AlbumDreamer
Structure: three modular generators (text, art and music), followed by a "Full Pipeline" that combines all three, and finally an automatic SQL database builder.

Tweak the code for your own use (for example, change where generated files are stored, or skip the LoRAs/textual inversion).

In [1]:
# Mount Google Drive at /content/drive (Google Colab only). The model, LoRA and
# embedding paths configured later in this notebook point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Text generation

In [None]:
!pip install openai

In [39]:
import openai
import os
import pandas as pd

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be saved inside the notebook; the original placeholder remains the fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "INSERT KEY HERE IF USING GPT-4")

class AlbumChat:
  """Multi-turn chat session that plans an album around a topic.

  Every prompt is appended to the same message history, so later prompts
  ("the table of tracks before", "the previous analyses") rely on the earlier
  answers still being present in ``self.messages``.
  """

  # Chat model used for every request of the session.
  MODEL = "gpt-4-0613"

  def __init__(self, topic):
    """Start a fresh conversation about ``topic`` (a string)."""
    self.messages = [{"role": "system", "content": "You are a helpful assistant."}]
    self.topic = topic

  #Generates all the text data necessary to create the album.
  def plan_full_album(self):
    """Run the whole conversation and parse the two resulting tables.

    Returns:
      A ``(df_album, df_tracks)`` tuple of pandas DataFrames. ``df_album`` is
      parsed from the album table reply (requested fields: Imagery,
      Description, Title) and ``df_tracks`` from the refined tracks table
      reply (requested fields: Title, Imagery, Music).
    """
    # Imported here because this cell does not import StringIO itself; without
    # it the method raises NameError unless a later cell has already been run.
    from io import StringIO

    # The first three prompts only build up context; their replies are unused.
    self.analyse_topic()
    self.plan_soundtrack_characteristics()
    self.create_tracks_table()
    album = self.create_album_table()
    tracks = self.refine_tracks_table()

    df_album = pd.read_csv(StringIO(album), sep='|')
    df_tracks = pd.read_csv(StringIO(tracks), sep='|')
    # The model may pad the header cells ("Title | Imagery | Music"); strip the
    # column names so that lookups such as df["Imagery"] keep working.
    for frame in (df_album, df_tracks):
      frame.columns = frame.columns.str.strip()
    return df_album, df_tracks

  def chat(self, message):
    """Send ``message`` as the user, record the exchange and return the reply text."""
    self.messages.append({"role": "user", "content": message})
    # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface; an
    # unpinned `pip install openai` may install a version without it - confirm.
    completion = openai.ChatCompletion.create(model=self.MODEL, messages=self.messages)
    response = completion.choices[0]["message"]["content"]
    self.messages.append({"role": "assistant", "content": response})
    return response

  def analyse_topic(self):
    """Ask for an analysis and summary of the topic; returns the reply text."""
    message = "The topic is: " + self.topic + ".\nGive a thorough analysis and summary. If unsure, imagine what it could be like."
    return self.chat(message)

  def plan_soundtrack_characteristics(self):
    """Ask which characteristics a soundtrack for the topic should have; returns the reply text."""
    message = "Give an overview of what characteristics a soundtrack/album for it should be like. If unsure, imagine what it could be like. Focus on the following sections: cultural elements, tones, instruments, genres, unique twists. No vocals. Keep it short but be specific."
    return self.chat(message)

  def create_tracks_table(self):
    """Ask for a "|"-separated table of 5 tracks (Title, Imagery, Music); returns the raw reply."""
    message = "Let's create that album/soundtrack.\nThere will be 5 instrumental pieces within, each being creative, interesting, unique and fits the previous analyses. For each entry in the table:\n- Generate a track title.\n- Generate a short prompt for a concept artwork to inspire the music. Ensure it is creative, visually interesting, unique and fits the previous analyses. Example of an Imagery prompt: \"futuristic resort with beach, dreamy summer palette, surrealism, smooth, epic details, travel, bird view\"\n- Generate a short prompt of the soundscape of the track, with no fluff. Include all instruments and exclude any vocals. Ensure it's interesting, creative, unique and utilize the prior analyses. Example of a Music prompt: \"A whimsical instrumental with glockenspiel, ukulele, and xylophone, capturing the enchanting spirit of Steven's early adventures.\"\nGenerate a csv table for all this with \"|\" as a seperator, with these fields: Title, Imagery, Music.\nDon't output anything else than the table and don't surround with quotations marks. Make sure to include the field titles."
    return self.chat(message)

  def create_album_table(self):
    """Ask for a "|"-separated album table (Imagery, Description, Title); returns the raw reply."""
    message = "Let's finalize the album:\n-Generate a very short prompt to be used to generate an image as a cover art. Make it standout, creative and at least a bit surreal. Never directly use names within, replace by detailed appearance descriptions (such as color and clothes). Don't write full sentences and keep it concise.\n-Generate a description for the back of the album. Make it interesting and based on its contents.\n-Generate a title for the album. Make it strange, creative and unique.\nGenerate a csv table for all this with \"|\" as a seperator, with these fields: Imagery, Description, Title.\nDon't output anything else than the table and don't surround with quotations marks. Make sure to include the field titles."
    return self.chat(message)

  #Replace names with visual description.
  def refine_tracks_table(self):
    """Ask for the tracks table again with names in Imagery replaced by appearance descriptions; returns the raw reply."""
    message = "For the table of tracks before, you should never directly use names for the Imagery section, so replace those there with detailed appearance descriptions (such as gender, color, clothes...).\nGenerate a csv table for all this with \"|\" as a seperator, with these fields: Title, Imagery, Music.\nDon't output anything else than the table and don't surround with quotations marks. Make sure to include the field titles."
    return self.chat(message)

## Art generation

### Convert ckpt or safetensors model+vae to diffusers

In [None]:
# !git clone https://github.com/huggingface/diffusers.git
# %cd ./diffusers/
# !pip install -e ".[torch]"
# !pip install transformers
# !pip install omegaconf
# !pip install pytorch_lightning
# %cd ../

In [None]:
# !python3 ./diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py --checkpoint_path /content/drive/MyDrive/data/models/diffusion/jimEIDOMODE_version10.safetensors  --dump_path /content/drive/MyDrive/data/models/diffusion/jim --from_safetensors

In [None]:
# !python3 ./diffusers/scripts/convert_vae_pt_to_diffusers.py --vae_pt_path /content/drive/MyDrive/data/models/diffusion/vaes/kl-f8-anime2.ckpt --dump_path /content/drive/MyDrive/data/models/diffusion/vaes/kl-f8-anime2-diffuser



### Prepare pipeline. Diffusers model and Lora safetensors.

In [None]:
!pip install diffusers transformers accelerate safetensors

In [None]:
import safetensors
import torch
from PIL import Image
from io import BytesIO

import torch
from diffusers import StableDiffusionImg2ImgPipeline, DPMSolverMultistepScheduler

#INSERT OWN PATHS HERE
# Base model location handed to StableDiffusionImg2ImgPipeline.from_pretrained.
model_path = "/content/drive/MyDrive/data/models/diffusion/model"
# LoRA weights file (safetensors) loaded on top of the base model.
lora_path = "/content/drive/MyDrive/data/models/diffusion/LoRA/lora.safetensors"
# Folder containing textual-inversion embeddings, and the embedding file to load from it.
textual_inversion_path = "/content/drive/MyDrive/sd/stable-diffusion-webui/embeddings"
textual_inversion_name = "easynegative.pt"

class ImageGenerator:
  """Generates artwork with a Stable Diffusion img2img pipeline plus a LoRA and a textual inversion."""

  def __init__(self, model_path, lora_path, text_invers_path, text_invers_name):
    """Store the resource locations; nothing is loaded until ``load_pipeline`` runs.

    Args:
      model_path: base model location passed to ``from_pretrained``.
      lora_path: LoRA safetensors file.
      text_invers_path: folder holding the textual-inversion embedding.
      text_invers_name: file name of the embedding inside that folder.
    """
    self.model_path = model_path
    self.lora_path = lora_path
    self.text_invers_path = text_invers_path
    self.text_invers_name = text_invers_name
    # Set by load_pipeline(); None means "not loaded yet".
    self.pipe_img2img = None

  def load_pipeline(self):
    """Build the img2img pipeline on CUDA and attach the LoRA and the embedding.

    Uses the paths given to the constructor (not the module-level globals), so
    several generators with different models can coexist.
    """
    # Import the submodule explicitly: a bare `import safetensors` does not
    # by itself guarantee that `safetensors.torch` is available.
    from safetensors.torch import load_file

    scheduler = DPMSolverMultistepScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")

    self.pipe_img2img = StableDiffusionImg2ImgPipeline.from_pretrained(self.model_path,
                                                          scheduler=scheduler,
                                                          custom_pipeline="lpw_stable_diffusion",
                                                          torch_dtype=torch.float16,
                                                          safety_checker=None).to("cuda")

    lora_state_dict = load_file(self.lora_path, device="cpu")
    self.pipe_img2img.load_lora_weights(lora_state_dict)
    # NOTE(review): _lora_scale is a private pipeline attribute - confirm it is
    # still honoured by the installed diffusers version.
    self.pipe_img2img._lora_scale = 0.8

    self.pipe_img2img.load_textual_inversion(self.text_invers_path, weight_name=self.text_invers_name, token="easynegative")

  @staticmethod
  def _output_filename(imagery):
    """Return a ``./<name>.png`` path derived from the first 50 characters of ``imagery``."""
    # Characters that are unsafe in file names (e.g. "/" or ":") become "_".
    stem = "".join(ch if ch.isalnum() or ch in " -_," else "_" for ch in str(imagery)[:50]).strip()
    return "./{}.png".format(stem or "image")

  def generate_image(self, imagery, cover=False):
    """Generate, save and display one image for the ``imagery`` prompt.

    Args:
      imagery: text prompt describing the picture (string).
      cover: when True a square 512x512 base image is used (album cover),
        otherwise a 704x448 landscape base image (track artwork).

    The result is written to the current directory as a PNG named after the
    imagery text and shown with the notebook's ``display``.
    """
    if self.pipe_img2img is None:
      # Allow calling generate_image() without an explicit load_pipeline().
      self.load_pipeline()

    prompt = "(masterpiece, concept art, artstation, abstract, surrealism, jr korpa, oil painting, abstract brush strokes.:1.1) "\
    + imagery
    negative_prompt = "easynegative, text, naked"
    if cover:
      width, height = 512, 512
    else:
      width, height = 704, 448

    with torch.no_grad():
      # Step 1: Generate low res image. Starting from a blank image with
      # strength=1.0 means the input picture contributes nothing.
      image = Image.new(mode="RGB", size=(width, height))
      image = self.pipe_img2img(prompt=prompt,
                                negative_prompt=negative_prompt,
                                image=image,
                                strength=1.0,
                                num_inference_steps=20,
                                guidance_scale=7).images[0]

      # Step 2: Generate high res image from low res image (1.5x upscale,
      # then a partial re-diffusion at strength 0.60).
      width = int(width*1.5)
      height = int(height*1.5)
      image = image.resize((width, height))

      image = self.pipe_img2img(prompt=prompt,
                                negative_prompt=negative_prompt,
                                image=image,
                                strength=0.60,
                                num_inference_steps=20,
                                guidance_scale=7).images[0]

    # Name the file after the imagery text: the first 50 characters of the
    # full prompt are always the fixed style prefix, which made every image
    # overwrite the previous one.
    image.save(self._output_filename(imagery))
    display(image)

## Music generation

In [None]:
!python3 -m pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft

In [None]:
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

class MusicGenerator:
  """Generates music clips from text prompts with MusicGen."""

  def generate_samples(self, descriptions, duration=30):
    """Generate one clip per description and write each to ``{idx}.mp3``.

    Args:
      descriptions: list of text prompts, one per clip.
      duration: requested clip length, forwarded to MusicGen as ``duration``.

    Files are named after the position of the prompt in ``descriptions``, so a
    second call overwrites the files of the first one.
    """
    model = MusicGen.get_pretrained('large', device='cuda')
    # Must be a keyword argument: the first positional parameter of
    # set_generation_params is use_sampling, so a positional value never
    # reached the duration setting.
    model.set_generation_params(duration=duration)
    wav = model.generate(descriptions)  # generates samples.

    for idx, one_wav in enumerate(wav):
      # Will save under {idx}.mp3, with loudness normalization at -14 db LUFS.
      audio_write(f'{idx}', one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True, format="mp3")

Downloading state_dict.bin:   0%|          | 0.00/6.51G [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CLIPPING 0 happening with proba (a bit of clipping is okay): 0.0001968750002561137 maximum scale:  1.2023330926895142
CLIPPING 1 happening with proba (a bit of clipping is okay): 0.00011249999806750566 maximum scale:  1.1829293966293335
CLIPPING 2 happening with proba (a bit of clipping is okay): 1.0416666782475659e-06 maximum scale:  1.010634183883667
CLIPPING 3 happening with proba (a bit of clipping is okay): 0.004565624985843897 maximum scale:  1.713952660560608
CLIPPING 4 happening with proba (a bit of clipping is okay): 0.0076166666112840176 maximum scale:  3.3917903900146484


# Full Pipeline
Assumes sufficient VRAM to hold all models at once; otherwise the models need to be unloaded between steps.

In [None]:
class AlbumPipeline:
  """Chains text planning, image generation and music generation for one topic."""

  def generate_full_album(self, topic):
    """Plan an album for ``topic``, then render its cover, track artwork and music.

    Args:
      topic: subject of the album (string).

    Returns:
      The ``(df_album, df_tracks)`` DataFrames produced by the planning step.
    """
    #Generate text data and directions
    # The topic belongs to the AlbumChat constructor; plan_full_album takes no arguments.
    album_chat = AlbumChat(topic)
    df_album, df_tracks = album_chat.plan_full_album()

    #Generate image data (cover and tracks)
    image_gen = ImageGenerator(model_path, lora_path, textual_inversion_path, textual_inversion_name)
    image_gen.load_pipeline()
    # Pass the single cover prompt string, not the whole column.
    image_gen.generate_image(df_album["Imagery"].iloc[0], cover=True)
    for imagery in df_tracks["Imagery"]:
      image_gen.generate_image(imagery)

    #Generate music data
    music_gen = MusicGenerator()
    # The soundscape prompts live in the "Music" column; "Imagery" holds the artwork prompts.
    music_gen.generate_samples(df_tracks["Music"].tolist())

    return df_album, df_tracks

## SQL Database

Generates albums and stores them in a SQL database. Storage of the generated images and music is still a work in progress (storing those files directly in the database would be unwise).

In [None]:
import pandas as pd
from io import StringIO
import sqlite3
# Location of the SQLite database file; the example usage at the end of this
# file passes it to sqlite3.connect.
PATH_TO_DB = "/example/path/database.db"

class AlbumDatabaseGenerator:
  """Plans albums with AlbumChat and stores the text data in a SQLite database."""

  def __init__(self, conn):
    """Keep ``conn`` (a sqlite3 connection), open a cursor and create the tables if missing."""
    self.conn = conn
    self.cursor = self.conn.cursor()
    self.try_initialize_tables()

  def try_initialize_tables(self):
    """Create the Albums and Tracks tables unless they already exist."""
    self.cursor.execute("""
      CREATE TABLE IF NOT EXISTS Albums (
        ID INTEGER PRIMARY KEY,
        Imagery TEXT,
        Description TEXT,
        Title TEXT,
        Cover_URL VARCHAR(512))
    """)

    self.cursor.execute("""
      CREATE TABLE IF NOT EXISTS Tracks (
        ID INTEGER PRIMARY KEY,
        AlbumID INTEGER,
        NumberInAlbum INTEGER,
        Title TEXT,
        Imagery TEXT,
        Music TEXT,
        Image_URL VARCHAR(512),
        FOREIGN KEY(AlbumID) REFERENCES Albums(ID))
    """)

  def generate_albums_into_sql_db(self, topics):
    """Plan one album per topic and insert it, with its tracks, atomically.

    Args:
      topics: iterable of topic strings.

    Raises:
      Whatever the planning or the inserts raise. A failure while inserting
      rolls back the album being stored, so no partial album is left behind
      and the connection is not stuck inside an open transaction.
    """
    for topic in topics:
      albumChat = AlbumChat(topic)
      df_album, df_tracks = albumChat.plan_full_album()

      #Transaction for Inserting Album and Tracks data.
      self.cursor.execute("BEGIN TRANSACTION")
      try:
        self.store_album(df_album)
        # Read right after the album insert: this is the new album's row id.
        album_id = self.cursor.lastrowid
        self.store_tracks(df_tracks, album_id)
      except Exception:
        if self.conn.in_transaction:
          self.cursor.execute("ROLLBACK")
        raise
      else:
        self.cursor.execute("COMMIT TRANSACTION")

  def store_album(self, df_album):
    """Insert the first row of ``df_album`` (columns Imagery, Description, Title) into Albums."""
    first_row = df_album.iloc[0]
    self.cursor.execute(
      "INSERT INTO Albums (Imagery, Description, Title) VALUES (?,?,?)",
      (first_row["Imagery"], first_row["Description"], first_row["Title"]),
    )

  def store_tracks(self, df_tracks, album_id):
    """Insert every row of ``df_tracks`` (columns Title, Imagery, Music) for ``album_id``.

    NumberInAlbum is the 0-based position of the row in the table.
    """
    for position, (_, row) in enumerate(df_tracks.iterrows()):
      self.sql_query_insert_track(album_id, position, row["Title"], row["Imagery"], row["Music"])

  def sql_query_insert_track(self, album_id, number_in_album, title, imagery, music):
    """Insert a single track row using a parameterized statement."""
    self.cursor.execute(
      "INSERT INTO Tracks (AlbumID, NumberInAlbum, Title, Imagery, Music) VALUES (?,?,?,?,?)",
      (album_id, number_in_album, title, imagery, music),
    )

## EXAMPLE USAGE:
# topics = ["Dracula"]
# conn = sqlite3.connect(PATH_TO_DB)
# generator = AlbumDatabaseGenerator(conn)
# generator.generate_albums_into_sql_db(topics)