In [1]:
# IMPORT LIBRARIES

import os
import gzip
import lxml.etree as etree

import sqlite3

# LIBRARY FOR BATCH PROCESSING
from multiprocessing import Pool, Process, Queue

In [2]:
# Path of the SQLite database file.
# It is passed to sqlite3.connect() when the masters table is created and is
# handed to the writer process (process_write) in the main cell.
db_name = 'fresh_discogs.db'

# Generate File Paths Dictionary

In [3]:
# This cell builds a dictionary of the dump files for 2020-2022, grouped by file type.

# Base directory that contains one sub-directory per year (see generate_file_paths)
base_dir = "/mnt/data/public/discogs"
# File-name layout of a dump file.
# NOTE(review): pattern_template is not referenced anywhere else in the visible notebook.
pattern_template = "discogs_{year}{month_day}_{file_type}.xml.gz"

# Years and file types of interest
years = [2020, 2021, 2022]
# NOTE: file_types is reassigned further down in this cell (same values, different
# order) before generate_file_paths / separate_file_paths are called.
file_types = ["releases", "artists", "labels", "masters"]

def generate_file_paths(years, file_types, root_dir=None):
    """Collect the paths of all files whose name mentions one of ``file_types``.

    Args:
        years: Iterable of years; each one is looked up as the sub-directory
            ``<root_dir>/<year>``.
        file_types: Iterable of substrings (e.g. ``"releases"``). A file is
            kept when its name contains at least one of them.
        root_dir: Directory holding the per-year folders. Defaults to the
            module-level ``base_dir``.

    Returns:
        A list of full paths, grouped in the order of ``years`` and sorted by
        file name within each year. Every file appears at most once.
    """
    if root_dir is None:
        root_dir = base_dir

    # Materialise once so a generator argument can be re-scanned for every file.
    file_types = list(file_types)

    file_paths = []
    for year in years:
        # Construct the directory path for the year
        year_dir = os.path.join(root_dir, str(year))

        # isdir (rather than exists) so a stray regular file named like a year
        # is skipped instead of making os.listdir raise.
        if not os.path.isdir(year_dir):
            print(f"Directory for {year} does not exist. Skipping...")
            continue

        # os.listdir returns entries in arbitrary order; sort for a
        # reproducible result.
        for file in sorted(os.listdir(year_dir)):
            # any() stops at the first matching type, so a file whose name
            # contains several type names is still added only once.
            if any(file_type in file for file_type in file_types):
                file_paths.append(os.path.join(year_dir, file))

    return file_paths

def separate_file_paths(file_paths, file_types):
    """Group ``file_paths`` by file type.

    A path belongs to a type when it contains ``_<type>.`` (for example
    ``_masters.``). The result has one key per entry of ``file_types``, each
    mapped to the list of matching paths in their input order; types without
    any match map to an empty list.
    """
    buckets = dict((kind, []) for kind in file_types)
    # Pre-compute the marker looked for in each path, one per requested type.
    markers = [(kind, f"_{kind}.") for kind in file_types]

    for candidate in file_paths:
        for kind, marker in markers:
            if marker in candidate:
                buckets[kind].append(candidate)

    return buckets

# File types to separate (this replaces the file_types list defined at the top of the cell)
file_types = ['artists', 'masters', 'labels', 'releases']
# Collect every matching file for the configured years, then group the paths by
# file type: {file_type: [path, ...]}
file_paths = separate_file_paths(generate_file_paths(years, file_types), file_types)

# Bare expression so the notebook displays the resulting dictionary
file_paths

{'artists': ['/mnt/data/public/discogs/2020/discogs_20200101_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200220_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200301_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200401_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200501_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200703_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200806_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200901_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20201001_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20201101_artists.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20201201_artists.xml.gz',
  '/mnt/data/public/discogs/2021/discogs_20210101_artists.xml.gz',
  '/mnt/data/public/discogs/2021/discogs_20210201_artists.xml.gz',
  '/mnt/data/public/discogs/2021/discogs_20210301_artists.xml.gz',
  '/mnt/data/public/discogs/2021/discogs_20210401_a

# PREPARE BATCH FUNCTIONS

In [20]:
import os

def extract_year_and_month(file_path):
    """Return ``(year, month)`` as strings taken from a dump file name.

    The second underscore-separated field of the base name is read as a date
    stamp: its first four characters are the year, the next two the month.
    For ``discogs_20200101_masters.xml.gz`` this gives ``('2020', '01')``.
    A base name without an underscore raises ``IndexError``.
    """
    date_stamp = os.path.basename(file_path).split('_')[1]
    return date_stamp[:4], date_stamp[4:6]


In [21]:
###############################################################################################
# QUEUE - ensures that only one process reads from / writes to the SQL database, without delaying the file parsing processes
###############################################################################################

# Enqueue one batch-insert task for the writer process
def create_batch_insert_task(file_type, records, write_queue):
    """Put a ``(file_type, records)`` tuple on ``write_queue``."""
    task = (file_type, records)
    write_queue.put(task)
    
# This is the process run by our queue
def process_write(db_name, write_queue):
    """Drain ``write_queue`` and batch-insert the queued records into SQLite.

    Each queue item is either ``None`` (stop signal) or a
    ``(file_type, records)`` tuple. Only ``'masters'`` batches are written;
    ``'labels'`` and ``'releases'`` batches are still TODO and are discarded,
    as are empty batches and unknown file types.

    Args:
        db_name: Path of the SQLite database file to write to.
        write_queue: Queue-like object whose ``get()`` blocks until an item
            is available.

    A failed batch is reported with ``print`` and rolled back; the loop then
    continues with the next task. The connection is always closed on exit,
    including when the loop itself raises.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        while True:
            write_task = write_queue.get()  # This will block until an item is available

            # None is our signal to stop the process
            if write_task is None:
                print('Task None is received. Stopping process.')
                break

            file_type, records = write_task

            # Nothing to write for an empty batch
            if not records:
                continue

            if file_type == 'masters':
                # NOTE(review): the masters table created in this notebook has no
                # unique key besides the auto-generated id, so OR REPLACE looks
                # like it never replaces anything and a re-run appends
                # duplicate rows - confirm.
                try:
                    cursor.executemany("""
                        INSERT OR REPLACE INTO masters (main_release, year, file_year, file_month, artist_name, title, genre) 
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, records)
                    conn.commit()
                except Exception as e:
                    # Best effort: report and drop the failed batch, keep the writer alive
                    print(f"An error occurred during write: {e}")
                    conn.rollback()

            # TODO: execute the SQL batch insert for 'labels' batches
            # TODO: execute the SQL batch insert for 'releases' batches
            # Until then, those batches (and any unknown file type) are dropped.
    finally:
        # Release the database handle even if the loop above raised
        conn.close()




    
###############################################################################################
# Parallel Processing - creates a separate process for each file
###############################################################################################
def start_process_file_task(file_type, file_path, write_queue):
    """Start a separate process that parses one file and return it.

    Args:
        file_type: ``'labels'``, ``'masters'`` or ``'releases'``; selects the
            parser function run in the new process.
        file_path: Path of the file to parse; passed to the parser.
        write_queue: Queue the parser puts its batches on; passed to the parser.

    Returns:
        The already started ``Process``.

    Raises:
        ValueError: If ``file_type`` is not one of the supported values.
    """
    # Map each file type to its parser. The names are looked up only in the
    # matching branch, so a parser that is not defined yet does not break
    # the other file types.
    if file_type == 'labels':
        target = process_labels_files
    elif file_type == 'masters':
        target = process_masters_files
    elif file_type == 'releases':
        target = process_releases_files
    else:
        # Previously this case fell through to process.start() with `process`
        # unbound, which raised an unhelpful UnboundLocalError.
        raise ValueError(f"Unsupported file type: {file_type!r}")

    process = Process(target=target, args=(file_path, write_queue))
    process.start()
    return process


# PROCESS MASTERS FUNCTION

In [22]:
import sqlite3

# Connect to the SQLite database (it will be created if it doesn't exist)
conn = sqlite3.connect(db_name)  # specify the path to your database
try:
    cursor = conn.cursor()

    # Create the masters table if it doesn't exist.
    # id is generated by SQLite; file_year / file_month identify the file a
    # row was read from.
    cursor.execute('''
CREATE TABLE IF NOT EXISTS masters (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    main_release INTEGER,
    year INTEGER,
    file_year INTEGER,
    file_month INTEGER,
    artist_name TEXT,  -- Add a new column for artist name
    title TEXT,
    genre TEXT
)
''')
    conn.commit()

    cursor.close()
finally:
    # Close the connection even if creating the table failed
    conn.close()


In [23]:
import copy

def process_masters_files(file_path, write_queue, batch_size=10000):
    """Parse one gzipped masters XML file and queue its records in batches.

    Every ``<master>`` element becomes one tuple
    ``(main_release, year, file_year, file_month, artist_name, title, genre)``.
    ``artist_name`` is the name of the first ``artists/artist`` child and
    ``genre`` the first ``genres/genre`` child, each ``None`` when absent.
    ``file_year`` / ``file_month`` come from the file name.

    Args:
        file_path: Path of the ``.xml.gz`` file; its name must carry the date
            stamp read by ``extract_year_and_month``.
        write_queue: Queue that receives ``('masters', records)`` tasks.
        batch_size: Number of records to queue at once.
    """
    batch_records = []

    # Extract year and month from the file path
    year, month = extract_year_and_month(file_path)
    print(year, month)
    # The same for every record of the file, so convert once up front
    file_year, file_month = int(year), int(month)

    with gzip.open(file_path, 'rb') as f:
        # Use iterparse to read the file incrementally
        for _, elem in etree.iterparse(f, events=('end',), tag='master'):

            # Extract artist name (first artist only)
            artist_elem = elem.find('artists/artist')
            artist_name = artist_elem.findtext('name') if artist_elem is not None else None

            # Extract genre (first genre only)
            genres_elem = elem.find('genres')
            genre_name = genres_elem.findtext('genre') if genres_elem is not None else None

            batch_records.append((
                elem.findtext('main_release'),
                elem.findtext('year'),
                file_year,
                file_month,
                artist_name,
                elem.findtext('title'),
                genre_name,
            ))

            # Clean-up: free the element that was just processed...
            elem.clear()
            # ...and drop the now-empty earlier siblings still referenced by the parent
            while elem.getprevious() is not None:
                del elem.getparent()[0]

            if len(batch_records) >= batch_size:
                create_batch_insert_task('masters', batch_records, write_queue)
                # Start a fresh list instead of deep-copying and clearing: the
                # queued list is never touched again, and the tuples hold only
                # immutable values, so no copy is needed.
                batch_records = []

    # Queue the last, partial batch as well (nothing to send if it is empty)
    if batch_records:
        create_batch_insert_task('masters', batch_records, write_queue)


# MAIN PROCESS THAT RUNS ALL THE FILES

In [24]:
# Create queue for batch insert SQL tasks and start the single writer process
write_queue = Queue()
writer_process = Process(target=process_write, args=(db_name, write_queue))
writer_process.start()

# Masters files to load. Only the 2020 files are active; un-comment the
# 2021/2022 entries to load those as well.
masters_file_paths = [
  '/mnt/data/public/discogs/2020/discogs_20200101_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200220_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200301_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200401_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200501_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200601_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200703_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200806_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20200901_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20201001_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20201101_masters.xml.gz',
  '/mnt/data/public/discogs/2020/discogs_20201201_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210101_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210201_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210301_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210401_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210501_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210601_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210801_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20210901_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20211001_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20211101_masters.xml.gz',
  # '/mnt/data/public/discogs/2021/discogs_20211201_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220101_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220201_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220301_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220401_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220501_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220601_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220801_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20220901_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20221001_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20221101_masters.xml.gz',
  # '/mnt/data/public/discogs/2022/discogs_20221201_masters.xml.gz'
]

# List to keep track of processes (File Reading)
processes = []

try:
    try:
        # Create and start a new process for each master file
        for file_path in masters_file_paths:
            print(f"Process started with file_path {file_path}")
            process = start_process_file_task('masters', file_path, write_queue)
            processes.append(process)
    finally:
        # Wait for all File Reading processes that were started, even if
        # launching a later one failed, so none of them is still queueing
        # batches when the writer is told to stop.
        for process in processes:
            process.join()
            if process.exitcode != 0:
                print(f"Process {process.name} ended with exit code {process.exitcode}")

    print('All processes done!')
finally:
    # Always put None in the queue to signal the writer process to finish;
    # without it the writer blocks on the queue forever.
    write_queue.put(None)

    # Wait for the SQL process to finish
    writer_process.join()

Process started with file_path /mnt/data/public/discogs/2020/discogs_20200101_masters.xml.gz
Process started with file_path /mnt/data/public/discogs/2020/discogs_20200220_masters.xml.gz
2020 Process started with file_path /mnt/data/public/discogs/2020/discogs_20200301_masters.xml.gz
01Process started with file_path /mnt/data/public/discogs/2020/discogs_20200401_masters.xml.gz

2020Process started with file_path /mnt/data/public/discogs/2020/discogs_20200501_masters.xml.gz
 02Process started with file_path /mnt/data/public/discogs/2020/discogs_20200601_masters.xml.gz

2020Process started with file_path /mnt/data/public/discogs/2020/discogs_20200703_masters.xml.gz
 03
Process started with file_path /mnt/data/public/discogs/2020/discogs_20200806_masters.xml.gz
2020 04
Process started with file_path /mnt/data/public/discogs/2020/discogs_20200901_masters.xml.gz
2020 05
Process started with file_path /mnt/data/public/discogs/2020/discogs_20201001_masters.xml.gz
2020 06
2020 Process started w

In [17]:
import sqlite3

# Connect to the same SQLite database the loader wrote to (db_name is set at
# the top of the notebook)
conn = sqlite3.connect(db_name)
try:
    cursor = conn.cursor()

    # Retrieve and print the row count of the "masters" table
    cursor.execute("SELECT COUNT(*) FROM masters")
    row_count = cursor.fetchone()[0]
    print("Total Rows in 'masters' table:", row_count)

    # Retrieve up to 5 sample rows, restricted to rows with file_month = 1
    cursor.execute("SELECT * FROM masters WHERE file_month = 1 LIMIT 5")

    # Print the column names followed by the sample rows
    print("\nFirst 5 Rows in 'masters' table:")
    columns = [description[0] for description in cursor.description]
    print("Columns:", columns)
    results = cursor.fetchall()
    for row in results:
        print(row)

    cursor.close()
finally:
    # Close the connection even if one of the queries failed
    conn.close()


Total Rows in 'masters' table: 1616539

First 5 Rows in 'masters' table:
Columns: ['id', 'main_release', 'year', 'file_year', 'file_month', 'artist_name', 'title', 'genre']
(740001, 3078209, 2011, 2020, 1, 'Cobra Starship', 'Night Shades', 'Electronic')
(740002, 7713354, 2015, 2020, 1, 'Alessia Cara', 'Know It All', 'Hip Hop')
(740003, 2032514, 1996, 2020, 1, 'The Sacados', 'Laberinto De Canciones', 'Electronic')
(740004, 4819393, 2010, 2020, 1, 'Jason Moran', 'Ten', 'Jazz')
(740005, 8920148, 2009, 2020, 1, 'Kimono (2)', 'Easy Music For Difficult People', 'Rock')


In [None]:
# 195152