This notebook builds the RRUFF Raman peak database (`RRUFFRaman_database.db`) used by the matching process in the auto tab.

In [1]:
import heapq
import io
import re
import sqlite3
import zipfile

import requests

def extract_wavelength(filename):
    """Return the wavelength field embedded in a spectrum file name.

    The field is the run of digits and dots found between ``Raman__`` and the
    following ``__``. It is returned as a string; ``None`` is returned when
    the file name does not contain that pattern.
    """
    found = re.search(r'Raman__([\d.]+)__', filename)
    return found.group(1) if found else None

def extract_file_number(filename):
    """Return the trailing numeric id of a ``.txt`` spectrum file name.

    The id is the run of digits between the last ``__`` and the ``.txt``
    suffix. It is returned as a string; ``None`` is returned when the file
    name does not end that way.
    """
    found = re.search(r'__(\d+)\.txt$', filename)
    if found is None:
        return None
    return found.group(1)

def extract_elements(chemical_formula):
    """Return the distinct element-like symbols found in a chemical formula.

    A symbol is an uppercase letter followed by any lowercase letters. Each
    symbol is listed once, in order of first appearance, so a formula that
    mentions the same element several times (for example the H and O of a
    hydrate) does not produce repeated entries.

    Args:
        chemical_formula: Formula text to scan.

    Returns:
        The symbols joined with ", "; an empty string when none are found.
    """
    element_symbols = re.findall(r'[A-Z][a-z]*', chemical_formula)
    # dict.fromkeys drops repeats while preserving first-seen order.
    return ', '.join(dict.fromkeys(element_symbols))

# Schema of the single results table: one row per stored (x, y) point, with
# the fields parsed from the spectrum's file name repeated on every row.
create_table_query = '''
    CREATE TABLE IF NOT EXISTS database_table (
        id INTEGER PRIMARY KEY,
        filename TEXT,
        mineral_name TEXT,
        rruff_id TEXT,
        wavelength TEXT,
        orientation TEXT,
        file_number TEXT,
        elements TEXT,
        x_data REAL,
        y_data REAL
    )
'''

# Open the SQLite file in the working directory (created if it does not
# exist) and make sure the table is present before any rows are inserted.
conn = sqlite3.connect('RRUFFRaman_database.db')
cursor = conn.cursor()
cursor.execute(create_table_query)
conn.commit()

# Location of the zipped RRUFF Raman archives and the archives to download.
# The archive names are listed explicitly, in download order.
_RRUFF_RAMAN_ZIP_BASE = "https://rruff.info/zipped_data_files/raman/"
_RRUFF_RAMAN_ARCHIVES = (
    "excellent_oriented",
    "excellent_unoriented",
    "fair_oriented",
    "fair_unoriented",
    "ignore_unoriented",
    "poor_oriented",
    "poor_unoriented",
    "unrated_oriented",
    "unrated_unoriented",
)
urls = [f"{_RRUFF_RAMAN_ZIP_BASE}{archive}.zip" for archive in _RRUFF_RAMAN_ARCHIVES]

# Number of highest-intensity points stored for each spectrum file.
TOP_POINT_COUNT = 10
# Seconds to wait on the server before abandoning one archive download.
DOWNLOAD_TIMEOUT_SECONDS = 60


def _ideal_chemistry_elements(lines):
    """Return the element list from the first "##IDEAL CHEMISTRY=" line, or ""."""
    for line in lines:
        if line.startswith("##IDEAL CHEMISTRY="):
            # maxsplit=1 keeps the whole formula even if it contains '='.
            return extract_elements(line.split('=', 1)[1])
    return ""


def _strongest_points(lines, limit=TOP_POINT_COUNT):
    """Return up to `limit` (y, x) pairs with the largest y, largest first.

    Lines starting with "##" and blank lines are ignored. Every other line is
    split on ", ": lines with fewer than two fields are skipped silently, and
    lines that cannot be read as exactly two floats are reported and skipped.
    Points with equal y keep the order in which they appear in the file.
    """
    points = []
    for line in lines:
        if line.startswith("##") or line.strip() == "":
            continue
        parts = line.split(', ')
        if len(parts) < 2:
            continue
        try:
            x, y = map(float, parts)
        except ValueError:
            print(f"Skipping line due to ValueError: {line}")
            continue
        points.append((y, x))
    # Select over the whole spectrum. Admitting a point only when it beats
    # the current minimum would drop valid points seen before the list is
    # full, so the true top `limit` is taken from every parsed point.
    return heapq.nlargest(limit, points, key=lambda point: point[0])


# Download each archive, parse every spectrum file in it, and store the
# strongest points of each spectrum together with the fields taken from its
# file name.
# NOTE(review): rows are only ever appended, so running this cell again
# against an existing database file stores every spectrum a second time.
for url in urls:
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as error:
        # Treat a network failure like a bad status: report it and move on.
        print(f"Failed to download data from {url}: {error}")
        continue
    if response.status_code != 200:
        print(f"Failed to download data from {url}")
        continue

    try:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
    except zipfile.BadZipFile:
        print(f"Failed to download data from {url}: response is not a valid zip archive")
        continue

    with archive as zip_ref:
        for filename in zip_ref.namelist():
            try:
                with zip_ref.open(filename) as file:
                    content = file.read().decode('utf-8')  # Try to decode as UTF-8
            except UnicodeDecodeError:
                print(f"Skipping file {filename} due to UnicodeDecodeError.")
                continue  # Skip this file and move to the next one

            name_parts = filename.split('__')
            if len(name_parts) < 3:
                # Indexing [1] and [-3] below needs at least three fields.
                print(f"Skipping file {filename}: unexpected file name layout.")
                continue

            lines = content.split('\n')
            mineral_name = name_parts[0]
            rruff_id = name_parts[1]
            orientation = name_parts[-3]  # Third field from the end of the name
            wavelength = extract_wavelength(filename)
            file_number = extract_file_number(filename)
            elements = _ideal_chemistry_elements(lines)

            # Insert the largest y-values and corresponding x-values into the database
            cursor.executemany(
                "INSERT INTO database_table (filename, mineral_name, rruff_id, wavelength, orientation, file_number, elements, x_data, y_data) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                [
                    (filename, mineral_name, rruff_id, wavelength, orientation, file_number, elements, x_value, y_value)
                    for y_value, x_value in _strongest_points(lines)
                ],
            )

    # Commit once per archive so a later failure keeps the archives already done.
    conn.commit()
    print(f"Data from {url} processed successfully.")

# Report every distinct mineral name now stored in the table, in alphabetical order.
unique_minerals = cursor.execute(
    "SELECT DISTINCT mineral_name FROM database_table ORDER BY mineral_name"
).fetchall()
print("Unique Mineral Names:")
for row in unique_minerals:
    print(row[0])

# All work is done; release the database handle.
conn.close()


Data from https://rruff.info/zipped_data_files/raman/excellent_oriented.zip processed successfully.
Skipping line due to ValueError: 800, -
Data from https://rruff.info/zipped_data_files/raman/excellent_unoriented.zip processed successfully.
Data from https://rruff.info/zipped_data_files/raman/fair_oriented.zip processed successfully.
Data from https://rruff.info/zipped_data_files/raman/fair_unoriented.zip processed successfully.
Data from https://rruff.info/zipped_data_files/raman/ignore_unoriented.zip processed successfully.
Data from https://rruff.info/zipped_data_files/raman/poor_oriented.zip processed successfully.
Data from https://rruff.info/zipped_data_files/raman/poor_unoriented.zip processed successfully.
Skipping file Minium__P001241__Raman__Reference_PDF__40850.pdf due to UnicodeDecodeError.
Skipping file Huntite__P000080__Raman__Reference_PDF__40848.pdf due to UnicodeDecodeError.
Skipping file Heklaite__P001140__Raman__Reference_PDF__40849.pdf due to UnicodeDecodeError.
Sk