Workbook for migrating a database from FileMaker to SQLite

In [42]:
import sqlite3, os
import pandas as pd
import re

In [2]:
# Build the absolute path of the SQLite database from the user's home directory.
hdir = os.path.expanduser('~')  # home directory of the current user

# Project folder, written relative to the home directory
dh_path = '/Dropbox/Active_Directories/Digital_Humanities/'

# The surrounding slashes are stripped so that os.path.join treats the folder as a
# relative component appended to hdir rather than as an absolute path.
database_path = os.path.join(
    hdir,
    dh_path.strip('/'),
    'database_eurasia_7.0.db',
)

In [6]:

# Smoke test: confirm the database file is present, then open and close a connection.
if os.path.exists(database_path):
    conn = sqlite3.connect(database_path)
    cursor = conn.cursor()

    try:
        # Placeholder: database operations go here
        pass
    finally:
        # Release the cursor and the connection whether or not the body succeeded
        cursor.close()
        conn.close()
else:
    # Stop before any connection is attempted when the file is missing
    raise FileNotFoundError(f"Database file not found at: {database_path}")

In [29]:
# List every table in the database with its column count and the columns that
# carry foreign-key constraints.
conn = sqlite3.connect(database_path)
cursor = conn.cursor()

try:
    # sqlite_master has one row per schema object; keep only the tables
    cursor.execute("""
        SELECT name 
        FROM sqlite_master 
        WHERE type='table'
        ORDER BY name;
    """)

    # Each fetched row is a 1-tuple: (table name,)
    tables = cursor.fetchall()

    print("Tables in database:")
    for (table_name,) in tables:
        # PRAGMA arguments cannot be bound with "?", so the name has to be embedded
        # in the statement text. Writing it as a double-quoted identifier (with any
        # embedded quote doubled) keeps names that are SQL keywords or contain
        # spaces/punctuation from breaking the statement.
        quoted_name = '"' + table_name.replace('"', '""') + '"'

        # table_info returns one row per column
        cursor.execute(f"PRAGMA table_info({quoted_name});")
        num_columns = len(cursor.fetchall())

        # foreign_key_list returns one row per constrained column; index 3 is the
        # "from" field, i.e. the column in this table that references another table
        cursor.execute(f"PRAGMA foreign_key_list({quoted_name});")
        foreign_keys_info = [fk[3] for fk in cursor.fetchall()]

        print(f"- Table: {table_name}, Number of Columns: {num_columns}, Foreign Keys: {foreign_keys_info}")

finally:
    cursor.close()
    conn.close()

Tables in database:
- Table: bibliography, Number of Columns: 22, Foreign Keys: ['Repository_ID']
- Table: classical_genre, Number of Columns: 7, Foreign Keys: ['Source_ID', 'Knowledge_Form_ID', 'Classical_ID']
- Table: classical_sources, Number of Columns: 20, Foreign Keys: ['Location_ID']
- Table: commentaries, Number of Columns: 5, Foreign Keys: ['Commentating_Work', 'Commentated_Work']
- Table: conquests, Number of Columns: 10, Foreign Keys: ['Defending_Power_ID', 'Conquering_Power_ID', 'Conquered_Territory_ID']
- Table: copies_holdings, Number of Columns: 14, Foreign Keys: ['Scribe_Individual_ID', 'Repository_ID', 'Reference_Source_ID', 'Location_ID', 'Copied_Source_ID', 'Copied_Classical_ID']
- Table: definitions, Number of Columns: 10, Foreign Keys: ['Source_ID', 'Social_Role_ID', 'Lexicon_ID']
- Table: epochs, Number of Columns: 6, Foreign Keys: []
- Table: gazetteer, Number of Columns: 13, Foreign Keys: []
- Table: honorifics, Number of Columns: 4, Foreign Keys: []
- Table: in

In [66]:


# Function to enable regex in SQLite
def regex_search(pattern, string):
    """Return True when ``pattern`` matches anywhere in ``string``.

    Intended to back SQLite's ``REGEXP`` operator: for ``X REGEXP Y`` SQLite
    passes ``Y`` (the pattern) as the first argument and ``X`` (the column
    value) as the second.

    Args:
        pattern: Regular expression handed to ``re.search``.
        string: Value to test. Anything that is not a ``str`` (for example a
            NULL column, which arrives as ``None``) never matches.

    Returns:
        bool: True on a match; False on no match, on a non-string value, or
        when the search raises (the error is printed, not propagated).
    """
    # Non-text values are treated as "no match" instead of being coerced
    if isinstance(string, str):
        try:
            found = re.search(pattern, string)
        except Exception as e:
            # An invalid pattern must not abort the whole SQL query
            print(f"Regex error: {e}")
            return False
        return found is not None
    return False

# Register the regex function with SQLite
def register_regex(conn):
    """Install ``regex_search`` on ``conn`` as the two-argument SQL function REGEXP.

    Args:
        conn: Open ``sqlite3`` connection; the registration applies to this
            connection only, so it must be repeated for every new connection.
    """
    sql_name, arg_count = "REGEXP", 2
    conn.create_function(sql_name, arg_count, regex_search)

def word_search(search_term):
    """Search the lexicon by regular expression and return matches with definitions.

    The pattern is tested against the Term, Translation, Emic_Term,
    Colonial_Term and Transliteration columns of ``lexicon``. Matching entries
    are inner-joined to ``definitions``, so an entry yields one row per
    definition and an entry without any definition is not returned.

    Args:
        search_term: Regular expression (``re.search`` semantics).

    Returns:
        pandas.DataFrame: Columns UID, Term, Translation, Emic_Term,
        Colonial_Term, Transliteration, Definition and Related_Terms (a
        comma-separated string of the terms linked to the entry as children in
        ``related_terms``). If the query fails, the error is printed and
        whatever was built so far is returned - an empty frame with these
        columns when the main query itself failed.

    Side effects:
        Sets the global pandas option ``display.max_colwidth`` to ``None``.
    """
    columns = ['UID', 'Term', 'Translation', 'Emic_Term', 'Colonial_Term', 'Transliteration', 'Definition']

    # Fallback result: without it, a failure in the main query would leave `df`
    # unbound and the final `return df` would raise UnboundLocalError.
    df = pd.DataFrame(columns=columns + ['Related_Terms'])

    # Establish a connection to the database
    conn = sqlite3.connect(database_path)
    register_regex(conn)  # REGEXP is not usable until a function is registered
    cursor = conn.cursor()

    try:
        # Search several lexicon columns and join each hit to its definitions
        query = """
        SELECT l.UID, l.Term, l.Translation, l.Emic_Term, l.Colonial_Term, l.Transliteration, d.Definition
        FROM lexicon l
        JOIN definitions d ON l.UID = d.Lexicon_ID
        WHERE l.Term REGEXP ? OR l.Translation REGEXP ? OR l.Emic_Term REGEXP ? OR l.Colonial_Term REGEXP ? OR l.Transliteration REGEXP ?;
        """

        # The same pattern is bound to each of the five placeholders
        cursor.execute(query, (search_term,) * 5)
        results = cursor.fetchall()

        df = pd.DataFrame(results, columns=columns)

        # Show the full content of long text columns such as Definition
        pd.set_option('display.max_colwidth', None)

        df['Related_Terms'] = ''

        related_query = """
        SELECT l.Term
        FROM related_terms rt
        JOIN lexicon l ON rt.Child_ID = l.UID
        WHERE rt.Parent_ID = ?;
        """

        # A UID appears once per definition, so look each UID up only once
        related_by_uid = {}
        # Row positions in `results` coincide with the frame's default integer index
        for index, record in enumerate(results):
            uid = record[0]
            if uid not in related_by_uid:
                cursor.execute(related_query, (uid,))
                # Skip NULL terms, which str.join cannot concatenate
                related_by_uid[uid] = ', '.join(
                    term for (term,) in cursor.fetchall() if term is not None
                )
            df.at[index, 'Related_Terms'] = related_by_uid[uid]

    except Exception as e:
        print(f"Error during query execution: {e}")
    finally:
        # Close the cursor and connection
        cursor.close()
        conn.close()

    return df

# Example usage
# search_results_df = word_search('your_regex_pattern')
# print(search_results_df)

In [None]:
# word_search ('باج')

# next step: optional second argument that filters based on tags

In [93]:


def location_search(search_term):
    """Search the gazetteer by regular expression and return location attributes.

    The pattern is tested against the Nickname, Location_Name_Arabic,
    Location_Name_Colonial and Location_Name_Latin columns of ``gazetteer``.
    For every matching place, its rows in ``location_attributes`` are returned
    together with those four name columns.

    Args:
        search_term: Regular expression (``re.search`` semantics).

    Returns:
        pandas.DataFrame: All ``location_attributes`` columns followed by the
        four gazetteer name columns. A completely empty frame (no columns) is
        returned when no gazetteer entry matches or when a query fails; in the
        latter case the error is printed.
    """
    # SQLite caps the number of "?" parameters per statement (999 by default in
    # builds older than 3.32.0), so matched UIDs are looked up in batches instead
    # of one unbounded IN (...) list that a broad pattern could overflow.
    max_uids_per_query = 500

    # Establish a connection to the database
    conn = sqlite3.connect(database_path)
    register_regex(conn)  # REGEXP is not usable until a function is registered
    cursor = conn.cursor()

    try:
        # Step 1: find the gazetteer entries whose names match the pattern
        query = """
        SELECT UID, Nickname, Location_Name_Arabic, Location_Name_Colonial, Location_Name_Latin
        FROM gazetteer
        WHERE Nickname REGEXP ? OR Location_Name_Arabic REGEXP ? OR Location_Name_Colonial REGEXP ? OR Location_Name_Latin REGEXP ?;
        """

        # The same pattern is bound to each of the four placeholders
        cursor.execute(query, (search_term,) * 4)
        matching_records = cursor.fetchall()

        # Nothing matched: return an empty DataFrame (the finally block still runs)
        if not matching_records:
            return pd.DataFrame()

        # Only the UID (first selected column) is needed for step 2
        uid_list = [record[0] for record in matching_records]

        # Step 2: fetch the attributes of the matched places, batch by batch
        attributes_query = """
        SELECT la.*, g.Nickname, g.Location_Name_Arabic, g.Location_Name_Colonial, g.Location_Name_Latin
        FROM location_attributes la
        JOIN gazetteer g ON la.Location_ID = g.UID
        WHERE g.UID IN ({});
        """

        results = []
        for start in range(0, len(uid_list), max_uids_per_query):
            batch = uid_list[start:start + max_uids_per_query]
            placeholders = ','.join('?' * len(batch))  # one "?" per UID in this batch
            cursor.execute(attributes_query.format(placeholders), batch)
            results.extend(cursor.fetchall())

        # Every batch runs the same SELECT, so the last description names the columns
        columns = [description[0] for description in cursor.description]
        df = pd.DataFrame(results, columns=columns)

    except Exception as e:
        print(f"Error during query execution: {e}")
        df = pd.DataFrame()  # Return an empty DataFrame on error
    finally:
        # Close the cursor and connection
        cursor.close()
        conn.close()

    return df

# Example usage
# location_results_df = location_search('your_regex_pattern')
# print(location_results_df)

In [94]:
# '.' is a regex wildcard, so one pattern covers spellings such as "shahr" and "shehr"
location_search('sh.hr')

# next steps: output information from location_hierarchies as well

Unnamed: 0,UID,Location_ID,Type,Value,Lattitude,Longitude,Approximate,Start_Date_Greg,Start_Date_Hij,End_Date_Greg,End_Date_Hij,Notes,Source_ID,Page_No,Nickname,Location_Name_Arabic,Location_Name_Colonial,Location_Name_Latin
0,3530,221,name_change,,,,"""",,,,,"""In the Islamic world, the nomadic Chaghatayid elite were known as Moghuls (i.e., Mongols), and Moghulistan originally referred to the steppe region north of the Tianshan (see C.E. Bosworth, 'Mogholistān,' EI2, 7:218). In Kashgari's day the term Moghul was falling out of use, but Moghulistan had become a common designation for the Tarim Basin and its surrounds"" - Muḥammad Ṣadiq Kashghari, In Remembrance of the Saints: The Rise and Fall of an Inner Asian Sufi Dynasty, trans. David Brophy (Columbia University Press, 2021), footnote 4 / p. 233.",978,233.0,,آلتی شهر,,AltishahrYattishahrTarim BasinKashgaria
1,3307,6267,coordinates,,28.966667,50.833333,"""",,,,,,225,,Bushehr,بوشهر‎,,Bushehr
