#### Connect to BigQuery

In [1]:
import pandas as pd
import os
# from sql_metadata import Parser
import warnings
warnings.simplefilter("ignore")


In [None]:
from google.cloud import bigquery
# BigQuery client shared by the cells below (table preview, job search, table listing).
# NOTE(review): "project_name" looks like a placeholder — confirm the real project id.
client = bigquery.Client(project="project_name")

#### Read Table `bigquery -> pandas`

In [None]:
# Pull a five-row sample of reports.table into pandas and display it.
query = "SELECT * FROM reports.table limit 5"
dataframe = (
    client
    .query(query)
    .to_dataframe()
)
dataframe

In [4]:
def search_jobs(tables):
    """Fetch the most recent finished job that wrote to each given table.

    Queries ``region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`` through the
    module-level ``client`` and keeps, per destination table, only the row
    with the latest ``creation_time``.

    Args:
        tables: Iterable of ``"dataset.table"`` strings to look up.

    Returns:
        A DataFrame with the columns ``table_name``, ``rank``,
        ``user_email``, ``creation_time`` and ``query``. An empty DataFrame
        (no columns) is returned when ``tables`` is empty or when fetching
        the result fails; in the failure case the error is printed.
    """
    # Render the names as a SQL list of string literals: 'a.b', 'c.d'.
    # NOTE(review): names are interpolated unescaped, so a name containing a
    # single quote would break the statement.
    tables_sql = ", ".join(f"'{table}'" for table in tables)
    if not tables_sql:
        # "IN ()" is not valid SQL, so submitting the job could only fail.
        return pd.DataFrame()

    # rank = 1 selects the newest job per destination table.
    query = f"""
    with jobs as (
    SELECT
        -- job_id, 
        concat(destination_table.dataset_id,'.',destination_table.table_id) table_name,
        row_number() over (partition by concat(destination_table.dataset_id,'.',destination_table.table_id) order by creation_time desc) rank,
        user_email,
        creation_time,
        query
    FROM `region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
    WHERE 
        concat(destination_table.dataset_id,'.',destination_table.table_id) IN ({tables_sql})
        AND state = 'DONE'
    )
    select * from jobs
    where rank = 1
    """
    query_job = client.query(query)
    try:
        df = query_job.result().to_dataframe()
    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # no rows instead of aborting the whole export.
        print(e)
        df = pd.DataFrame()
    return df
    

In [None]:
# Export the SQL behind every element of each dataset into
# queries/<dataset>/: views from their stored definition, tables from the
# most recent finished job that wrote to them.
subsets=['reports','manual']
root='queries'
tables=[]  # every non-view element seen so far, across all datasets
for subset in subsets:
    print('\n'+'-'*50+f'\n{subset}:')
    print('views:')
    v_count=0
    t_count=0
    # Tables of this dataset only. Searching with the cumulative list would
    # re-query and re-write earlier datasets' tables and inflate t_count.
    subset_tables=[]
    folder_path=os.path.join(root,subset)
    # create the folder if it does not exist
    os.makedirs(folder_path, exist_ok=True)
    element_list=list(client.list_tables(f'project_name.{subset}'))
    for element in element_list:
        element_obj = client.get_table(element.reference)
        #----------------------------------------------- IF IT IS A VIEW WITH A QUERY:
        if element_obj.view_query is not None:
            v_count+=1
            # Normalise Windows line endings (repr() of the text shows
            # whether a break is '\r\n' or '\n\n').
            view_query=element_obj.view_query.replace('\r\n','\n')
            view_id=element_obj.table_id
            full_path=os.path.join(folder_path, view_id +'.sql')
            with open(full_path,'w') as f:
                f.write(view_query)
            print(v_count,element_obj.table_id,end=', ')
            if v_count%10==0:
                print('')
        else:
            # not a view: remember it for the job-history lookup below
            subset_tables.append(element.dataset_id + '.' + element.table_id)
    tables.extend(subset_tables)
    #----------------------------------------------- IF IT IS A TABLE BASED ON A SCHEDULED QUERY:
    # Skip the lookup entirely when the dataset has no tables.
    t_df = search_jobs(subset_tables) if subset_tables else pd.DataFrame()

    print('\ntables:')
    for _, r in t_df.iterrows():
        # NULL query text may surface as None or NaN; skip both.
        if pd.notna(r['query']):
            t_count+=1
            table_query=r['query'].replace('\r\n','\n')
            parent,table_id = r['table_name'].split(".")
            # the t_ prefix marks files exported from tables
            table_id = f't_{table_id}'
            full_path=os.path.join(root,parent, table_id +'.sql')
            with open(full_path,'w') as f:
                f.write(table_query)
            print(t_count,table_id,end=', ')
            if t_count%10==0:
                print('')
    print(f'\n{v_count}/{len(element_list)} elements are views with query')
    print(f'{t_count}/{len(element_list)} elements are tables with query')
    




<a id="sql_search"></a>
### SQL Search

In [6]:
import pandas as pd
import os
import glob
import re
# from sql_metadata import Parser

pd.set_option('display.max_colwidth', 120)  # Truncate displayed cell text at 120 characters

In [7]:
def sql_search(word_1, word_2, method='both',approximate=False,path="queries/*/*.sql", with_comments=False):
    """Search SQL files line by line for two words and tabulate the hits.

    Matching is case-insensitive. Each file is searched with a synthetic
    first line ``File Name: <path>`` prepended, so the path itself can match
    and ``line_number`` 1 refers to that header (a file's own first line is
    reported as 2).

    Args:
        word_1: First search term.
        word_2: Second search term.
        method: Which files to keep: ``'both'`` (both words occur somewhere
            in the file), ``'any'`` (at least one occurs), ``'word_1'``,
            ``'word_2'``, or ``'word_1_only'`` (word_1 occurs, word_2 does
            not). For a kept file, every hit of either word is reported.
        approximate: If true, match the terms as substrings; otherwise
            require regex word boundaries (``\\b``) around them.
        path: Glob pattern of files to scan (``**`` is honoured).
        with_comments: If true, text after ``--`` on a line is searched as
            well; by default it is stripped before matching.

    Returns:
        A DataFrame with the columns ``file``, ``line_number``, ``line`` and
        ``match``, one row per (line, matched word).

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    # Validate method
    valid_methods = {'both', 'any', 'word_1', 'word_2','word_1_only'}
    if method not in valid_methods:
        raise ValueError(f"Invalid method. Choose from {valid_methods}.")

    # The patterns do not depend on the file or line, so build them once.
    boundary = '' if approximate else r'\b'
    pattern_1 = re.compile(boundary + re.escape(word_1.lower()) + boundary)
    pattern_2 = re.compile(boundary + re.escape(word_2.lower()) + boundary)
    line_comment = re.compile('--.*')

    sql_files = glob.glob(path,recursive=True) # '../../../**/*.sql' for all the source control
    print(f'searching {len(sql_files)} files')
    matching_files = []

    for sql_file in sql_files:
        with open(sql_file, 'r') as file:
            lines = [f'File Name: {sql_file}'] + file.readlines()
        matches_infile = []
        word_1_found = False
        word_2_found = False

        for line_number, line in enumerate(lines, start=1):
            stripped = line.strip()
            searched = stripped.lower()
            if not with_comments:
                searched = line_comment.sub('', searched)

            hit_1 = pattern_1.search(searched) is not None
            hit_2 = pattern_2.search(searched) is not None
            word_1_found = word_1_found or hit_1
            word_2_found = word_2_found or hit_2

            # A fresh dict per hit: sharing one dict made a line that matched
            # both words appear twice with the second word's label.
            for hit, word in ((hit_1, word_1), (hit_2, word_2)):
                if hit:
                    matches_infile.append({
                        'file': sql_file,
                        'line_number': line_number,
                        'line': stripped,
                        'match': word,
                    })

        # Determine if the file meets the criteria
        keep_file = {
            'both': word_1_found and word_2_found,
            'any': word_1_found or word_2_found,
            'word_1_only': word_1_found and not word_2_found,
            'word_1': word_1_found,
            'word_2': word_2_found,
        }[method]
        if keep_file:
            matching_files.extend(matches_infile)

    results = pd.DataFrame(matching_files, columns=['file','line_number','line','match'])

    if not matching_files:
        print(f'no match!\nfor {method}\n word_1 = {word_1} and word_2 = {word_2}')

    return results

In [None]:
# ==================================
# Search settings
# ==================================

word_1 = "isMigrated"
word_2 = "20245"
# method is one of: 'both', 'any', 'word_1', 'word_2', 'word_1_only'
method = "any"
# glob pattern deciding which files are scanned
path = 'queries/**/*.sql'
# True -> substring matching; False -> strict word-boundary matching
appx = True

# Execute the search with the settings above
results = sql_search(word_1, word_2, method=method, approximate=appx, path=path)

# ==================================
# Ignore list (optional filtering)
# ==================================

ignorelist = [
    r'queries\manual\v_report1.sql',
    r'queries\manual\v_report2.sql',
    r'queries\manual\v_report3.sql',
]

# Optional refinements of the result set -- uncomment as needed:
# results = results.query('file in @ignorelist')                          # keep only files on the ignore list
# results = results[results.line.str.contains(r'\(')]                    # keep lines containing an opening parenthesis
# results = results[~results.line.str.contains('SUBMITTED_TO_CARRIER')]  # drop lines with specific text

# ==================================
# Report & export
# ==================================

matching_files = results['file'].unique()
print(f'{len(matching_files)} matching files')
for matched_file in matching_files:
    print(matched_file)

# Save every hit to disk
results.to_csv('matches.csv', index=False)

# Optional styler callback: emphasise the "File Name" header rows
def highlight_filename(s):
    """Return one CSS string per value in ``s``.

    Values whose text form contains ``'File Name'`` get a bold yellow style;
    every other value gets an empty string (no styling).
    """
    header_css = 'color:#aaaa00;font-weight:bold'
    styles = []
    for value in s:
        is_header = 'File Name' in str(value)
        styles.append(header_css if is_header else '')
    return styles

# Apply styling in notebook (if needed)
# styled_df = results.style.apply(highlight_filename, subset=['line'])


searching 209 files
9 matching files
queries\reports\temp.sql
queries\reports\v_autopay_profile.sql
queries\reports\v_autopay_to_reimbursement.sql
queries\reports\v_booked_deals_platform.sql
queries\reports\v_company_onboarding.sql
queries\reports\v_ee_roster.sql
queries\reports\v_enrollment_app_qa.sql
queries\reports\v_env_rate_validation.sql
queries\reports\v_ye_reporting_contacts.sql
