In [61]:
import pandas as pd
import re
import string
import copy

# Regex for a single `field:keyword` search term.
#   field   -> one or more of: letters, digits, '.', '@', '_'
#   keyword -> one or more of: letters, digits, '_', '-', space
# Because the keyword class includes a space, a term runs until the first
# character outside that class (e.g. a closing parenthesis).
# Written as a raw string so `\-` reaches the regex engine verbatim instead of
# being an invalid Python string escape.
TERM_EXPRESSION = r'([a-zA-Z0-9.@_]+:[a-zA-Z0-9_\- ]+)'

def trim_strings(raw_string):
    """Normalise a raw field or keyword into clean, space-separated words.

    Every run of non-word characters (anything other than letters, digits and
    underscore -- this includes quotes, punctuation and all whitespace) is
    collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        raw_string: The text to normalise.

    Returns:
        The normalised string, e.g. ``"2018-2021"`` -> ``"2018 2021"`` and
        ``'" foo "'`` -> ``"foo"``.
    """
    # Collapse first, then trim: quotes and whitespace are themselves non-word
    # characters, so after the substitution only plain spaces can remain at
    # the ends, whatever order they appeared in originally.
    collapsed = re.sub(r'\W+', ' ', raw_string)
    return collapsed.strip()


In [62]:

def keyword_search(df, raw_query, return_query_expression=False, case_sensitive=False):
    """Filter ``df`` using a small ``field:keyword`` query language.

    Every substring of ``raw_query`` that matches ``TERM_EXPRESSION`` is
    rewritten into a pandas query fragment; the rest of ``raw_query``
    (parentheses, ``and`` / ``or`` ...) is kept as written. The resulting
    expression is evaluated with ``DataFrame.query``.

    * ``date:<start>-<end>`` keeps rows whose year lies in the inclusive range
      ``[start, end]``. The year is taken from the first four characters of
      the ``date`` column, and the returned frame's ``date`` column holds that
      integer year.
    * ``<field>:<keyword>`` keeps rows where ``field`` contains ``keyword``
      (missing values never match).

    The field and keyword of each term are normalised with ``trim_strings``.

    NOTE: everything in ``raw_query`` outside the recognised terms is handed to
    ``DataFrame.query`` verbatim, so ``raw_query`` should not come from
    untrusted input.

    Args:
        df: DataFrame to search. It is never modified.
        raw_query: Query text, e.g. ``"(abstract:cancer) and (date:2018-2021)"``.
        return_query_expression: Accepted but currently unused; it has no
            effect on the return value.
        case_sensitive: Whether keyword matching is case sensitive.

    Returns:
        A DataFrame with the rows of ``df`` that satisfy the query.

    Raises:
        Exception: If a term names a field that is not a column of ``df``.
        ValueError: If a ``date`` term does not provide two integer years.
    """
    compiled_terms = {}
    # Stays bound to the caller's frame unless a date term forces a converted
    # copy; the caller's `date` column is never overwritten.
    query_df = df

    for term in re.findall(TERM_EXPRESSION, raw_query):
        if term in compiled_terms:
            # The same term appears more than once; it is already compiled.
            continue

        field, keyword = term.split(':', 1)
        field = trim_strings(field)
        keyword = trim_strings(keyword)

        if field not in df.columns:
            raise Exception(f'field "{field}" is not present in dataframe')

        if field == 'date':
            bounds = keyword.split()
            if len(bounds) < 2:
                raise ValueError(
                    f'date term "{term}" must give a start and an end year, '
                    'e.g. date:2018-2021'
                )
            start_date, end_date = int(bounds[0]), int(bounds[1])
            if query_df is df:
                # Convert once, on a new frame. `astype(str)` lets this also
                # accept a column that already holds integer years.
                years = df['date'].astype(str).str.slice(0, 4).astype(int)
                query_df = df.assign(date=years)
            compiled_term = f'date >= {start_date} and date <= {end_date}'
        else:
            compiled_term = f'{field}.str.contains("{keyword}", na=False, case={case_sensitive})'

        compiled_terms[term] = compiled_term

    # Substitute all terms in one pass over the original text, so a term that
    # is a prefix of another term cannot corrupt it (which chained
    # str.replace calls would do).
    compiled_query = re.sub(
        TERM_EXPRESSION,
        lambda match: compiled_terms[match.group(0)],
        raw_query,
    )
    return query_df.query(compiled_query)


In [63]:
# Small demo frame: one row per paper, given as (abstract, date) records.
df = pd.DataFrame(
    [
        ("Prostate cancer", "2020-01-24"),
        ("Brain cancer", "2023-02-03"),
        ("Breast cancer", "2019-03-31"),
    ],
    columns=["abstract", "date"],
)

In [64]:
# Rows whose abstract contains "cancer" (case-insensitive by default) and whose
# year -- the first four characters of `date` -- is between 2018 and 2021 inclusive.
keyword_search(df, "(abstract:cancer) and (date:2018-2021)")

Unnamed: 0,abstract,date
0,Prostate cancer,2020
2,Breast cancer,2019
