# Search documents for a word

Python lets you scan files in several formats — plain text (txt), Word (docx), PDF, and CSV — for a specific word. Handling all of these formats from one script means you can look for keywords, trends, or specific terms across documents that come from different sources. By automating the process with Python scripts, you can save time and ensure consistency. Furthermore, Python integrates with powerful data analysis libraries, enabling you to transform extracted information into meaningful insights that drive informed decisions.


## Search txt files

In [1]:
from bs4 import UnicodeDammit

In [2]:
def search_txt(filename, word):
    """Return True if ``word`` occurs in the text file ``filename``.

    The match is a case-insensitive substring test, performed line by line.
    The file's encoding is guessed from its first 1024 bytes.

    Args:
        filename: Path of the text file to search.
        word: Text to look for; matched regardless of case.

    Returns:
        True as soon as a line contains ``word``, otherwise False.
    """
    # Lines are lowercased before the comparison, so the query has to be
    # lowercased as well; otherwise a query like 'Python' can never match.
    needle = word.lower()

    # detect txt file encoding from a small sample of raw bytes
    with open(filename, 'rb') as file:
        sample = file.read(1024)

    # If no encoding could be guessed this is None and open() falls back to
    # the platform default encoding.
    encoding = UnicodeDammit(sample).original_encoding

    # read and search
    # errors='replace': the guess is based on the first 1024 bytes only, so
    # bytes further into the file may not decode with it. Replacing them
    # keeps the search going instead of raising UnicodeDecodeError.
    with open(filename, encoding=encoding, errors='replace') as file:
        return any(needle in line.lower() for line in file)

In [3]:
# 'python' occurs in the Zen of Python text, so this returns True
search_txt('data/documents/zen_of_python.txt', 'python')

True

In [4]:
# no line of the file contains 'java', so this returns False
search_txt('data/documents/zen_of_python.txt', 'java')

False

## Search CSV files

In [5]:
import csv

In [6]:
def search_csv(filename, word):
    """Return True if ``word`` occurs in any field of the CSV file.

    The match is a case-insensitive substring test against every column of
    every row, the header row included.

    Args:
        filename: Path of the CSV file to search.
        word: Text to look for; matched regardless of case.

    Returns:
        True as soon as a field contains ``word``, otherwise False.
    """
    # Fields are lowercased before the comparison, so the query has to be
    # lowercased as well; otherwise a query like 'Spielberg' never matches.
    needle = word.lower()

    # newline='' hands line-ending handling to the csv module, which is
    # required for quoted fields with embedded newlines to parse correctly.
    with open(filename, newline='') as file:
        for row in csv.reader(file):
            if any(needle in column.lower() for column in row):
                return True

    return False

In [7]:
# at least one field of the CSV contains 'spielberg', so this returns True
search_csv('data/documents/top_films.csv', 'spielberg')

True

In [8]:
# no field of the CSV contains 'emmerich', so this returns False
search_csv('data/documents/top_films.csv', 'emmerich')

False

## Search PDF files

In [None]:
#!pip install PyPDF2

In [10]:
from PyPDF2 import PdfReader

In [11]:
def search_pdf(filename, word):
    """Return True if ``word`` occurs in the extracted text of a PDF.

    Pages are checked one at a time with a case-insensitive substring test.
    Encrypted PDFs are not searched and always yield False.

    Args:
        filename: Path of the PDF file to search.
        word: Text to look for; matched regardless of case.

    Returns:
        True as soon as a page's text contains ``word``; False if no page
        does or if the PDF is encrypted.
    """
    # Page text is lowercased before the comparison, so the query has to be
    # lowercased as well.
    needle = word.lower()

    # The reader pulls data from the open file, so all page access happens
    # inside the ``with`` block.
    with open(filename, 'rb') as file:
        pdf = PdfReader(file)

        # No password is available here, so encrypted files are skipped
        # rather than decrypted.
        if pdf.is_encrypted:
            return False

        for page in pdf.pages:
            # ``or ''`` guards against a page that yields no text at all.
            text = page.extract_text() or ''

            if needle in text.lower():
                return True

        return False

## Search Microsoft Word files

In [None]:
#!pip install python-docx

In [13]:
import docx

In [14]:
def search_docx(filename, word):
    """Return True if ``word`` occurs in a paragraph of a Word document.

    Only the entries of ``doc.paragraphs`` are scanned, each with a
    case-insensitive substring test.

    Args:
        filename: Path of the .docx file to search.
        word: Text to look for; matched regardless of case.

    Returns:
        True as soon as a paragraph contains ``word``, otherwise False.
    """
    # Paragraph text is lowercased before the comparison, so the query has
    # to be lowercased as well.
    needle = word.lower()

    doc = docx.Document(filename)

    return any(needle in paragraph.text.lower() for paragraph in doc.paragraphs)

## Search a whole folder for a word

In [15]:
# Dispatch table: file extension (without the dot) -> function that searches
# a file of that type. Each function takes (filename, word) and returns a bool.
EXTENSIONS = dict(
    txt=search_txt,
    csv=search_csv,
    pdf=search_pdf,
    docx=search_docx,
)

In [16]:
import os

In [17]:
def search(folder, word):
    """Print every supported file under ``folder`` that contains ``word``.

    The folder is walked recursively. A file is searched only when its
    extension is a key of ``EXTENSIONS``; all other files are ignored.

    Args:
        folder: Root directory to walk.
        word: Text to look for, passed unchanged to the per-format
            search function.

    Returns:
        None. One line is printed for each matching file.
    """
    for root, _dirs, files in os.walk(folder):
        for file in files:
            # get file extension
            # splitext only reports a real suffix, so an extensionless file
            # that happens to be named 'txt' is not mistaken for a text
            # file. lower() lets 'REPORT.PDF' match the lowercase keys.
            extension = os.path.splitext(file)[1][1:].lower()

            # A single lookup both tests for support and fetches the handler.
            search_func = EXTENSIONS.get(extension)
            if search_func is None:
                continue

            full_file_path = os.path.join(root, file)

            if search_func(full_file_path, word):
                print(f'>>> Word "{word}" found in {full_file_path}')

In [18]:
# walk data/documents and print each supported file that contains 'python'
search('data/documents', 'python')

>>> Word "python" found in data/documents/zen_of_python.txt
