# In [4]:
import os
import pdfplumber
import docx
import pandas as pd
import re
import win32com.client as win32
import textract
import subprocess
import time
# Set GEN_PYTHON_WRITE_PATH in this process's environment.
# NOTE(review): presumably meant to control where win32com writes its generated
# wrapper modules -- confirm that pywin32 actually reads this variable. Also note
# that it is assigned only after win32com.client has already been imported above.
os.environ["GEN_PYTHON_WRITE_PATH"] = "C:/Temp/gen_py"

def file_is_ready(filepath):
    """Return True if *filepath* exists and can be opened for writing right now.

    The file is opened in read/write mode and closed again immediately; its
    contents are never modified.

    Args:
        filepath: Path of the file to probe.

    Returns:
        True when the open succeeds, False when it raises OSError (for
        example because the path is missing, is a directory, or access is
        denied).
    """
    try:
        # 'r+b' requires write access just like append mode does, but unlike
        # 'a' it never creates the file, so probing a missing path neither
        # reports it as ready nor leaves an empty file behind.
        with open(filepath, 'r+b'):
            pass
    except OSError:
        return False
    return True

def wait_for_file(filepath, timeout=10, poll_interval=1):
    """Poll *filepath* until it is ready or *timeout* seconds have elapsed.

    Readiness is decided by ``file_is_ready``. The file is always probed at
    least once, so ``timeout=0`` means "check once, do not wait".

    Args:
        filepath: Path of the file to wait for.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between probes; must be positive.

    Returns:
        True as soon as a probe succeeds, False if the deadline passes first.
    """
    # A monotonic clock keeps the deadline correct even if the system
    # wall-clock time is adjusted while we are waiting.
    deadline = time.monotonic() + timeout
    while True:
        if file_is_ready(filepath):
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(poll_interval, remaining))

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string is returned when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # "or ''" guards against a falsy result (e.g. None) from extract_text().
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages with a newline so the last token of one page cannot fuse
    # with the first token of the next (which would corrupt e-mail addresses
    # and phone numbers sitting at a page boundary). Empty pages are skipped so
    # a text-less PDF still produces a falsy "" rather than bare newlines.
    return "\n".join(text for text in page_texts if text)
    
def extract_text_from_docx(docx_path):
    """Read the non-empty paragraphs of a .docx file as newline-joined text.

    The file is first awaited with ``wait_for_file``. If it does not become
    ready, or if reading it fails for any reason, a message is printed and
    None is returned.
    """
    if not wait_for_file(docx_path):
        print(f"File not ready or accessible: {docx_path}")
        return None

    try:
        document = docx.Document(docx_path)
        non_empty_paragraphs = [
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text
        ]
        return '\n'.join(non_empty_paragraphs)
    except Exception as e:
        # Best effort: report the problem and fall through to the None result.
        print(f"Unable to read {docx_path}, error: {e}")
    return None

def convert_doc_to_docx_win32(doc_path, output_dir):
    """Convert a .doc file to .docx by automating Microsoft Word over COM.

    Args:
        doc_path: Path of the source .doc file.
        output_dir: Directory that receives the converted file. The output
            keeps the source's base name with a ``.docx`` extension.

    Returns:
        Absolute path of the newly written .docx file.

    Raises:
        Any exception raised by the Word automation calls propagates to the
        caller; the document and the Word instance are released first.
    """
    # Word runs as a separate process, so hand it absolute paths instead of
    # relying on this script's current working directory.
    source_abs = os.path.abspath(doc_path)
    base_name = os.path.splitext(os.path.basename(doc_path))[0]
    new_file_abs = os.path.abspath(os.path.join(output_dir, base_name + '.docx'))

    word = win32.gencache.EnsureDispatch('Word.Application')
    try:
        doc = word.Documents.Open(source_abs)
        try:
            doc.Activate()
            doc.SaveAs2(new_file_abs, FileFormat=16)  # FileFormat=16 for docx
        finally:
            # Close without saving changes back to the source document.
            doc.Close(False)
    finally:
        # Always shut Word down, even when opening or saving failed, so no
        # Word instance is left running in the background.
        word.Quit()
    return new_file_abs

def extract_email(text):
    """Return every e-mail address found in *text*, in order of appearance.

    Args:
        text: The string to search.

    Returns:
        A list of matched address strings; empty when nothing matches.
    """
    # The top-level-domain class is [A-Za-z]: inside a character class "|" is a
    # literal pipe, not alternation, so "[A-Z|a-z]" wrongly let "|" (and any
    # text after it) become part of the match.
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_regex, text)

def extract_phone_number(text):
    """Collect all 10-digit phone numbers (3-3-4, optional '-' or '.' separators) in *text*."""
    phone_pattern = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b')
    # The pattern has no capture groups, so each match's full text is the number.
    return [match.group(0) for match in phone_pattern.finditer(text)]

def process_files(directory):
    """Extract text, e-mail addresses and phone numbers from the CVs in *directory*.

    Files ending in .pdf, .doc or .docx (case-insensitive) are processed in
    sorted name order; everything else is ignored. .doc files are first
    converted to .docx inside a ``converted`` sub-directory, which is created
    if necessary. A file that cannot be processed is reported and skipped so
    that one bad document does not abort the whole batch.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list with one dict per file that yielded text, with the keys
        "Filename", "Emails", "Phone Numbers" and "Text".
    """
    data = []
    output_dir = os.path.join(directory, 'converted')
    os.makedirs(output_dir, exist_ok=True)

    for filename in sorted(os.listdir(directory)):
        # Word keeps "~$name.docx" owner files next to documents that are open;
        # they carry a document extension but are not readable documents.
        if filename.startswith('~$'):
            continue

        file_path = os.path.join(directory, filename)
        lowered = filename.lower()
        try:
            if lowered.endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif lowered.endswith(".doc"):
                docx_path = convert_doc_to_docx_win32(file_path, output_dir)
                text = extract_text_from_docx(docx_path) if docx_path else None
            elif lowered.endswith(".docx"):
                text = extract_text_from_docx(file_path)
            else:
                continue
        except Exception as e:
            # Batch boundary: report the failing file and carry on with the rest.
            print(f"Failed to process {filename}, error: {e}")
            continue

        if text:
            emails = extract_email(text)
            phones = extract_phone_number(text)
            data.append({"Filename": filename, "Emails": emails, "Phone Numbers": phones, "Text": text})
        else:
            print(f"No text found in {filename}")

    return data

# Control characters that openpyxl refuses to write into a worksheet cell.
_XLSX_ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def _strip_illegal_xlsx_chars(value):
    """Return *value* with XLSX-illegal control characters removed from strings."""
    if isinstance(value, str):
        return _XLSX_ILLEGAL_CHARS.sub('', value)
    return value


def create_excel(data, filename='output.xlsx'):
    """Write *data* to an Excel workbook.

    Args:
        data: Anything accepted by ``pandas.DataFrame`` -- here a list of
            per-file dicts.
        filename: Path of the workbook to write.

    String cells are stripped of control characters first: text extracted from
    PDFs can contain them, and openpyxl raises IllegalCharacterError for such
    cells, which would otherwise make the whole export fail.
    """
    df = pd.DataFrame(data)
    for column in df.columns:
        series = df[column]
        # Only text-bearing columns can hold offending characters.
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            df[column] = series.map(_strip_illegal_xlsx_chars)
    df.to_excel(filename, index=False, engine='openpyxl')

# Directory containing CVs
# NOTE: the statements below run at module top level, i.e. every time this
# cell/module is executed.
directory = r'C:\ost_internshala\Sample2'
# Build one record per supported file (.pdf/.doc/.docx) found in the directory.
data = process_files(directory)
# Write the records to create_excel's default filename, 'output.xlsx'; that is a
# relative path, so the workbook lands in the current working directory.
create_excel(data)
