In [5]:
import os
from pdf2image import convert_from_path
import pytesseract
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from collections import defaultdict

In [6]:
# OCR setup: page images are turned into text with Tesseract through pytesseract.

# Location of the Tesseract executable. The TESSERACT_CMD environment variable,
# when set to a non-empty value, takes precedence so the notebook is not tied to
# the Windows path used as the fallback below.
caminho = os.environ.get("TESSERACT_CMD") or "C:/Program Files/Tesseract-OCR/tesseract.exe"
pytesseract.pytesseract.tesseract_cmd = caminho

def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image and return the result.

    Args:
        image_path: The image to read, forwarded unchanged to
            ``pytesseract.image_to_string``. Despite the name, the caller in
            this notebook (``load_data_from_pdfs``) passes the image objects
            produced by ``convert_from_path`` rather than file paths.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image
        (presumably the recognised text as a string).
    """
    recognised_text = pytesseract.image_to_string(image_path)
    return recognised_text

In [7]:
def load_data_from_pdfs(root_folder, poppler_path=r'C:\Users\Ana Luiza\poppler-24.02.0\Library\bin'):
    """Collect the OCR text of every PDF page under ``root_folder``, grouped by class.

    The directory tree is walked recursively. Each PDF is rendered to images
    with ``convert_from_path`` and every image is passed through
    ``extract_text_from_image``. The class of a PDF is the name of the
    directory that directly contains it.

    Args:
        root_folder: Directory to search for PDF files.
        poppler_path: Value forwarded to ``convert_from_path`` as its
            ``poppler_path`` argument. Defaults to the location this notebook
            was originally written against; override it on other machines.

    Returns:
        A ``defaultdict(list)`` mapping class name to the list of extracted
        texts, one entry per image returned by ``convert_from_path``.
    """
    classes = defaultdict(list)
    for subdir, dirs, files in os.walk(root_folder):
        # os.walk yields entries in an unspecified order; sorting in place makes
        # the traversal (and so the sample order fed to the seeded train/test
        # split) the same on every machine.
        dirs.sort()
        class_name = os.path.basename(subdir)
        for file in sorted(files):
            # Case-insensitive match so files named e.g. "SCAN.PDF" are not skipped.
            if not file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(subdir, file)
            for image in convert_from_path(pdf_path, poppler_path=poppler_path):
                classes[class_name].append(extract_text_from_image(image))
    return classes

In [8]:
# Load the OCR text of every PDF found under the data folder, grouped by class.
# (This cell only loads; the train/test split is done in a later cell.)
root_folder = "data"
data = load_data_from_pdfs(root_folder)

In [9]:
# Flatten the per-class mapping into two parallel lists: X holds the texts and
# y holds, at the same index, the class name each text belongs to.
X = [text for texts in data.values() for text in texts]
y = [label for label, texts in data.items() for _ in texts]

In [16]:
# Split the samples into training and test sets, holding out 40% for testing.
# random_state is fixed so the same split is produced on every run.
# NOTE(review): the split is not stratified, so rare classes may end up unevenly
# represented between train and test — consider passing stratify=y.
# NOTE(review): X holds one entry per PDF page (see load_data_from_pdfs), so pages
# of the same document can land in both sets, which may inflate the test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [20]:
# Extract text features with TF-IDF, capped at 50,000 features (max_features).
# The vectorizer is fitted on the training texts only and then reused, as fitted,
# to transform the test texts.
vectorizer = TfidfVectorizer(max_features=50000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [21]:
# Train an SVM classifier with a linear kernel on the TF-IDF training features.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vectorized, y_train)

In [22]:
# Evaluate the model: predict labels for the held-out test features and print
# the per-class precision / recall / F1 report.
y_pred = svm_model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))

                                               precision    recall  f1-score   support

                                        CCMEI       1.00      1.00      1.00        10
                              Contrato social       0.98      0.98      0.98        48
         Não é documento válido - Cartão CNPJ       1.00      1.00      1.00         4
Não é documento válido - Contrato de trabalho       0.98      1.00      0.99        51
                      Requerimento Empresário       1.00      0.91      0.95        11

                                     accuracy                           0.98       124
                                    macro avg       0.99      0.98      0.98       124
                                 weighted avg       0.98      0.98      0.98       124



In [23]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports
# cell at the top with the others.
import joblib

# Save the trained SVM model to a pickle file.
joblib.dump(svm_model, 'model.pkl')

# Save the fitted TF-IDF vectorizer to a pickle file as well; the model was
# trained on this vectorizer's output, so both files are needed together.
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']