In [2]:
import warnings
warnings.filterwarnings('ignore')
import unstructured

#DETECT FILE TYPE
from filetype import guess

def detect_document_type(document_path):
    """Classify a document as ``"pdf"``, ``"image"`` or ``"unkown"``.

    The decision is made from the extension reported by ``filetype.guess``
    for ``document_path``, compared case-insensitively.

    Args:
        document_path: The document to inspect; passed unchanged to
            ``filetype.guess``.

    Returns:
        ``"pdf"`` when the guessed extension is ``pdf``, ``"image"`` when it
        is one of jpg/jpeg/png/gif, and ``"unkown"`` otherwise -- including
        when ``guess`` cannot determine a type at all.
    """
    image_types = ('jpg', 'jpeg', 'png', 'gif')

    guess_file = guess(document_path)
    if guess_file is None:
        # guess() yields None for content it does not recognise; without this
        # guard the attribute access below would raise AttributeError instead
        # of reporting the type as unknown.
        return "unkown"

    extension = guess_file.extension.lower()
    if extension == "pdf":
        return "pdf"
    if extension in image_types:
        return "image"

    # The misspelling is kept on purpose: it is the value callers already
    # receive for unsupported files.
    return "unkown"

# EXTRACT DOCUMENTS CONTENT

from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders import UnstructuredFileLoader

def extract_file_content(file_path):
    """Load a PDF or image file and return its text as a single string.

    The file type is determined with ``detect_document_type``; PDFs are read
    with ``UnstructuredFileLoader`` and images with ``UnstructuredImageLoader``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines.

    Raises:
        ValueError: If the file is neither a PDF nor a supported image. The
            original code fell through with no loader bound and failed with
            an ``UnboundLocalError`` on ``loader.load()``.
    """
    file_type = detect_document_type(file_path)

    if file_type == "pdf":
        loader = UnstructuredFileLoader(file_path)
    elif file_type == "image":
        loader = UnstructuredImageLoader(file_path)
    else:
        raise ValueError(
            f"Unsupported document type for {file_path!r}: "
            "expected a PDF or an image (jpg, jpeg, png, gif)."
        )

    document = loader.load()
    return '\n'.join(doc.page_content for doc in document)

#CREATE CHUNKS
from langchain.text_splitter import CharacterTextSplitter

# Shared splitter used to cut extracted text into chunks for embedding.
# Text is split on blank lines ("\n\n") with a target of 1000 characters per
# chunk (measured with len) and 100 characters of overlap between chunks.
# NOTE: the run recorded in this notebook logs chunks of up to 1636
# characters, so chunk_size is not a hard upper bound with this splitter.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)

#CREATE EMBEDDINGS
from langchain.embeddings.openai import OpenAIEmbeddings
import os

# SECURITY: an OpenAI API key used to be hardcoded on this line. Secrets must
# never be stored in a notebook; the key that was committed here should be
# treated as compromised and revoked.
# The key is now taken from the OPENAI_API_KEY environment variable and, when
# that is not set, requested interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding client shared by get_doc_search when building the vector index.
embeddings = OpenAIEmbeddings()

#CREATE VECTOR INDEX
from langchain.vectorstores import FAISS

def get_doc_search(text_splitter):
    """Build a FAISS vector index from a collection of text chunks.

    Args:
        text_splitter: The texts to index. Despite its name, the callers in
            this notebook pass the list of chunk strings returned by
            ``split_text``, not a splitter object; the name also shadows the
            module-level ``text_splitter`` inside this function.

    Returns:
        The object returned by ``FAISS.from_texts`` for those texts, embedded
        with the module-level ``embeddings``.
    """
    chunk_texts = text_splitter
    vector_index = FAISS.from_texts(chunk_texts, embeddings)
    return vector_index

#START CHATTING WITH YOUR DOCUMENT

from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

# Question-answering chain shared by chat_with_files. It is built on an
# OpenAI LLM with the "map_rerank" chain type, and intermediate steps are
# requested because chat_with_files reads results['intermediate_steps'].
chain = load_qa_chain(
    OpenAI(),
    chain_type="map_rerank",
    return_intermediate_steps=True,
)

def chat_with_files(file_path, query):
    """Answer a question about a single document.

    The file is read with ``extract_file_content``, split into chunks with the
    module-level ``text_splitter``, indexed with ``get_doc_search``, and the
    chunks returned by a similarity search for ``query`` are handed to the
    module-level ``chain``. The index is rebuilt on every call.

    Args:
        file_path: Path of the document to query.
        query: The question to ask.

    Returns:
        The first entry of the chain's ``intermediate_steps`` output.
        NOTE(review): with "map_rerank" this is presumably the result for the
        first retrieved chunk rather than the best-scoring answer -- confirm
        this is the intended value.
    """
    text = extract_file_content(file_path)
    chunks = text_splitter.split_text(text)

    # Index the chunks and keep only those most similar to the question.
    index = get_doc_search(chunks)
    matching_docs = index.similarity_search(query)

    chain_inputs = {
        "input_documents": matching_docs,
        "question": query,
    }
    chain_outputs = chain(chain_inputs, return_only_outputs=True)

    return chain_outputs['intermediate_steps'][0]


In [3]:
# Location of the paper to index. The default is the original author's local
# path, which only exists on that machine; set RESEARCH_PAPER1_PATH in the
# environment to run this cell against a copy stored elsewhere.
research_paper1_path = os.environ.get(
    "RESEARCH_PAPER1_PATH",
    "/Users/lukegeel/Desktop/research/Jaime/Davila_Morris_WCCI_2018.pdf",
)

# Extract the text, split it into chunks and build the vector index.
research_paper1_content = extract_file_content(research_paper1_path)
research_paper1_chunks = text_splitter.split_text(research_paper1_content)
doc_search_paper1 = get_doc_search(research_paper1_chunks)

# Print only a preview: dumping the whole paper floods the cell output.
print(research_paper1_content[:1000])

Created a chunk of size 1160, which is longer than the specified 1000
Created a chunk of size 1154, which is longer than the specified 1000
Created a chunk of size 1476, which is longer than the specified 1000
Created a chunk of size 1017, which is longer than the specified 1000
Created a chunk of size 1636, which is longer than the specified 1000
Created a chunk of size 1109, which is longer than the specified 1000


From Orthography to Semantics: a Study of Morphological Processing through Deep Learning Neural Networks

Jaime D´avila School of Cognitive Science Hampshire College Amherst, MA, USA Email: jdavila@hampshire.edu

Joanna Morris School of Cognitive Science Hampshire College Amherst, MA, USA Email: jmorris@hampshire.edu

Abstract—This paper presents results of using deep learning neural network techniques to map words, represented at the levels of letters, into semantic, distributed vector representations. We study and compare algorithms that have been proposed as models for human orthographic and morphological processing, such as the two layers symbolic network (AKA Naive Discrimi- natory Learning network) proposed by Baayen et al [1], and the dual-route approach presented by Grainger and Ziegler [2]. In addition, we study the effect of representing letters as one-hot vectors or via distributed vector representations, much like the natural language processing ﬁeld has done for words. Exp

In [4]:
# Inspect the chunks produced by text_splitter.split_text for paper 1.
# This prints the full list of chunk strings.
print(research_paper1_chunks)

['From Orthography to Semantics: a Study of Morphological Processing through Deep Learning Neural Networks\n\nJaime D´avila School of Cognitive Science Hampshire College Amherst, MA, USA Email: jdavila@hampshire.edu\n\nJoanna Morris School of Cognitive Science Hampshire College Amherst, MA, USA Email: jmorris@hampshire.edu', 'Abstract—This paper presents results of using deep learning neural network techniques to map words, represented at the levels of letters, into semantic, distributed vector representations. We study and compare algorithms that have been proposed as models for human orthographic and morphological processing, such as the two layers symbolic network (AKA Naive Discrimi- natory Learning network) proposed by Baayen et al [1], and the dual-route approach presented by Grainger and Ziegler [2]. In addition, we study the effect of representing letters as one-hot vectors or via distributed vector representations, much like the natural language processing ﬁeld has done for wo

In [5]:
# Sanity check that the index was built: this prints the object's repr
# (a langchain FAISS vector store in the recorded run), not its contents.
print(doc_search_paper1)

<langchain.vectorstores.faiss.FAISS object at 0x7f7b7baa6fa0>


In [None]:
pl