In [1]:
import pandas as pd
from pprint import pprint 
import json
import openai
import re
import os
import pinecone
import time


import nltk
from nltk.tokenize import word_tokenize


import langchain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import SystemMessage, HumanMessage, AIMessage


from dotenv import load_dotenv, find_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
# Load variables from the discovered .env file so the os.getenv calls below can read them.
load_dotenv(find_dotenv())

True

In [3]:
# Credentials and Pinecone settings pulled from the environment (each is None when unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

In [4]:
# Location of the exported Jira conversations.
file_path = "./Data/jira-conversations2.json"

# Read the whole file as UTF-8 text and decode it from JSON.
with open(file_path, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [30]:
# NOTE: this binds a second name to the same object as `data` (no copy is made),
# so the in-place cleaning applied to json_data below is also visible through `data`.
json_data = data

In [31]:
# Clean-up rules applied in order by clean_text; every match is deleted.
# Order matters: the composite markers that wrap an anonymised e-mail tag
# ("<[[~~email~~-N~~]]" and "|mailto:[~~email~~-N~~]") must run BEFORE the bare
# "[~~email~~-N~~]" rule, otherwise the bare rule eats the tag first, the composite
# rules can never match, and a stray "mailto:" is left behind in the text.
_CLEANUP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'~+\+~+',                                   # "~+~+" style separators
    r'\+\~',                                     # "+~"
    r'----',                                     # horizontal rule
    r'\+\+',                                     # "++"
    r'~accountid:[a-zA-Z0-9]+',                  # Jira account mentions
    r'\{"type".*?\}\]',                          # embedded JSON fragments starting at {"type"
    r'\{adf\}.*?\{adf\}',                        # "{adf} ... {adf}" and the content within
    r'<\[\[~~email~~-?\d+~~\]\]',                # "<[[~~email~~-numbers~~]]" (before the bare tag)
    r'\|mailto:\[~~email~~-?\d+~~\]',            # "|mailto:[~~email~~-numbers~~]" (before the bare tag)
    r'\[~~email~~-?\d+~~\]',                     # bare "[~~email~~-numbers~~]"
    r'http[s]?://\S+',                           # URLs
    r'—-—-—-—',                                  # "—-—-—-—" separator
    r'\w{3}, \d{1,2} \w{3} \d{4}, \d{1,2}:\d{2} [apmAPM]{2}',  # timestamps, e.g. "Mon, 23 Oct 2023, 10:15 AM"
    r"\|\s+\|\s+You don't often get email from",  # mail-client header
    r'\[Powered by Service Management.*?\]',     # "[Powered by...]" footer
    r'\[View request.*?&reserved=0\]',           # "[View request...]" link block
    r'\*\*\*Please reply above this line\*\*\*',  # reply marker
    r'\|',                                       # "|"
    r'_',                                        # "_"
    r'\[mailto:\]',                              # "[mailto:]" left after the e-mail tag is removed
    r'\[|\]',                                    # "[" and "]"
    r'<|>',                                      # "<" and ">"
    r'\*',                                       # "*"
    r'!jira[-a-zA-Z0-9 ()]+!',                   # "!jira...!" image markup
))


def clean_text(text):
    """Strip Jira / e-mail markup noise from a conversation string.

    Whitespace is collapsed first (the timestamp and header rules rely on single
    spaces), every rule in ``_CLEANUP_PATTERNS`` is then applied in order, and
    whitespace is collapsed once more so that the gaps left by removed markup do
    not survive as double spaces or leading/trailing blanks.

    Args:
        text: The raw string to clean. Must support ``str.split``.

    Returns:
        The cleaned string; an empty input yields an empty string.

    Raises:
        AttributeError: If ``text`` has no ``split`` method (e.g. ``None``).
    """
    # Normalise runs of whitespace/newlines to single spaces before matching.
    text = ' '.join(text.split())

    for pattern in _CLEANUP_PATTERNS:
        text = pattern.sub('', text)

    # Removals leave holes behind; collapse them and trim the ends.
    return ' '.join(text.split())

# Scrub every string field of every conversation in place; report anything that is skipped.
for conv in json_data:
    for key, value in conv.items():
        # Only strings can be cleaned; mention other value types and move on.
        if not isinstance(value, str):
            print(f"Skipped cleaning text for key '{key}' as it's not a string.")
            continue

        try:
            conv[key] = clean_text(value)
        except Exception as e:
            # Best effort: keep the original value and keep going.
            print(f"Error cleaning text in '{key}': {e}")

In [32]:
# Flatten each conversation dict into a single string of "'key': 'value'" pairs joined by ", ".
texts = [', '.join(f"'{k}': '{v}'" for k, v in item.items()) for item in json_data]

In [33]:
# NOTE(review): a bare `texts` renders the entire list in the cell output (it includes user
# identifiers); consider showing a small slice such as texts[:3] before sharing the notebook.
texts

["'question000001': 'Hii This is gadipally UID-U6331114. In process of submitting DS160. I need to send address and phone number of point of contact(school official).So can you please send address and phone number of IRIS BRITO(School official to contact uon arrival).Can i also know the first and last name of IRIS BRITO. Thank you.', 'response000002': 'Thank you for reaching out. You are able to add at . Best,'",
 "'question000001': 'Hello, I am with ID - U20309912. I have not received any mail regarding academic integrity course . Can you please send the link for academic integrity course and details regarding it?! Thanks in advance.', 'response000002': 'Hello, provide me with your USF email.', 'question000003': 'Hello, USF mail Id is Thanks in advance', 'response000004': 'Canvas invite has been sent via email.', 'question000005': 'Hello , I didn’t get any canvas invite to my mail. Can you please send it again?! Thanks in advance', 'response000006': 'Invite has been sent again.'",
 "'

In [34]:
class Document:
    """Minimal record pairing a piece of text with its metadata.

    Attributes:
        page_content: The text carried by this document.
        metadata: Arbitrary metadata stored alongside the text.
    """

    def __init__(self, page_content, metadata):
        self.page_content, self.metadata = page_content, metadata

    def __repr__(self):
        # Same layout as before: content wrapped in single quotes, metadata via str().
        return "Document(page_content='{}', metadata={})".format(self.page_content, self.metadata)


# Fixed-width chunking helper (consecutive slices, no overlap)
def split_text(text, chunk_size=1000):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        text: The sequence (normally a string) to slice.
        chunk_size: Maximum length of each slice; the last slice may be shorter.

    Returns:
        A list of slices in their original order; empty when ``text`` is empty.
    """
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]


# Recursive splitter configured for 1000-character chunks with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

# Wrap every flattened conversation in a Document whose metadata carries the same text.
documents_list = [
    Document(page_content=conversation_text, metadata={'text': conversation_text})
    for conversation_text in texts
]

In [35]:
# Connect the Pinecone client.
# The settings cell reads the environment name from PINECONE_ENVIRONMENT, while this call
# only looked at PINECONE_ENV. Accept either name so a .env that defines just one of them
# still works; PINECONE_ENV keeps priority, so setups that already worked are unaffected.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV') or os.getenv('PINECONE_ENVIRONMENT')
)

In [36]:
# Create the index named by PINECONE_INDEX_NAME only when it does not exist yet.
if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        PINECONE_INDEX_NAME,
        dimension=1536,  # presumably the vector size of the embedding model used below — confirm
        metric='cosine'
    )
    # wait for index to finish initialization
    # NOTE(review): this polls once per second with no timeout, so it blocks for as long as
    # the index reports not ready.
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

# Handle to the index named by PINECONE_INDEX_NAME.
index = pinecone.Index(PINECONE_INDEX_NAME)

In [12]:
# Display the statistics Pinecone reports for the index (dimension, fullness, vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00024,
 'namespaces': {'': {'vector_count': 24}},
 'total_vector_count': 24}

In [13]:
# pinecone.delete_index("langchain-index")

In [14]:
# pinecone.create_index("langchain-index", dimension=1536) 

In [37]:
# Index name passed to Pinecone.from_documents in the ingestion loop.
# NOTE(review): this is hard-coded, whereas `index` above is opened with PINECONE_INDEX_NAME;
# if the two names differ, ingestion and the `index` handle point at different indexes — confirm.
index_name = "langchain-index"

In [38]:
# Embedding client handed to Pinecone.from_documents when chunks are ingested.
embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002") #EXPENSIVE - - - USE CAREFULLY

In [39]:
print(f"Total number of Documents is: {len(documents_list)}")

# Embed and upsert every document into the Pinecone index.
# All chunks of one document are sent in a single from_documents call instead of one call
# per chunk: each call sets up the vector store again and issues its own embedding request,
# so batching per document removes most of the round-trips without changing what is stored.
for i, document in enumerate(documents_list):
    try:
        print(f"Processing document {i + 1}")

        # Split the document content using the split_text function
        chunks = split_text(document.page_content)
        chunk_documents = [
            Document(page_content=chunk, metadata={'text': chunk}) for chunk in chunks
        ]

        # An empty document produces no chunks; there is nothing to embed, so skip the call.
        if chunk_documents:
            # `search` keeps the most recently returned vector store for the query cell below.
            search = Pinecone.from_documents(chunk_documents, embeddings, index_name=index_name)

    except Exception as e:
        # Best effort: report the failure and carry on with the next document.
        print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 4412
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document 20
Processing document 21
Processing document 22
Processing document 23
Processing document 24
Processing document 25
Processing document 26
Processing document 27
Processing document 28
Processing document 29
Processing document 30
Processing document 31
Processing document 32
Processing document 33
Processing document 34
Processing document 35
Processing document 36
Processing document 37
Processing document 38
Processing document 39
Processing document 40
Processing document 41
Processing document 42
Processi

Processing document 346
Processing document 347
Processing document 348
Processing document 349
Processing document 350
Processing document 351
Processing document 352
Processing document 353
Processing document 354
Processing document 355
Processing document 356
Processing document 357
Processing document 358
Processing document 359
Processing document 360
Processing document 361
Processing document 362
Processing document 363
Processing document 364
Processing document 365
Processing document 366
Processing document 367
Processing document 368
Processing document 369
Processing document 370
Processing document 371
Processing document 372
Processing document 373
Processing document 374
Processing document 375
Processing document 376
Processing document 377
Processing document 378
Processing document 379
Processing document 380
Processing document 381
Processing document 382
Processing document 383
Processing document 384
Processing document 385
Processing document 386
Processing docum

Processing document 688
Processing document 689
Processing document 690
Processing document 691
Processing document 692
Processing document 693
Processing document 694
Processing document 695
Processing document 696
Processing document 697
Processing document 698
Processing document 699
Processing document 700
Processing document 701
Processing document 702
Processing document 703
Processing document 704
Processing document 705
Processing document 706
Processing document 707
Processing document 708
Processing document 709
Processing document 710
Processing document 711
Processing document 712
Processing document 713
Processing document 714
Processing document 715
Processing document 716
Processing document 717
Processing document 718
Processing document 719
Processing document 720
Processing document 721
Processing document 722
Processing document 723
Processing document 724
Processing document 725
Processing document 726
Processing document 727
Processing document 728
Processing docum

Processing document 1029
Processing document 1030
Processing document 1031
Processing document 1032
Processing document 1033
Processing document 1034
Processing document 1035
Processing document 1036
Processing document 1037
Processing document 1038
Processing document 1039
Processing document 1040
Processing document 1041
Processing document 1042
Processing document 1043
Processing document 1044
Processing document 1045
Processing document 1046
Processing document 1047
Processing document 1048
Processing document 1049
Processing document 1050
Processing document 1051
Processing document 1052
Processing document 1053
Processing document 1054
Processing document 1055
Processing document 1056
Processing document 1057
Processing document 1058
Processing document 1059
Processing document 1060
Processing document 1061
Processing document 1062
Processing document 1063
Processing document 1064
Processing document 1065
Processing document 1066
Processing document 1067
Processing document 1068


Processing document 1357
Processing document 1358
Processing document 1359
Processing document 1360
Processing document 1361
Processing document 1362
Processing document 1363
Processing document 1364
Processing document 1365
Processing document 1366
Processing document 1367
Processing document 1368
Processing document 1369
Processing document 1370
Processing document 1371
Processing document 1372
Processing document 1373
Processing document 1374
Processing document 1375
Processing document 1376
Processing document 1377
Processing document 1378
Processing document 1379
Processing document 1380
Processing document 1381
Processing document 1382
Processing document 1383
Processing document 1384
Processing document 1385
Processing document 1386
Processing document 1387
Processing document 1388
Processing document 1389
Processing document 1390
Processing document 1391
Processing document 1392
Processing document 1393
Processing document 1394
Processing document 1395
Processing document 1396


Processing document 1685
Processing document 1686
Processing document 1687
Processing document 1688
Processing document 1689
Processing document 1690
Processing document 1691
Processing document 1692
Processing document 1693
Processing document 1694
Processing document 1695
Processing document 1696
Processing document 1697
Processing document 1698
Processing document 1699
Processing document 1700
Processing document 1701
Processing document 1702
Processing document 1703
Processing document 1704
Processing document 1705
Processing document 1706
Processing document 1707
Processing document 1708
Processing document 1709
Processing document 1710
Processing document 1711
Processing document 1712
Processing document 1713
Processing document 1714
Processing document 1715
Processing document 1716
Processing document 1717
Processing document 1718
Processing document 1719
Processing document 1720
Processing document 1721
Processing document 1722
Processing document 1723
Processing document 1724


Processing document 2013
Processing document 2014
Processing document 2015
Processing document 2016
Processing document 2017
Processing document 2018
Processing document 2019
Processing document 2020
Processing document 2021
Processing document 2022
Processing document 2023
Processing document 2024
Processing document 2025
Processing document 2026
Processing document 2027
Processing document 2028
Processing document 2029
Processing document 2030
Processing document 2031
Processing document 2032
Processing document 2033
Processing document 2034
Processing document 2035
Processing document 2036
Processing document 2037
Processing document 2038
Processing document 2039
Processing document 2040
Processing document 2041
Processing document 2042
Processing document 2043
Processing document 2044
Processing document 2045
Processing document 2046
Processing document 2047
Processing document 2048
Processing document 2049
Processing document 2050
Processing document 2051
Processing document 2052


Processing document 2341
Processing document 2342
Processing document 2343
Processing document 2344
Processing document 2345
Processing document 2346
Processing document 2347
Processing document 2348
Processing document 2349
Processing document 2350
Processing document 2351
Processing document 2352
Processing document 2353
Processing document 2354
Processing document 2355
Processing document 2356
Processing document 2357
Processing document 2358
Processing document 2359
Processing document 2360
Processing document 2361
Processing document 2362
Processing document 2363
Processing document 2364
Processing document 2365
Processing document 2366
Processing document 2367
Processing document 2368
Processing document 2369
Processing document 2370
Processing document 2371
Processing document 2372
Processing document 2373
Processing document 2374
Processing document 2375
Processing document 2376
Processing document 2377
Processing document 2378
Processing document 2379
Processing document 2380


Processing document 2669
Processing document 2670
Processing document 2671
Processing document 2672
Processing document 2673
Processing document 2674
Processing document 2675
Processing document 2676
Processing document 2677
Processing document 2678
Processing document 2679
Processing document 2680
Processing document 2681
Processing document 2682
Processing document 2683
Processing document 2684
Processing document 2685
Processing document 2686
Processing document 2687
Processing document 2688
Processing document 2689
Processing document 2690
Processing document 2691
Processing document 2692
Processing document 2693
Processing document 2694
Processing document 2695
Processing document 2696
Processing document 2697
Processing document 2698
Processing document 2699
Processing document 2700
Processing document 2701
Processing document 2702
Processing document 2703
Processing document 2704
Processing document 2705
Processing document 2706
Processing document 2707
Processing document 2708


Processing document 2997
Processing document 2998
Processing document 2999
Processing document 3000
Processing document 3001
Processing document 3002
Processing document 3003
Processing document 3004
Processing document 3005
Processing document 3006
Processing document 3007
Processing document 3008
Processing document 3009
Processing document 3010
Processing document 3011
Processing document 3012
Processing document 3013
Processing document 3014
Processing document 3015
Processing document 3016
Processing document 3017
Processing document 3018
Processing document 3019
Processing document 3020
Processing document 3021
Processing document 3022
Processing document 3023
Processing document 3024
Processing document 3025
Processing document 3026
Processing document 3027
Processing document 3028
Processing document 3029
Processing document 3030
Processing document 3031
Processing document 3032
Processing document 3033
Processing document 3034
Processing document 3035
Processing document 3036


Processing document 3325
Processing document 3326
Processing document 3327
Processing document 3328
Processing document 3329
Processing document 3330
Processing document 3331
Processing document 3332
Processing document 3333
Processing document 3334
Processing document 3335
Processing document 3336
Processing document 3337
Processing document 3338
Processing document 3339
Processing document 3340
Processing document 3341
Processing document 3342
Processing document 3343
Processing document 3344
Processing document 3345
Processing document 3346
Processing document 3347
Processing document 3348
Processing document 3349
Processing document 3350
Processing document 3351
Processing document 3352
Processing document 3353
Processing document 3354
Processing document 3355
Processing document 3356
Processing document 3357
Processing document 3358
Processing document 3359
Processing document 3360
Processing document 3361
Processing document 3362
Processing document 3363
Processing document 3364


Processing document 3653
Processing document 3654
Processing document 3655
Processing document 3656
Processing document 3657
Processing document 3658
Processing document 3659
Processing document 3660
Processing document 3661
Processing document 3662
Processing document 3663
Processing document 3664
Processing document 3665
Processing document 3666
Processing document 3667
Processing document 3668
Processing document 3669
Processing document 3670
Processing document 3671
Processing document 3672
Processing document 3673
Processing document 3674
Processing document 3675
Processing document 3676
Processing document 3677
Processing document 3678
Processing document 3679
Processing document 3680
Processing document 3681
Processing document 3682
Processing document 3683
Processing document 3684
Processing document 3685
Processing document 3686
Processing document 3687
Processing document 3688
Processing document 3689
Processing document 3690
Processing document 3691
Processing document 3692


Processing document 3981
Processing document 3982
Processing document 3983
Processing document 3984
Processing document 3985
Processing document 3986
Processing document 3987
Processing document 3988
Processing document 3989
Processing document 3990
Processing document 3991
Processing document 3992
Processing document 3993
Processing document 3994
Processing document 3995
Processing document 3996
Processing document 3997
Processing document 3998
Processing document 3999
Processing document 4000
Processing document 4001
Processing document 4002
Processing document 4003
Processing document 4004
Processing document 4005
Processing document 4006
Processing document 4007
Processing document 4008
Processing document 4009
Processing document 4010
Processing document 4011
Processing document 4012
Processing document 4013
Processing document 4014
Processing document 4015
Processing document 4016
Processing document 4017
Processing document 4018
Processing document 4019
Processing document 4020


Processing document 4309
Processing document 4310
Processing document 4311
Processing document 4312
Processing document 4313
Processing document 4314
Processing document 4315
Processing document 4316
Processing document 4317
Processing document 4318
Processing document 4319
Processing document 4320
Processing document 4321
Processing document 4322
Processing document 4323
Processing document 4324
Processing document 4325
Processing document 4326
Processing document 4327
Processing document 4328
Processing document 4329
Processing document 4330
Processing document 4331
Processing document 4332
Processing document 4333
Processing document 4334
Processing document 4335
Processing document 4336
Processing document 4337
Processing document 4338
Processing document 4339
Processing document 4340
Processing document 4341
Processing document 4342
Processing document 4343
Processing document 4344
Processing document 4345
Processing document 4346
Processing document 4347
Processing document 4348


In [18]:
# Do a simple vector similarity search
# NOTE: `search` is the object returned by the most recent Pinecone.from_documents call in the
# ingestion loop, so that loop must have run successfully at least once before this cell.
query = "I am having a doubt"

result = search.similarity_search(query)
result

[]

In [20]:
# text_field = "text"  # the metadata field that contains our text

# # Initialize the vector store object
# vectorstore = Pinecone(
#     index, embeddings.embed_query, text_field
# )



In [21]:
# scrapped_pages_dir = "./Data/Scrapped Pages"

In [22]:
# # Traverse the directory and get all .txt files
# txt_files = [f for f in os.listdir(scrapped_pages_dir) if f.endswith('.txt')]

In [23]:
# # Read the contents of each file and store in a list
# documents_list = []
# for file in txt_files:
#     with open(os.path.join(scrapped_pages_dir, file), 'r', encoding='utf-8') as f:
#         content = f.read()
#         documents_list.append(Document(page_content=content, metadata={'text': content}))

In [24]:
# print(f"Total number of Documents is: {len(documents_list)}")

# for i in range(len(documents_list)):
#     try:
#         document = documents_list[i]
#         print(f"Processing document {i + 1}")

#         # Split the document into chunks
#         chunks = split_text(document.page_content)

#         for chunk in chunks:
#             chunk_document = Document(page_content=chunk, metadata={'text': chunk})
#             # Vectorize the chunk
#             search = Pinecone.from_documents([chunk_document], embeddings, index_name=index_name)

#     except Exception as e:
#         print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 374
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document 20
Processing document 21
Processing document 22
Processing document 23
Processing document 24
Processing document 25
Processing document 26
Processing document 27
Processing document 28
Processing document 29
Processing document 30
Processing document 31
Processing document 32
Processing document 33
Processing document 34
Processing document 35
Processing document 36
Processing document 37
Processing document 38
Processing document 39
Processing document 40
Processing document 41
Processing document 42
Processin

Processing document 346
Processing document 347
Processing document 348
Processing document 349
Processing document 350
Processing document 351
Processing document 352
Processing document 353
Processing document 354
Processing document 355
Processing document 356
Processing document 357
Processing document 358
Processing document 359
Processing document 360
Processing document 361
Processing document 362
Processing document 363
Processing document 364
Processing document 365
Processing document 366
Processing document 367
Processing document 368
Processing document 369
Processing document 370
Processing document 371
Processing document 372
Processing document 373
Processing document 374


In [25]:
# # Initialize the vector store object
# vectorstore = Pinecone(
#     index, embeddings.embed_query, text_field
# )

# query = "Michelle Jahn"
# vectorstore.similarity_search(query, k=3)

[Document(page_content="Michelle JahnResearch Administratormmjahn@usf.eduCampus:\xa0TampaRoom:\xa0BSN 3111Phone:\xa0813-974-1512Michelle Jahn is the Muma College of Business research administrator, a role in which\n               she will assist faculty in ferreting out opportunities for funded research, the preparation\n               and submission of competitive proposals and the management of awarded grants.She currently is enrolled in the MBA program with the Muma College of Business and\n               received a bachelor's degree in marketing from USF in 2005. She has been with USF\n               since 2007 and has served as a fiscal and business specialist, a research administrator\n               and administrative specialist with USF Research and Innovation."),
 Document(page_content='ffice Unit Research Administrator:\nMichelle Jahn\nmmjahn@usf.edu\n813-974-1512\nTampa Campus\nBSN 3111\nStaff Spotlight: Michelle Jahn is a one-person team helping faculty reach their research

In [26]:
# text_folder_path = "./Data/Syllabus 2023-selected/Text"

# txt_files = [f for f in os.listdir(text_folder_path) if f.endswith('.txt')]

# # Read the contents of each file and store in a list
# documents_list = []
# for file in txt_files:
#     with open(os.path.join(text_folder_path, file), 'r', encoding='utf-8') as f:
#         content = f.read()
#         documents_list.append(Document(page_content=content, metadata={'text': content}))

In [27]:
# print(f"Total number of Documents is: {len(documents_list)}")

# for i in range(len(documents_list)):
#     try:
#         document = documents_list[i]
#         print(f"Processing document {i + 1}")

#         # Split the document into chunks using the split_text function
#         chunks = split_text(document.page_content)

#         for chunk in chunks:
#             chunk_document = Document(page_content=chunk, metadata={'text': chunk})
#             # Vectorize the chunk using Pinecone
#             search = Pinecone.from_documents([chunk_document], embeddings, index_name=index_name)

#     except Exception as e:
#         print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 282
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Error processing document 8: (500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '150', 'date': 'Sun, 22 Oct 2023 20:43:35 GMT', 'x-envoy-upstream-service-time': '34', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":13,"message":"We were unable to process your request. If the problem persists, please contact us at https://support.pinecone.io","details":[]}

Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document

In [None]:
# vectorstore = Pinecone(
#     index, embeddings.embed_query, text_field
# )

# query = "USF Student Conduct Code"
# sample = vectorstore.similarity_search(query, k=3)
# sample

# Adding New

In [1]:
import pandas as pd
from pprint import pprint 
import json
import openai
import re
import os
import pinecone
import time


import nltk
from nltk.tokenize import word_tokenize


import langchain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import SystemMessage, HumanMessage, AIMessage


from dotenv import load_dotenv, find_dotenv

# Load variables from the discovered .env file so the os.getenv calls below can read them.
load_dotenv(find_dotenv())

  from tqdm.autonotebook import tqdm


True

In [2]:
# Credentials and Pinecone settings pulled from the environment (each is None when unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

In [3]:
# Connect the Pinecone client.
# The settings cell reads the environment name from PINECONE_ENVIRONMENT, while this call
# only looked at PINECONE_ENV. Accept either name so a .env that defines just one of them
# still works; PINECONE_ENV keeps priority, so setups that already worked are unaffected.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV') or os.getenv('PINECONE_ENVIRONMENT')
)

In [4]:
# Create the index named by PINECONE_INDEX_NAME only when it does not exist yet.
if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        PINECONE_INDEX_NAME,
        dimension=1536,  # presumably the vector size of the embedding model used below — confirm
        metric='cosine'
    )
    # wait for index to finish initialization
    # NOTE(review): this polls once per second with no timeout, so it blocks for as long as
    # the index reports not ready.
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

# Handle to the index named by PINECONE_INDEX_NAME.
index = pinecone.Index(PINECONE_INDEX_NAME)

In [5]:
# Index name passed to Pinecone.from_documents in the ingestion loop.
# NOTE(review): this is hard-coded, whereas `index` above is opened with PINECONE_INDEX_NAME;
# if the two names differ, ingestion and the `index` handle point at different indexes — confirm.
index_name = "langchain-index"

# Embedding client used both for ingestion and for the vector store's query embedding.
embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002") #EXPENSIVE - - - USE CAREFULLY

In [6]:
text_field = "text"  # the metadata field that contains our text

# Initialize the vector store object
# It wraps the `index` handle opened above, embeds queries with embeddings.embed_query, and
# is told to look for the stored text under the `text_field` metadata key.
vectorstore = Pinecone(
    index, embeddings.embed_query, text_field
)



In [7]:
class Document:
    """Minimal record pairing a piece of text with its metadata.

    Attributes:
        page_content: The text carried by this document.
        metadata: Arbitrary metadata stored alongside the text.
    """

    def __init__(self, page_content, metadata):
        self.page_content, self.metadata = page_content, metadata

    def __repr__(self):
        # Same layout as before: content wrapped in single quotes, metadata via str().
        return "Document(page_content='{}', metadata={})".format(self.page_content, self.metadata)


# Fixed-width chunking helper (consecutive slices, no overlap)
def split_text(text, chunk_size=1000):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        text: The sequence (normally a string) to slice.
        chunk_size: Maximum length of each slice; the last slice may be shorter.

    Returns:
        A list of slices in their original order; empty when ``text`` is empty.
    """
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

In [8]:
# Folder holding the extra plain-text files to ingest.
text_folder_path = "./Data/Adding"

# Keep only the entries whose name ends in ".txt".
txt_files = [name for name in os.listdir(text_folder_path) if name.endswith('.txt')]

# Load each file as UTF-8 and wrap its text in a Document (metadata carries the same text).
documents_list = []
for file in txt_files:
    with open(os.path.join(text_folder_path, file), mode='r', encoding='utf-8') as f:
        content = f.read()
    documents_list.append(Document(page_content=content, metadata={'text': content}))

In [9]:
# Instantiate RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# documents_list is intentionally NOT rebuilt here. The previous cell already filled it with
# the *contents* of every .txt file; this cell used to overwrite it with
# Document(page_content=<file name>) built from txt_files, so the ingestion loop embedded
# the file names instead of the text inside the files.

In [10]:
print(f"Total number of Documents is: {len(documents_list)}")

# Embed and upsert every document into the Pinecone index.
# All chunks of one document are sent in a single from_documents call instead of one call
# per chunk: each call sets up the vector store again and issues its own embedding request,
# so batching per document removes most of the round-trips without changing what is stored.
for i, document in enumerate(documents_list):
    try:
        print(f"Processing document {i + 1}")

        # Split the document into chunks using the split_text function
        chunks = split_text(document.page_content)
        chunk_documents = [
            Document(page_content=chunk, metadata={'text': chunk}) for chunk in chunks
        ]

        # An empty document produces no chunks; there is nothing to embed, so skip the call.
        if chunk_documents:
            # Vectorize the chunks using Pinecone
            search = Pinecone.from_documents(chunk_documents, embeddings, index_name=index_name)

    except Exception as e:
        # Best effort: report the failure and carry on with the next document.
        print(f"Error processing document {i + 1}: {e}")

Total number of Documents is: 14
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
