In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
from slugify import slugify
from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/nealcaren/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
client = OpenAI()

def embed_text(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

def embed_texts(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return [_.embedding for _ in response.data]

In [92]:
paragraphs = []

file_path = "/Users/nealcaren/Documents/GitHub/a-century-of-negro-migration/chapters/"
for i in range(1, 10):
    fn = f"chapter-0{i}.md"
    with open(f"{file_path}{fn}", "r") as infile:
        md = infile.read()
        paragraphs_text = md.splitlines()
        paragraphs_text = [line for line in paragraphs_text if line.strip() != '']
        paragraphs_text = [line for line in paragraphs_text if line[:2]!='[^']
        paragraphs_text = [line for line in paragraphs_text if len(line.split())>1]
        
        for idx, paragraph in enumerate(
            paragraphs_text, start=1
        ):  # start=1 to start numbering from 1
            paragraphs.append(
                {"text": paragraph, "chapter": i, "paragraph_number": idx, "filename": fn}
            )

# Convert the list of dictionaries into a DataFrame
df = pd.DataFrame(paragraphs)

In [95]:
df.sample(3)

Unnamed: 0,text,chapter,paragraph_number,filename
230,John M. Langston was taken to Ohio and educate...,6,37,chapter-06.md
191,This progress of the Negroes in the North was ...,5,30,chapter-05.md
58,"| Maine | 1,356 | ...",2,41,chapter-02.md


In [96]:
df['embeddings'] = df['text'].apply(embed_text)

In [5]:
def best_match(df, text_string):
    """
    This function computes the cosine similarity between a given embedding and all the embeddings in the DataFrame.
    It returns the row with the highest similarity score.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the text embeddings
    embedding (list): The embedding vector to match against the DataFrame's embeddings
    
    Returns:
    pd.Series: The row from the DataFrame that best matches the given embedding
    """
    
    embedding = embed_text(text_string)
    # Convert the embeddings list to a 2D numpy array if they are not already
    df['embeddings'] = df['embeddings'].apply(lambda x: np.array(x))
    embeddings_matrix = np.stack(df['embeddings'].values)

    # Normalize the input embedding and embeddings matrix for cosine similarity
    embedding_normalized = embedding / np.linalg.norm(embedding)
    embeddings_matrix_normalized = embeddings_matrix / np.linalg.norm(embeddings_matrix, axis=1)[:, np.newaxis]
    
    # Calculate the cosine similarities
    similarities = cosine_similarity([embedding_normalized], embeddings_matrix_normalized)

    # Get the index of the most similar embedding
    most_similar_index = np.argmax(similarities)
    return most_similar_index

def best_location(df, text_string):
    most_similar_index = best_match(df, text_string)
    # Return the corresponding row from the DataFrame
    sdf = df.iloc[most_similar_index].to_frame().T
    sdf['sentences'] = sdf['text'].apply(sent_tokenize)
    sdf = sdf.explode('sentences')
    sdf['embeddings'] = embed_texts(sdf['text'].values)
    most_similar_index = best_match(sdf, text_string)
    sdf['slug'] = slug
    
    return sdf.iloc[most_similar_index]
    return sdf


In [98]:
idf.sample(3)

Unnamed: 0,text,paragraph_number,entry,text_minus_entry,md_text
61,"Conventions of Negroes, 99-100",62,Conventions of Negroes,99-100,"Conventions of Negroes, [99-100](chapter-5.md#..."
266,"Turner, Bishop H. M., interested in sending Ne...",267,"Turner, Bishop H. M.","interested in sending Negroes to Africa, 157","Turner, Bishop H. M., interested in sending Ne..."
217,"Reconstruction, promoted to an extent by Negro...",218,Reconstruction,promoted to an extent by Negro natives of Nort...,"Reconstruction, promoted to an extent by Negro..."


In [104]:
b = best_location(df[df['chapter'] == 5].copy(), "Conventions of Negroes")

In [106]:
b['text']

'The Negroes in the North had not only shown their ability to rise in the economic world when properly encouraged but had begun to exhibit power of all kinds. There were Negro inventors, a few lawyers, a number of physicians and dentists, many teachers, a score of intelligent preachers, some scholars of note, and even successful blacks in the finer arts. Some of these, with Frederick Douglass as the most influential, were also doing creditable work in journalism with about thirty newspapers which had developed among the Negroes as weapons of defense.[^40]'

In [74]:
index_fn = f'/Users/nealcaren/Library/CloudStorage/Dropbox/gpt-ocr/index.md'
with open(index_fn, 'r') as infile:
    index_md = infile.read()
    

In [75]:

paragraphs = []
paragraphs_text = index_md.splitlines()
paragraphs_text = [line for line in paragraphs_text if line.strip() != '']
paragraphs_text = [line for line in paragraphs_text if line[:2]!='[^']
paragraphs_text = [line for line in paragraphs_text if line[:2]!='# ']
paragraphs_text = [line for line in paragraphs_text if '[' not in line]
paragraphs_text = [line for line in paragraphs_text if '::' not in line]
paragraphs_text = [line for line in paragraphs_text if '<!' not in line]

paragraphs_text = [line for line in paragraphs_text if len(line.split())>1]
        
for idx, paragraph in enumerate(
            paragraphs_text, start=1
        ):  # start=1 to start numbering from 1
            paragraphs.append(
                {"text": paragraph, "paragraph_number": idx, }
            )

In [76]:
paragraphs[0]

{'text': 'Adams, Henry, leader of the exodus to Kansas, 135  ',
 'paragraph_number': 1}

In [77]:
import re

def extract_entry_name(line):
    # Regular expression pattern to match entry names that start with a capital letter and end before a comma followed by a lowercase letter
    pattern = re.compile(r"^[A-Z].+?(?=(,\s[a-z]|,?\s?\d))")

    # Find the match in the line
    match = pattern.search(line)

    # Return the entry name if found, otherwise None
    return match.group(0).strip() if match else 'None'


In [78]:
idf=pd.DataFrame(paragraphs)

In [79]:
idf['entry'] = idf['text'].apply(extract_entry_name)

In [80]:
idf.to_clipboard()

In [81]:
idf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296 entries, 0 to 295
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              296 non-null    object
 1   paragraph_number  296 non-null    int64 
 2   entry             296 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.1+ KB


In [82]:
idf['text_minus_entry'] = idf.apply(lambda row: row['text'][len(row['entry']) + 2:], axis=1)


In [83]:
idf

Unnamed: 0,text,paragraph_number,entry,text_minus_entry
0,"Adams, Henry, leader of the exodus to Kansas, ...",1,"Adams, Henry","leader of the exodus to Kansas, 135"
1,"Akron, friends of fugitives in, 30",2,Akron,"friends of fugitives in, 30"
2,"Alton Telegraph, comment of, 118",3,Alton Telegraph,"comment of, 118"
3,"Anderson, promoter of settling of Negroes in J...",4,Anderson,"promoter of settling of Negroes in Jamaica, 79"
4,"Anti-slavery, leaders of the movement, became ...",5,Anti-slavery,"leaders of the movement, became more helpful t..."
...,...,...,...,...
291,"Wickham, executor of Samuel Gist, settled Gist...",292,Wickham,"executor of Samuel Gist, settled Gist's Negroe..."
292,"Wilberforce University, established at a slave...",293,Wilberforce University,"established at a slave settlement, 27"
293,"Wilcox, Samuel T., a merchant of Cincinnati, 95",294,"Wilcox, Samuel T.","a merchant of Cincinnati, 95"
294,"Yankees, comment of, on Negro labor, 115-116",295,Yankees,"comment of, on Negro labor, 115-116"


In [84]:
def get_chapter_from_page(page_number):
    # Define the ranges of page numbers for each chapter
    chapters = {
        1: range(1, 18),        # Chapter I: Pages 1-17
        2: range(18, 39),       # Chapter II: Pages 18-38
        3: range(39, 61),       # Chapter III: Pages 39-60
        4: range(61, 81),       # Chapter IV: Pages 61-80
        5: range(81, 101),      # Chapter V: Pages 81-100
        6: range(101, 126),     # Chapter VI: Pages 101-125
        7: range(126, 147),     # Chapter VII: Pages 126-146
        8: range(147, 167),     # Chapter VIII: Pages 147-166
        9: range(167, 193)      # Chapter IX: Pages 167-192
        # Note: Bibliography and Index are not counted as chapters
    }

    # Convert the page number to an integer
    page_number = str(page_number).replace('-','–')
    page_number = int(page_number.split('–')[0])

    # Find the chapter that the page number falls into
    for chapter, pages in chapters.items():
        if page_number in pages:
            return chapter

    # If the page number is not in any chapter range, return None
    return None

# Test the function with the provided example
get_chapter_from_page("111-119")


6

In [85]:
def process_string(input_strings):
    process_text = []
    for input_string in input_strings.split('; '):
        # Extract the last part of the string that contains the page number or range
            page_number_or_range = input_string.split(', ')[-1].strip()

            text_without_page_number = input_string.replace(page_number_or_range,'')
            print(page_number_or_range)
            # Get the chapter number using the page number or range
            chapter_number = get_chapter_from_page(page_number_or_range)

            # Slugify the rest of the string
            slugified_text = slugify(text_without_page_number)

            # Format the new Markdown link
            markdown_link = f'[{page_number_or_range}](chapter-{chapter_number}.md#{slugified_text})'

            # Replace the page number or range at the end of the input string with the new Markdown link
            process_text.append(input_string.replace(page_number_or_range,'') + markdown_link)

    return '; '.join(process_text).replace('   ',' ')


In [86]:
process_string('Adams, Henry, leader of the exodus to Kansas, 135')

135


'Adams, Henry, leader of the exodus to Kansas, [135](chapter-7.md#adams-henry-leader-of-the-exodus-to-kansas)'

In [87]:
idf['md_text'] = idf['text'].apply(process_string)

135
30
118
79
35
3
191
31-34
146
120
95
95
79
90
7
7
9
61
24
79
35
57
57
59-60
169
31
78
95
10
61
12
68
30
33-34
25
30
17
102
112
28-29
29
143
49
35
36
13
14
49
124
87
89
124
30
56-58
92-95
88
63
14-16
114
29
68-80
11
79
90
125
4
10
63
66
65-66
69-70
70
61-80
63
148
70-74
30
6
115
188-189
2
49-50
99-100
97
135
49
134
6
63
119
97
154
79-80
44
39
27
30
35
54-55
59-60
96
12
189
36-38
98
164
138-139
87
120
120
110-111
133-134
184-185
155
30
124
30
68-71
188
167-192
172-176
172
175
176
177
179-180
180-181
134-136
183
119
111
119
167-169
137
6
89
111–112
65–66
67–68
79
7
11
12
30
82
41
30
84
2
41
31
26–27
82
124
94
25
90
95–96
103
103
109
139–141
124
27
147
89
102
95
26
24
44
90
85
79–80
1
92
79
124
109
54
50
14
15
172-173
143
53
24
14
15
58
58-59
146
106
156
41
82
78-79
8
9
9-10
1
92
102
90
141
142
142-143
6
20
39
10
63
75-77
128
173-174
79
134
131-132
124
57
22
103
7
120
134
106
25
7
7
35
156
156
102
39
84
2
2
63
93
56
27
27
54
147–166
166
160
161
162
163
163
135
120
134
106
6
148
18
18
10

In [88]:
for _, row in idf.iterrows():
    index_md = index_md.replace(row['text'], row['md_text'])

print(index_md)

# INDEX

Adams, Henry, leader of the exodus to Kansas, [135](chapter-7.md#adams-henry-leader-of-the-exodus-to-kansas)
Akron, friends of fugitives in, [30](chapter-2.md#akron-friends-of-fugitives-in)
Alton Telegraph, comment of, [118](chapter-6.md#alton-telegraph-comment-of)
Anderson, promoter of settling of Negroes in Jamaica, [79](chapter-4.md#anderson-promoter-of-settling-of-negroes-in-jamaica)
Anti-slavery, leaders of the movement, became more helpful to the refugees, 34, [35](chapter-2.md#anti-slavery-leaders-of-the-movement-became-more-helpful-to-the-refugees-34)
Anti-slavery sentiment, of two kinds, [3](chapter-1.md#anti-slavery-sentiment-of-two-kinds)
American Federation of Labor, attitude of, toward Negro labor, [191](chapter-9.md#american-federation-of-labor-attitude-of-toward-negro-labor)
Appalachian highland, settlers of, aided fugitives, [31-34](chapter-2.md#appalachian-highland-settlers-of-aided-fugitives); exodus of Negroes to, [146](chapter-7.md#exodus-of-negroes-to)
Ark

In [90]:
with open('index.md' ,'w') as outfile:
    outfile.write(index_md)

In [91]:
!pwd

/Users/nealcaren/Documents/GitHub/notes/posts/function-calling
