In [51]:
import pathlib
import pandas as pd
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from book_registry import books

# Location of the PDF to parse, relative to the notebook's working directory.
pdf_path = "../data/unstructured/easy_ocr/book-5.pdf"

# load() returns the documents whose metadata ("source", "page") and
# page_content are read in the next cell.
loader = PyPDFLoader(pdf_path)
pages = loader.load()

In [52]:
# One positional Series per loaded page: (source file stem, page number, page text).
# The columns are still unnamed here; they get their names when the frame is built.
new_pages = [
    pd.Series(
        [
            pathlib.Path(page.metadata["source"]).stem,
            page.metadata["page"],
            page.page_content,
        ]
    )
    for page in pages
]

In [53]:
# Stack the per-page rows into a frame and give the columns descriptive names
# in one expression (no in-place mutation).
column_names = {0: "book", 1: "page", 2: "content"}
df = pd.DataFrame(new_pages).rename(columns=column_names)
df

Unnamed: 0,book,page,content
0,book-5,0,\n \nTHE SEMINAR OF JACQUES LACAN \n \n \nBOOK V \n \n \nTHE FORMATIONS OF...
1,book-5,1,6.11.1957(1) 1 \n \nSeminar 1...
2,book-5,2,6.11.1957(1) 2 \n \n \nhttp:/...
3,book-5,3,6.11.1957(1) 3 \n \nLet me re...
4,book-5,4,6.11.1957(1) 4 \n \npoint is ...
...,...,...,...
377,book-5,377,2.7.1958 (28) 377 \n \nhorizon...
378,book-5,378,2.7.1958 (28) 378 \n \nThe Chr...
379,book-5,379,2.7.1958 (28) 379 \n \nIt is o...
380,book-5,380,2.7.1958 (28) 380 \n \nBut I w...


Drop front matter

In [54]:
# Drop the front matter, i.e. the title page stored as page 0.
# The row is selected by its page number instead of by position so that this
# cell is idempotent: re-running it finds nothing left to drop, whereas
# dropping df.index[0] would remove the first page of the text on a second run.
front_matter = df.index[df["page"] == 0]
df = df.drop(index=front_matter)

Get chapters and dates from running head

In [55]:
# The running head looks like "6.11.1957(1)": a dotted date followed by the
# chapter number in parentheses. Each pattern captures one of the two parts;
# the chapter capture still includes its parentheses.
regex_date = r"(\d{1,2}\.\d{1,2}\.\d{2,4})\s*\(\d+\)"
regex_chapter = r"\d{1,2}\.\d{1,2}\.\d{2,4}\s*(\(\d+\))"

# expand=False returns a Series for a single capture group; rows whose text
# has no match are left as NaN.
page_text = df["content"]
df["date"] = page_text.str.extract(regex_date, expand=False)
df["chapter"] = page_text.str.extract(regex_chapter, expand=False)

In [56]:
# Rows where no running head was found inherit the value of the closest
# preceding row that has one.
for column in ("date", "chapter"):
    df[column] = df[column].ffill()

In [57]:
# The chapter was captured together with its parentheses, e.g. "(1)" -> "1".
chapter_labels = df["chapter"]
df["chapter"] = chapter_labels.str.strip("()")

## Content Cleanup

```
removal of noise: hashtags, unicode, html, line breaks, segmentation, emoji, abbreviations, slang, elongated, typos, contractions, punctuation, numbers, lowercasing, stopwords, stemming, lemmatization, pos, negations
```

Removed:

-   header
-   footer
-   line breaks
-   double spaces


In [58]:
# Running head at the top of a page: dotted date, "(chapter)", then a page number.
header = r"\d{1,2}\.\d{1,2}\.\d{2,4}\s*\(\d+\)\s+\d+"
# URL removed as the page footer. It is matched as a literal string, so the
# dots are not regex wildcards.
footer = r"http://www.lacaninireland.com"

# Every pass states its matching mode explicitly: the default of `regex` in
# Series.str.replace differs between pandas versions, and the other passes in
# this cell already set it.
df["content"] = (
    df["content"]
    .str.replace(header, "", regex=True)
    .str.replace(footer, "", regex=False)
    # NOTE(review): newlines are deleted, not replaced by a space, so two words
    # separated only by a line break would be joined - confirm that extracted
    # lines always end in whitespace.
    .str.replace("\n", "", regex=False)
    # Collapse any remaining run of whitespace into a single space.
    .str.replace(r"\s+", " ", regex=True)
)

In [59]:
# FIXME: disabled because it is buggy. Apparently meant to strip the
# "Seminar N : <weekday> <day> <month> <year>" title at the start of a chapter.
# NOTE(review): `.*` is greedy, so the match presumably runs up to the LAST
# " <four digits>" on the page rather than the first - verify before re-enabling.
# pattern_s = r"Seminar .* \d{4}"
# df[df["content"].str.contains(pattern_s)]["content"]
# df["content"] = df["content"].str.replace(pattern_s, "", regex=True)

Dtypes: convert columns to category, numeric and datetime


In [60]:
# Renumber the rows from 0 and discard the old index labels instead of keeping
# them as a column.
df = df.reset_index(drop=True)

In [61]:
# Per-column memory footprint in bytes before the dtype conversions in the
# following cells; deep=True also counts the contents of object columns.
df.memory_usage(deep=True)

Index          128
book         24003
page          3048
content    1511199
date         24506
chapter      22352
dtype: int64

In [62]:
# Column dtypes before conversion.
df.dtypes

book       object
page        int64
content    object
date       object
chapter    object
dtype: object

In [63]:
# Store the book identifier as a categorical instead of as object strings.
df = df.astype({"book": "category"})

In [64]:
# Chapter labels are digit strings at this point; parse them into numbers.
chapter_numbers = pd.to_numeric(df["chapter"])
df["chapter"] = chapter_numbers

In [65]:
# Dates in the running head are written day first (e.g. "6.11.1957" is
# 6 November 1957). format="mixed" infers the format of each value separately.
parsed_dates = pd.to_datetime(
    df["date"],
    format="mixed",
    dayfirst=True,
)
df["date"] = parsed_dates

In [66]:
# Inspect the frame after cleanup and dtype conversion.
df

Unnamed: 0,book,page,content,date,chapter
0,book-5,1,Seminar 1 : Wednesday 6 November 1957 This year we have taken the formation...,1957-11-06,1
1,book-5,2,,1957-11-06,1
2,book-5,3,"Let me remind you then in the first place, since I have given you my three ...",1957-11-06,1
3,book-5,4,point is essentially based on the principle of beginning with complex struc...,1957-11-06,1
4,book-5,5,"promise of heaven"", and ""why do you give up"". The term heaven and some othe...",1957-11-06,1
...,...,...,...,...,...
376,book-5,377,"horizon of the word, and not in its immediacy, we would not have an obsessi...",1958-07-02,28
377,book-5,378,"The Christian logos in so far as he is the incarnate logos , gives a precis...",1958-07-02,28
378,book-5,379,It is one way of reasoning. But on the other hand if one observes the extre...,1958-07-02,28
379,book-5,380,"But I would say that by operating in this way, namely by legitimating when ...",1958-07-02,28


## Saving


In [67]:
# Per-column memory footprint in bytes after the dtype conversions, for
# comparison with the measurement taken before them.
df.memory_usage(deep=True)

Index          128
book           552
page          3048
content    1511199
date          3048
chapter       3048
dtype: int64

In [68]:
# Column dtypes after conversion.
df.dtypes

book             category
page                int64
content            object
date       datetime64[ns]
chapter             int64
dtype: object

In [69]:
# Last look at the frame before it is written to disk.
df

Unnamed: 0,book,page,content,date,chapter
0,book-5,1,Seminar 1 : Wednesday 6 November 1957 This year we have taken the formation...,1957-11-06,1
1,book-5,2,,1957-11-06,1
2,book-5,3,"Let me remind you then in the first place, since I have given you my three ...",1957-11-06,1
3,book-5,4,point is essentially based on the principle of beginning with complex struc...,1957-11-06,1
4,book-5,5,"promise of heaven"", and ""why do you give up"". The term heaven and some othe...",1957-11-06,1
...,...,...,...,...,...
376,book-5,377,"horizon of the word, and not in its immediacy, we would not have an obsessi...",1958-07-02,28
377,book-5,378,"The Christian logos in so far as he is the incarnate logos , gives a precis...",1958-07-02,28
378,book-5,379,It is one way of reasoning. But on the other hand if one observes the extre...,1958-07-02,28
379,book-5,380,"But I would say that by operating in this way, namely by legitimating when ...",1958-07-02,28


There is some noise due to segmentation

-   p 237: "butcher's"


In [71]:
# Write the cleaned per-page frame to a Feather file.
output_path = "../data/preprocessing/book-5_pypdf_pages.feather"
df.to_feather(output_path)