In [None]:
# Python Text Basics

# --- f-strings -------------------------------------------------------------

name = "Brandonius III, Son of Sir Majesty Brandonius II"

# The expression inside each pair of curly braces is evaluated and its value
# is spliced into the surrounding text.
print(f"my name is {name}")

# A tiny "table": the first tuple is the header row, the rest are the books.
library = [
    ('Author', 'Topic', 'Pages'),
    ('Twain', 'Rafting', 601),
    ('Feynman', 'Physics', 95),
    ('Hamilton', 'Mythology', 144),
]

# Earlier experiment, kept for reference only (it is not executed):
# for book in library:
#     print(f"Author is {book[0]}")

# Print every topic, skipping the header row.
for book in library:
    if book[1] == "Topic":
        continue
    print(f"Topic is {book[1]}")

# Tuple unpacking: each 3-tuple is split into author (index 0), topic
# (index 1) and page (index 2).  The format specs pad each column to a fixed
# width so the rows line up as a table; ".>" right-aligns the page count and
# fills the gap with dots.
for author, topic, page in library:
    print(f"{author:{10}} {topic:{30}} {page:.>{10}}")

In [None]:
from datetime import datetime

# A fixed date keeps the printed result reproducible.
today = datetime(year=2019, month=5, day=28)

# Inside an f-string, everything after the colon is passed to the datetime
# object as a strftime pattern, character for character.  The pattern must
# therefore start directly with "%B": a space after the colon would be copied
# into the output as a stray leading space.
formatted_today = f"{today:%B %d, %Y}"
print(formatted_today)  # May 28, 2019

In [16]:
%%writefile Texts/test.txt
Hello, my name is Brandon and I'm awesome.
It's not bragging if it's a fact.
By the way, everything above is false.

Overwriting Texts/test.txt


In [None]:
# reading and writing files (IO)

# open() defaults to mode 'r' (read-only text).  The `with` block closes the
# file automatically when it ends, even if one of the calls inside raises, so
# the handle is never left open.  The encoding is spelled out so the result
# does not depend on the platform default.
with open("Texts/test.txt", encoding="utf-8") as myfile:
    print(type(myfile))

    # read() returns everything from the current cursor position to the end
    # of the file and leaves the cursor at the end.  A second read() right
    # here would not fail, it would simply return an empty string.
    content = myfile.read()

    myfile.seek(0)  # move the cursor back to the start of the file
    lines = myfile.readlines()  # one list element per line, newlines included

print(lines)
for line in lines:
    # split() breaks the line on whitespace; keep only the first two words.
    print(line.split()[:2])

In [None]:
# Mode 'w+' opens the file for both writing and reading, BUT it truncates the
# file first, so the previous contents are lost.  Using `with` guarantees the
# file is closed (and the write flushed) even if a call inside the block fails.
with open('Texts/test.txt', 'w+', encoding='utf-8') as myfile:
    myfile.write("New text: Brandon is Brandon. But why Brandon?")

    # write() leaves the cursor at the end of the file; rewind before reading.
    myfile.seek(0)
    content = myfile.read()

# Mode 'a+' would append to the existing contents instead of overwriting:
# with open('Texts/test.txt', 'a+', encoding='utf-8') as myfile:
#     myfile.write("\n This is a new line.")

# Re-open read-only to confirm what is now on disk.
with open('Texts/test.txt', 'r', encoding='utf-8') as myfile:
    lines = myfile.readlines()

In [2]:
import PyPDF2

# Copy the first page of the Declaration PDF into a new file.
#
# NOTE(review): PdfFileReader / numPages / getPage / extractText /
# PdfFileWriter / addPage are the legacy PyPDF2 names.  Newer PyPDF2 releases
# deprecate them in favour of PdfReader / len(reader.pages) / reader.pages[i] /
# extract_text / PdfWriter / add_page -- check the installed version before
# re-running this cell.

# PDFs are binary files, so both handles use a binary mode ('rb' / 'wb').
# The `with` blocks close each file even if PyPDF2 raises part-way through.
with open('nlp_course_notes/00-Python-Text-Basics/US_Declaration.pdf', 'rb') as myfile:
    pdf_reader = PyPDF2.PdfFileReader(myfile)  # reader bound to this specific file
    num_pages = pdf_reader.numPages
    page_one = pdf_reader.getPage(0)  # page indices start at 0
    text_one = page_one.extractText()

    pdf_writer = PyPDF2.PdfFileWriter()  # writer is not tied to any file yet
    pdf_writer.addPage(page_one)  # queue the extracted first page for output

    # The write happens while the source file is still open, matching the
    # original ordering (source closed only after the output was written).
    # NOTE(review): the output path has no ".pdf" extension -- confirm that is
    # intended.
    with open('Texts/My_PDF', 'wb') as pdf_output:
        pdf_writer.write(pdf_output)  # serialise the queued pages to the new file

In [6]:
# Extract the text of every page of the Declaration PDF into a list of
# strings, one entry per page.  The `with` block closes the file even if
# PyPDF2 raises while parsing.
with open('nlp_course_notes/00-Python-Text-Basics/US_Declaration.pdf', 'rb') as f:
    pdf_reader = PyPDF2.PdfFileReader(f)  # read this file
    pdf_text = [
        pdf_reader.getPage(p).extractText()
        for p in range(pdf_reader.numPages)
    ]

# The bare `pdf_text` and `len(pdf_text)` expressions that used to sit here
# were dropped: only the last expression of a cell is displayed, so in the
# middle of a cell they were evaluated and silently discarded.

# Print each page followed by blank lines as a visual separator.
for page in pdf_text:
    print(page)
    print("\n")

Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the
political bands which have connected them with another, and to assume among the powers of the
earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle

them, a decent respect to the opinions of mankind requires that they should declare the causes

which impel them to the separation. 
We hold these truths to be self-evident, that all men are created equal, that they are endowed by

their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit
of Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving

their just powers from the consent of the governed,ŠThat whenever any Form of Government
becomes destructive of these ends, it is the Right of the People to alter or to abolish it,

In [2]:
import re

text = "The phone number of an agent is 408-555-1234. Call the phone now!"

# Plain substring test.  Its value is only shown when it is the last
# expression of a cell, so here it is evaluated and discarded.
"phone" in text

pattern = "phone"

# search() scans the text and returns a match object for the first occurrence.
my_match = re.search(pattern, text)

# (start, end) character offsets of that first occurrence.
start_end_idx = (my_match.start(), my_match.end())

# findall() returns every non-overlapping occurrence as a list of strings.
all_matches = re.findall(pattern, text)

# finditer() yields one match object per occurrence; collect and print the
# offsets of each one.
spans = [found.span() for found in re.finditer(pattern, text)]
for span in spans:
    print(span)

(4, 9)
(55, 60)


In [15]:
# The r prefix makes this a raw string literal: backslashes are passed to the
# regex engine unchanged, so \d means "one digit" rather than a string escape.
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'
phone_number = re.search(pattern=pattern, string=text)
phone_number.group(0)  # group 0 is the full matched text

# Same pattern written with quantifiers: \d{3} means "exactly three digits".
pattern = r'\d{3}-\d{3}-\d{4}'
phone_number = re.search(pattern=pattern, string=text)
phone_number.group(0)  # again the full matched text

# Parentheses create numbered capture groups, so each part of the number can
# be pulled out on its own.
pattern = r'(\d{3})-(\d{3})-(\d{4})'
phone_number = re.search(pattern=pattern, string=text)
phone_number[1]  # text captured by the first group

# To search for several possible strings, use alternation (|) or wildcards.
# search() returns a match object for the first alternative it finds while
# scanning left to right.
re.search(r"brandon|bad", "Tennis is brandon and he is not bad")

# . is a wildcard that matches any single character, so ".don" matches "don"
# together with whatever one character comes right before it.
re.findall(r".don", "Blob was a katsudon and brandon was a don")

# $ anchors the pattern to the end of the string: this finds the final
# character only if it is a digit.
re.findall(r"\d$", "Bruh moment 2")

# [^...] is a negated character class: [^\d] matches any character that is
# NOT a digit.  The + quantifier means "one or more in a row", so each result
# is a whole run of consecutive non-digit characters.
re.findall(r"[^\d]+", "Blob has 3 fish that are 34 inches wide and 2 long.")

# Removing sentence-ending punctuation: split the text into runs of
# characters that are not ?, . or ! (commas are left in place).
sent = re.findall(r"[^?.!]+", "You betrayed me? Oh, the horror! Begone.")

# Every fragment after the first still starts with the space that followed
# the punctuation mark, so strip each one before joining; joining the raw
# fragments with ' ' would leave double spaces in the result.
sent_clean = ' '.join(part.strip() for part in sent)

# [\w] is a character class holding a single word character (letter, digit or
# underscore); the + after it is the quantifier that allows one or more of
# them.  The pattern therefore finds hyphenated words.
re.findall(r'[\w]+-[\w]+', 'hello this is brandon-pae find the hyperbola-hello')


['brandon-pae', 'hyperbola-hello']