**Matthew Ueckermannn**
## PDF Extractor

In [1]:
import fitz
import pandas as pd
from collections import Counter

doc = fitz.open('dissertation.pdf')
text = "".join(page.get_text("text") for page in doc)
words = pd.Series(text.split())
words.value_counts().head(30)

the         2640
.           2323
of          1580
to          1384
and         1174
a            977
in           694
is           612
for          524
that         507
be           431
data         407
The          361
are          349
with         337
as           307
students     259
can          257
this         247
course       233
it           222
or           221
on           210
an           190
their        182
was          176
Data         171
by           171
not          168
I            155
dtype: int64

In [2]:
import camelot
tables = camelot.read_pdf('calendar.pdf')
df = tables[0].df
df # seems to have worked well

Unnamed: 0,0,1,2,3,4
0,Fall,2020 Fall Term,,2021 Fall Term,
1,First Day of Classes,Tue,September 1,Tue,August 31
2,Labor Day - Classes Will Meet,Mon,September 7,,
3,Labor Day - Classes Suspended,,,Mon,September 6
4,Last day to add or drop courses,Tue,September 15,Tue,September 14
...,...,...,...,...,...
70,Summer Session - 10 week - classes begin,Mon,June 7,Mon,June 6
71,Last day to add or drop courses,Wed,June 16,Wed,June 15
72,Last day to change registration or withdraw fr...,Thur,July 15,Thur,July 14
73,Final Exams,Fri,August 13,Fri,August 12


### My example

I am interested in the distribution of words from my VLE report from CHEG345 (Junior lab).

In [3]:
doc = fitz.open('F1-VLE-Final.pdf')
text = "".join(page.get_text("text") for page in doc)
words = pd.Series(text.split())
words.value_counts().head(30)

the            533
of             284
in             233
and            162
for            139
to             127
pressure       109
is              96
that            78
methyl          69
a               65
was             65
this            65
be              64
acetate         60
at              55
mole            53
±               53
as              52
activity        51
infinite        50
can             49
are             49
dilution        47
temperature     46
methanol        46
The             41
by              40
fraction        40
with            39
dtype: int64

I was curious if VLE would make it (vapor liquid equilibrium) but I guess not. The gap betewen methyl and acetate is also interesting as they should be used together as "methyl acetate" as I doubt we ever called methanol methyl alcohol.

## Reddit Image Transcriber

In [4]:
# Set a User Agent to avoid being blocked
import requests
import pprint
import pytesseract
import re
import io
from PIL import Image
from textblob import TextBlob

In [5]:
data = requests.get("https://www.reddit.com/r/comics/.json", headers = {'User-agent': 'your bot 0.1'}).json()

In [6]:
all_text = ""
all_titles = []
all_sentiment = []

for post in data['data']['children']:
    
    title = post['data']['title']
    url = post['data']['url']
    
    # Regex adapted from: https://www.geeksforgeeks.org/how-to-validate-image-file-extension-using-regular-expression/
    regex = 'https://i.redd.it/([^\\s]+(\\.(?i)(jpe?g|png|gif|bmp))$)'
    pattern = re.compile(regex)
    
    if(re.search(pattern,url)):
        response = requests.get(url)
        img = Image.open(io.BytesIO(response.content))
        text = pytesseract.image_to_string(img) #from spot checking, not super accurate but still impressive
        all_text += text
        blob = TextBlob(text)
        sentiment = blob.sentiment.polarity
        
        all_titles.append(title)
        all_sentiment.append(sentiment)

  if sys.path[0] == '':


In [7]:
blob = TextBlob(all_text)
print("Looking at the most frequent words:")
pd.Series(blob.word_counts).sort_values().tail(10) # Not super interesting

Looking at the most frequent words:


that     6
my       6
on       6
and      6
to       8
i        9
you     12
s       12
a       14
the     15
dtype: int64

In [8]:
print("A breakdown of the sentiment of a sample posts on this sub is given by:")
s = pd.Series(data=all_sentiment)
print(s.describe())
print("The average tends to indicate that the sentiment is slightly positive; however, it is likely that most posts are just neutral")

A breakdown of the sentiment of a sample posts on this sub is given by:
count    23.000000
mean      0.118248
std       0.316118
min      -0.431818
25%       0.000000
50%       0.000000
75%       0.227827
max       0.988281
dtype: float64
The average tends to indicate that the sentiment is slightly positive; however, it is likely that most posts are just neutral


## Trying Another Sub

Was going to do r/chemicalengineering, but decided a sub with more pictures with text would be interesting. So lets try r/funnysigns

In [9]:
data = requests.get("https://www.reddit.com/r/funnysigns/.json", headers = {'User-agent': 'your bot 0.1'}).json()

In [10]:
all_text = ""
all_titles = []
all_sentiment = []

for post in data['data']['children']:
    
    title = post['data']['title']
    url = post['data']['url']
    
    # Regex adapted from: https://www.geeksforgeeks.org/how-to-validate-image-file-extension-using-regular-expression/
    regex = 'https://i.redd.it/([^\\s]+(\\.(?i)(jpe?g|png|gif|bmp))$)'
    pattern = re.compile(regex)
    
    if(re.search(pattern,url)):
        response = requests.get(url)
        img = Image.open(io.BytesIO(response.content))
        text = pytesseract.image_to_string(img) #from spot checking, not super accurate but still impressive
        all_text += text
        blob = TextBlob(text)
        sentiment = blob.sentiment.polarity
        
        all_titles.append(title)
        all_sentiment.append(sentiment)

In [11]:
blob = TextBlob(all_text)
print("Looking at the most frequent words:")
pd.Series(blob.word_counts).sort_values().tail(10) # Not super interesting

Looking at the most frequent words:


stay     4
ae       5
‘        5
to       5
ee       5
i        5
you      6
7        7
—        8
a       12
dtype: int64

In [12]:
print("A breakdown of the sentiment of a sample posts on this sub is given by:")
s = pd.Series(data=all_sentiment)
print(s.describe())
print("The average indicates that there is no bias in sentiment; however, most posts are just neutral")

A breakdown of the sentiment of a sample posts on this sub is given by:
count    23.000000
mean      0.006262
std       0.131497
min      -0.400000
25%       0.000000
50%       0.000000
75%       0.000000
max       0.272000
dtype: float64
The average indicates that there is no bias in sentiment; however, most posts are just neutral


## Face Finding

In [14]:
import numpy as np
import cv2

In [15]:
# Traning with the default classifiers
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier('haarcascade_eye.xml')

In [16]:
img = cv2.imread('Ueckermann_headshot.JPG')
smallImg = cv2.resize(img, (750,1000), interpolation=cv2.INTER_AREA) # my go to headshot
gray = cv2.cvtColor(smallImg, cv2.COLOR_BGR2GRAY)

faces = face_cascade.detectMultiScale(gray, 1.3, 5)
for (x,y,w,h) in faces:
    smallImg = cv2.rectangle(smallImg,(x,y),(x+w,y+h),(255,0,0),2)
    roi_gray = gray[y:y+h, x:x+w]
    roi_color = smallImg[y:y+h, x:x+w]
    eyes = eye_cascade.detectMultiScale(roi_gray)
    for (ex,ey,ew,eh) in eyes:
        cv2.rectangle(roi_color,(ex,ey),(ex+ew,ey+eh),(0,255,0),2)

cv2.imshow('img',smallImg)
cv2.waitKey(0)
cv2.destroyAllWindows()

Grabs my nostrails as eyes as well, but at least it identifies the rest correctly...

What about a group of people:

In [17]:
img = cv2.imread('main_street.JPG')
smallImg = cv2.resize(img, (750,1000), interpolation=cv2.INTER_AREA) # my go to headshot
gray = cv2.cvtColor(smallImg, cv2.COLOR_BGR2GRAY)

faces = face_cascade.detectMultiScale(gray, 1.3, 5)
for (x,y,w,h) in faces:
    smallImg = cv2.rectangle(smallImg,(x,y),(x+w,y+h),(255,0,0),2)
    roi_gray = gray[y:y+h, x:x+w]
    roi_color = smallImg[y:y+h, x:x+w]
    eyes = eye_cascade.detectMultiScale(roi_gray)
    for (ex,ey,ew,eh) in eyes:
        cv2.rectangle(roi_color,(ex,ey),(ex+ew,ey+eh),(0,255,0),2)

cv2.imshow('img',smallImg)
cv2.waitKey(0)
cv2.destroyAllWindows()

Misses Nicholas and is pretty bad with the eyes, but hey I guess 3/4 faces isnt horrible...

What if we change the classifiers?

In [18]:
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_alt.xml')
left_eye_cascade = cv2.CascadeClassifier('haarcascade_lefteye_2splits.xml')
right_eye_cascade = cv2.CascadeClassifier('haarcascade_righteye_2splits.xml')

In [19]:
img = cv2.imread('Ueckermann_headshot.JPG')
smallImg = cv2.resize(img, (750,1000), interpolation=cv2.INTER_AREA) # my go to headshot
gray = cv2.cvtColor(smallImg, cv2.COLOR_BGR2GRAY)

faces = face_cascade.detectMultiScale(gray, 1.3, 5)
for (x,y,w,h) in faces:
    smallImg = cv2.rectangle(smallImg,(x,y),(x+w,y+h),(255,0,0),2)
    roi_gray = gray[y:y+h, x:x+w]
    roi_color = smallImg[y:y+h, x:x+w]
    eyes = eye_cascade.detectMultiScale(roi_gray)
    for (ex,ey,ew,eh) in eyes:
        cv2.rectangle(roi_color,(ex,ey),(ex+ew,ey+eh),(0,255,0),2)

cv2.imshow('img',smallImg)
cv2.waitKey(0)
cv2.destroyAllWindows()

Huh,just identifies the other nostrial now... not much better

In [20]:
img = cv2.imread('main_street.JPG')
smallImg = cv2.resize(img, (750,1000), interpolation=cv2.INTER_AREA) # my go to headshot
gray = cv2.cvtColor(smallImg, cv2.COLOR_BGR2GRAY)

faces = face_cascade.detectMultiScale(gray, 1.3, 5)
for (x,y,w,h) in faces:
    smallImg = cv2.rectangle(smallImg,(x,y),(x+w,y+h),(255,0,0),2)
    roi_gray = gray[y:y+h, x:x+w]
    roi_color = smallImg[y:y+h, x:x+w]
    eyes = eye_cascade.detectMultiScale(roi_gray)
    for (ex,ey,ew,eh) in eyes:
        cv2.rectangle(roi_color,(ex,ey),(ex+ew,ey+eh),(0,255,0),2)

cv2.imshow('img',smallImg)
cv2.waitKey(0)
cv2.destroyAllWindows()

Looks like this just idenitifies Chandlers left eye twice, probably need to train for the lighting/angles/zoom.