# Test each component for the indexer

In [2]:
from urllib.parse import urlsplit, urljoin
from dotenv import load_dotenv
from collections import deque
from bs4 import BeautifulSoup
import requests
import socket
import boto3
import json
import re
import os
import hashlib
import urllib.robotparser
from urllib.parse import urlparse
import time

# Text processing

In [40]:
#  [DONE] Lower case, remove numbers and diacritics;
#  [DONE] Remove punctuation and whitespaces
#  [DONE] Tokenization
#  [DONE] Remove stop words [2]
#  [] Stemming
#  [] Lemmatization

In [8]:
import codecs

In [13]:
# Choose an example HTML file and get only the text from it
exmaple_file_path = "/Users/kiyoshi/Desktop/Christian Poellabauer (Notre Dame).html"
file = codecs.open(exmaple_file_path, 'r')
document= BeautifulSoup(file.read()).get_text()
print(document)




Christian Poellabauer (Notre Dame)















Christian Poellabauer
Professor
              Computer Science and Engineering
              University of Notre Dame
              323B Cushing Hall
              Notre Dame, IN 46556
              Phone: (574) 631-9131
              Fax: (574) 631-9260
              Email: cpoellab AT nd.edu










 Education

Ph.D., Georgia Institute of Technology, May 2004
Diplom-Ingenieur, University of Technology Vienna, June 1998

 Research

My research interests span areas such as distributed real-time systems, resource
management (e.g., energy management), wireless/mobile networks, 
wireless sensor networks, vehicular networks, smart phone apps, and pervasive
healthcare applications and systems.
I direct the Mobile Computing Lab (M-Lab), where we focus on projects such as mobile applications and services, wireless networks (primarily MANETs, VANETs, mesh networks, etc.), wireless sensor networks, QoS and real-time requirements, energy eff

In [16]:
# Delete the empty lines and tokenizing
import nltk
from nltk.tokenize import word_tokenize

In [23]:
st = ""

docwords = word_tokenize(document)

for line in docwords:
    line = (line.rstrip())
    if line:
        if re.match("^[A-Za-z]*$",line):
            #if (line not in stop and len(line)>1):
            st = st + " " + line

# All lower cases
st_lower = st.lower()
print(st_lower)

 christian poellabauer notre dame christian poellabauer professor computer science and engineering university of notre dame cushing hall notre dame in phone fax email cpoellab at education georgia institute of technology may university of technology vienna june research my research interests span areas such as distributed systems resource management energy management networks wireless sensor networks vehicular networks smart phone apps and pervasive healthcare applications and systems i direct the mobile computing lab where we focus on projects such as mobile applications and services wireless networks primarily manets vanets mesh networks etc wireless sensor networks qos and requirements energy efficiency and healthcare solutions our group research is funded through grants by nsf including a career award office of naval research army research office air force office of scientific research motorola motorola labs and motorola foundation ibm including a ibm innovation award toyota infote

In [26]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords

In [35]:
word_list = st_lower.split(' ')

In [36]:
word_list

['',
 'christian',
 'poellabauer',
 'notre',
 'dame',
 'christian',
 'poellabauer',
 'professor',
 'computer',
 'science',
 'and',
 'engineering',
 'university',
 'of',
 'notre',
 'dame',
 'cushing',
 'hall',
 'notre',
 'dame',
 'in',
 'phone',
 'fax',
 'email',
 'cpoellab',
 'at',
 'education',
 'georgia',
 'institute',
 'of',
 'technology',
 'may',
 'university',
 'of',
 'technology',
 'vienna',
 'june',
 'research',
 'my',
 'research',
 'interests',
 'span',
 'areas',
 'such',
 'as',
 'distributed',
 'systems',
 'resource',
 'management',
 'energy',
 'management',
 'networks',
 'wireless',
 'sensor',
 'networks',
 'vehicular',
 'networks',
 'smart',
 'phone',
 'apps',
 'and',
 'pervasive',
 'healthcare',
 'applications',
 'and',
 'systems',
 'i',
 'direct',
 'the',
 'mobile',
 'computing',
 'lab',
 'where',
 'we',
 'focus',
 'on',
 'projects',
 'such',
 'as',
 'mobile',
 'applications',
 'and',
 'services',
 'wireless',
 'networks',
 'primarily',
 'manets',
 'vanets',
 'mesh',
 'net

In [37]:
filtered_list = [word for word in word_list if word not in stopwords.words('english')]

In [38]:
filtered_list

['',
 'christian',
 'poellabauer',
 'notre',
 'dame',
 'christian',
 'poellabauer',
 'professor',
 'computer',
 'science',
 'engineering',
 'university',
 'notre',
 'dame',
 'cushing',
 'hall',
 'notre',
 'dame',
 'phone',
 'fax',
 'email',
 'cpoellab',
 'education',
 'georgia',
 'institute',
 'technology',
 'may',
 'university',
 'technology',
 'vienna',
 'june',
 'research',
 'research',
 'interests',
 'span',
 'areas',
 'distributed',
 'systems',
 'resource',
 'management',
 'energy',
 'management',
 'networks',
 'wireless',
 'sensor',
 'networks',
 'vehicular',
 'networks',
 'smart',
 'phone',
 'apps',
 'pervasive',
 'healthcare',
 'applications',
 'systems',
 'direct',
 'mobile',
 'computing',
 'lab',
 'focus',
 'projects',
 'mobile',
 'applications',
 'services',
 'wireless',
 'networks',
 'primarily',
 'manets',
 'vanets',
 'mesh',
 'networks',
 'etc',
 'wireless',
 'sensor',
 'networks',
 'qos',
 'requirements',
 'energy',
 'efficiency',
 'healthcare',
 'solutions',
 'group',
 

In [39]:
print(len(word_list))
print(len(filtered_list))

977
742


In [44]:
# Stemming and Lemmatization
from nltk.stem.snowball import SnowballStemmer

stem_list = []
stemmer = SnowballStemmer(language='english')
for word in filtered_list:
    stem_list.append(stemmer.stem(word))

In [46]:
stem_list[:20]

['',
 'christian',
 'poellabau',
 'notr',
 'dame',
 'christian',
 'poellabau',
 'professor',
 'comput',
 'scienc',
 'engin',
 'univers',
 'notr',
 'dame',
 'cush',
 'hall',
 'notr',
 'dame',
 'phone',
 'fax']

In [62]:
from nltk.stem import WordNetLemmatizer

In [63]:
wordnet_lemmatizer = WordNetLemmatizer()

In [65]:
for word in stem_list:
    print (word, wordnet_lemmatizer.lemmatize(word, pos="v"))

 
christian christian
poellabau poellabau
notr notr
dame dame
christian christian
poellabau poellabau
professor professor
comput comput
scienc scienc
engin engin
univers univers
notr notr
dame dame
cush cush
hall hall
notr notr
dame dame
phone phone
fax fax
email email
cpoellab cpoellab
educ educ
georgia georgia
institut institut
technolog technolog
may may
univers univers
technolog technolog
vienna vienna
june june
research research
research research
interest interest
span span
area area
distribut distribut
system system
resourc resourc
manag manag
energi energi
manag manag
network network
wireless wireless
sensor sensor
network network
vehicular vehicular
network network
smart smart
phone phone
app app
pervas pervas
healthcar healthcar
applic applic
system system
direct direct
mobil mobil
comput comput
lab lab
focus focus
project project
mobil mobil
applic applic
servic servic
wireless wireless
network network
primarili primarili
manet manet
vanet vanet
mesh mesh
network network
etc 

In [53]:
# Convert the list back to string
stem_str = ' '.join(stem_list)

In [55]:
stem_str[:100]

' christian poellabau notr dame christian poellabau professor comput scienc engin univers notr dame c'

In [58]:
stem_str_lemma = sp(stem_str)

In [59]:
lemm_list = []

for word in stem_str_lemma:
    lemm_list.append(word.lemma_)

In [61]:
for word in stem_str_lemma:
    print(word.text + '  ===>', word.lemma_)

   ===>  
christian  ===> christian
poellabau  ===> poellabau
notr  ===> notr
dame  ===> dame
christian  ===> christian
poellabau  ===> poellabau
professor  ===> professor
comput  ===> comput
scienc  ===> scienc
engin  ===> engin
univers  ===> univer
notr  ===> notr
dame  ===> dame
cush  ===> cush
hall  ===> hall
notr  ===> notr
dame  ===> dame
phone  ===> phone
fax  ===> fax
email  ===> email
cpoellab  ===> cpoellab
educ  ===> educ
georgia  ===> georgia
institut  ===> institut
technolog  ===> technolog
may  ===> may
univers  ===> univer
technolog  ===> technolog
vienna  ===> vienna
june  ===> june
research  ===> research
research  ===> research
interest  ===> interest
span  ===> span
area  ===> area
distribut  ===> distribut
system  ===> system
resourc  ===> resourc
manag  ===> manag
energi  ===> energi
manag  ===> manag
network  ===> network
wireless  ===> wireless
sensor  ===> sensor
network  ===> network
vehicular  ===> vehicular
network  ===> network
smart  ===> smart
phone  ===> 

In [60]:
lemm_list

[' ',
 'christian',
 'poellabau',
 'notr',
 'dame',
 'christian',
 'poellabau',
 'professor',
 'comput',
 'scienc',
 'engin',
 'univer',
 'notr',
 'dame',
 'cush',
 'hall',
 'notr',
 'dame',
 'phone',
 'fax',
 'email',
 'cpoellab',
 'educ',
 'georgia',
 'institut',
 'technolog',
 'may',
 'univer',
 'technolog',
 'vienna',
 'june',
 'research',
 'research',
 'interest',
 'span',
 'area',
 'distribut',
 'system',
 'resourc',
 'manag',
 'energi',
 'manag',
 'network',
 'wireless',
 'sensor',
 'network',
 'vehicular',
 'network',
 'smart',
 'phone',
 'app',
 'pervas',
 'healthcar',
 'applic',
 'system',
 'direct',
 'mobil',
 'comput',
 'lab',
 'focus',
 'project',
 'mobil',
 'applic',
 'servic',
 'wireless',
 'network',
 'primarili',
 'manet',
 'vanet',
 'mesh',
 'network',
 'etc',
 'wireless',
 'sensor',
 'network',
 'qos',
 'requir',
 'energi',
 'effici',
 'healthcar',
 'solut',
 'group',
 'research',
 'fund',
 'grant',
 'nsf',
 'includ',
 'career',
 'award',
 'offic',
 'naval',
 'resear

# 1. Connect to AWS S3

In [3]:
s3 = boto3.resource('s3')

In [4]:
S3_BUCKET_NAME = "bingo-crawling"

In [5]:
bucket = s3.Bucket(S3_BUCKET_NAME)