# Importing necessary libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer # for bag-of-words feature extraction
import email, getpass, imaplib, os # for email scraping
import pandas as pd # for storing retrieved mail records and related information in a dataframe

# Bag-of-Words similarity helper, used to keep only the mails whose subject contains the keywords 'Thank you for applying'

In [2]:
def bow_similarity(subject, keywords='thank you for applying'):
    """Return the percentage of keyword tokens that also occur in ``subject``.

    Both texts are lower-cased and tokenised with ``CountVectorizer`` (whose
    default token pattern ignores single-character tokens). The score is
    ``shared_tokens / keyword_tokens * 100``, so 100.0 means every keyword
    token is present in the subject.

    Parameters
    ----------
    subject : str or None
        Subject line of a mail. ``None`` or an empty subject scores 0.0.
    keywords : str, optional
        Phrase whose words must be found in the subject.

    Returns
    -------
    float
        Similarity in the range [0.0, 100.0].
    """
    # a mail without a Subject header yields None; there is nothing to match
    if not subject:
        return 0.0
    # binary=True records presence/absence only, so a word repeated in the
    # subject cannot inflate the score or stand in for a missing keyword
    bow = CountVectorizer(binary=True).fit_transform([str(subject).lower(), keywords.lower()])
    # row 0 is the subject, row 1 the keyword phrase
    n_keywords = bow[1].sum()
    if n_keywords == 0:
        return 0.0
    # element-wise product of two 0/1 rows counts the tokens present in both
    shared = bow[0].multiply(bow[1]).sum()
    return (float(shared) / float(n_keywords)) * 100

# E-mail scraping ...

In [3]:
# ask for the mailbox credentials; getpass is used for the password so that
# it is not echoed into the notebook output
user = input("Enter your email id --> ")
pwd = getpass.getpass(prompt="Enter your password --> ")

Enter your email id --> navoworld98@gmail.com
Enter your password --> ········


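## Note: To scrape emails from a Gmail account, turn on "allow less secure app access" at https://myaccount.google.com/lesssecureapps (Google has since retired this setting for most accounts; if it is unavailable, enable 2-Step Verification and log in with an App Password instead)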

In [4]:
# open an SSL-secured IMAP session; the host name is provider specific
# (this one is Gmail's IMAP server)
m = imaplib.IMAP4_SSL(host="imap.gmail.com")
# authenticate with the credentials read earlier; the reply from login()
# is the last expression, so it is shown as the cell output
m.login(user, pwd)

('OK', [b'navoworld98@gmail.com authenticated (Success)'])

In [5]:
# selecting the whole Inbox for scraping emails.
# readonly=True opens the mailbox read-only, so fetching messages afterwards
# does not modify it (e.g. unread mails are not flagged as read by the scraper)
m.select('Inbox', readonly=True)

('OK', [b'42'])

In [6]:
# Scan the selected mailbox and collect sender, date and body of every mail
# whose subject fully matches the keywords checked by bow_similarity().
resp, items = m.search(None, 'ALL') # searching mails from all senders
# the search reply is one space-separated bytes string of message ids;
# an empty mailbox yields an empty string, i.e. no ids to visit
items = items[0].split() if items and items[0] else []
from_mail = [] # sender information for every relevant mail is stored in this list
date = [] # datetime info for every relevant mail is stored in this list
body = [] # body of every relevant mail is stored in this list
mail_cnt = 0 # mail counter (keeping a track of the number of relevant emails scraped)
for emailid in items[::-1]: # scraping emails from most recent to least recent
    resp, data = m.fetch(emailid, "(RFC822)")
    for response_part in data:
        # only tuple parts carry a message literal; the rest is protocol framing
        if not isinstance(response_part, tuple):
            continue
        # parse the raw bytes directly: decoding the whole message as UTF-8
        # first raises UnicodeDecodeError for mails in any other charset
        mail = email.message_from_bytes(response_part[1])
        # a mail may have no Subject header (None); treat it as an empty subject
        subject = str(mail['subject'] or '')
        if bow_similarity(subject) != 100.0: # checking whether the scraped mail is relevant or not
            continue
        r, d = m.fetch(emailid, "(UID BODY[TEXT])")
        # the body may contain bytes that are not valid UTF-8; replace them
        # instead of aborting the whole scrape
        body_text = d[0][1].decode("utf-8", errors="replace")
        mail_cnt += 1 # updating the mail counter
        print("Email from scraped with subject-> " + subject) # printing the subject of the relevant mail
        from_mail.append(mail['from']) # storing the sender info of the relevant mail
        date.append(mail['date']) # storing the datetime info of the relevant mail
        body.append(email.message_from_string(body_text)) # storing the body of the relevant mail

# relevant emails with their details, one row per mail
mails = pd.DataFrame({
    'from': from_mail,
    'datetime': date,
    'body': [str(x) for x in body],
})
print(str(mail_cnt) + ' relevant email(s) is/are found and scraped')

Email from scraped with subject-> Thank you for Applying at paypal
1 relevant email(s) is/are found and scraped


In [7]:
mails # displaying the dataframe: one row per relevant mail, with columns 'from', 'datetime' and 'body'

Unnamed: 0,from,datetime,body
0,Navoneel Chakrabarty <nc2012@cse.jgec.ac.in>,"Thu, 15 Oct 2020 03:46:48 +0530",\n--0000000000001c874b05b1a8e4dc\nContent-Type...


In [8]:
# displaying the body of the first scraped mail, one list entry per line;
# an empty list is shown when no relevant mail was found, instead of
# failing on the missing first row
str(mails['body'].iloc[0]).split('\n') if len(mails) else []

['',
 '--0000000000001c874b05b1a8e4dc',
 'Content-Type: text/plain; charset="UTF-8"',
 '',
 'Hello Navoneel Chakrabarty -',
 '',
 'We really appreciate the time and effort you took to connect with us and',
 'apply for the position: Data Scientist .',
 '',
 'We are no longer reviewing applicants for this particular position, but',
 "that doesn't necessarily mean this is the end. There are many openings at",
 "PayPal and we're always looking for smart, qualified and motivated",
 "applicants. If you see another interesting position, please apply and we'll",
 'get back in touch!',
 '',
 '',
 'Thanks,',
 'Global Talent Acquisition',
 '',
 '--0000000000001c874b05b1a8e4dc',
 'Content-Type: text/html; charset="UTF-8"',
 'Content-Transfer-Encoding: quoted-printable',
 '',
 '<div dir=3D"ltr"><div style=3D"font-size:13px;color:rgb(29,34,40);font-fami=',
 'ly:&quot;Helvetica Neue&quot;,Helvetica,Arial,sans-serif">Hello Navoneel Ch=',
 'akrabarty -<br><br>We really appreciate the time and effort yo