In [51]:
# Converts saved 6sense e-mail HTML files (*.html / *.htm in the working directory) into one Excel file
# Creation date: 16 Jan 2023
# Last modification: 26 Jan 2023
# Made by: Kamil Smolag

from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import glob
import datetime as dt

# --Functions--
def _clean_text(tag):
    """Return the normalised text of a BeautifulSoup tag, or "" when `tag` is None.

    Normalisation removes carriage returns and double spaces and turns each
    newline into a single space.
    """
    if tag is None:
        return ""
    return tag.text.replace("\r", "").replace("  ", "").replace("\n", " ")


def scrap_email(url):
    """Parse one saved 6sense e-mail and append one row per company to the module-level lists.

    Parameters
    ----------
    url : str
        Path of the HTML file to read (opened in binary mode).

    Returns
    -------
    None
        Results are appended to the module-level lists activities_date_list,
        names_list, websites_list, locations_list, comments_list,
        buying_stage_list, profile_fit_list, account_reach_list and
        keyword1_list .. keyword5_list. Every company adds exactly one entry to
        each list (an empty string when a field is not found), so the lists
        always stay the same length.
    """
    # `with` closes the file handle even when parsing fails.
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed;
    # consider pinning one (e.g. "lxml") so results do not vary between machines.
    with open(url, 'rb') as html_file:
        soup = BeautifulSoup(html_file.read())

    sense_table = soup.find('table', {"style": re.compile(r"^max-width:565.5pt;border-collapse:collapse;mso-yfti-tbllook:1184")}) # Fetching 6sense email table
    company_table = soup.find_all('table', {"style": re.compile(r"^width:100.0%;mso-cellspacing:0in;background:white;")})

    # The right <p> is hard to target directly, so every paragraph is checked.
    # The last paragraph that contains the marker wins; paragraphs without the
    # marker no longer reset the value, and "" is used when nothing matches.
    activities_date = ""
    if sense_table is not None:
        for p in sense_table.find_all("p", {"class": "MsoNormal"}):
            parts = p.text.split("Activities from : ")
            if len(parts) > 1:
                activities_date = parts[1] # Taking only the date period

    for company in company_table:
        name = _clean_text(company.find('span', {"style": "font-size:13.5pt"}))

        # Website and location share one <p>, formatted as "website,location".
        link = company.find('span', {"style": "color:#0082D4;text-decoration:none;text-underline:none"})
        paragraph = link.find_previous("p") if link is not None else None
        website_and_location = _clean_text(paragraph).split(",")
        website = website_and_location[0]
        location = website_and_location[1] if len(website_and_location) > 1 else ""

        comment = _clean_text(company.find('span', {"style": "font-size:11.5pt;color:#001F32"}))

        # Stage, fit and reach all use the same span style, so they are told
        # apart by their label. The first non-empty value per label is kept.
        buying_stage = profile_fit = account_reach = ""
        for item in company.find_all('span', {"style": "font-size:11.5pt;color:black"}):
            text = _clean_text(item)
            lowered = text.lower()
            if "buying stage:" in lowered:
                buying_stage = buying_stage or text.replace("Buying Stage: ","")
            elif "profile fit:" in lowered:
                profile_fit = profile_fit or text.replace("Profile Fit: ","")
            elif "account reach:" in lowered:
                account_reach = account_reach or text.replace("Account Reach: ","")

        # Keep at most five keywords (dropping any " (...)" suffix) and pad with
        # "" so each of the five keyword columns receives exactly one value.
        keyword_spans = company.find_all('span', {"style": re.compile(r"^font-size:10.5pt;color:#505C62;")})
        keywords = [_clean_text(span).split(" (")[0] for span in keyword_spans][:5]
        keywords += [""] * (5 - len(keywords))

        # Append the whole row at once so a failure above cannot leave the
        # column lists with different lengths.
        activities_date_list.append(activities_date)
        names_list.append(name)
        websites_list.append(website)
        locations_list.append(location)
        comments_list.append(comment)
        buying_stage_list.append(buying_stage)
        profile_fit_list.append(profile_fit)
        account_reach_list.append(account_reach)
        keyword_columns = (keyword1_list, keyword2_list, keyword3_list, keyword4_list, keyword5_list)
        for column, keyword in zip(keyword_columns, keywords):
            column.append(keyword)

# --Vars--
# Accumulators for the output columns; scrap_email() appends to these.
# Company identity
names_list, websites_list, locations_list = [], [], []
# 6sense scoring fields
buying_stage_list, profile_fit_list, account_reach_list = [], [], []
# Up to five keywords per company, one list per keyword column
keyword1_list, keyword2_list, keyword3_list = [], [], []
keyword4_list, keyword5_list = [], []
# Free-text activity summary
comments_list = []
# Dates: date_list is filled by the main code, activities_date_list by scrap_email()
date_list, activities_date_list = [], []

# --Main-code--
# Collect every saved e-mail (*.html and *.htm) from the current working directory.
path = os.getcwd()
html_files = glob.glob(os.path.join(path, "*.html"))
html_files += glob.glob(os.path.join(path, "*.htm"))
print("Found", len(html_files), "HTML files")

# scrap_email() appends its results to the module-level column lists.
for url in html_files:
    scrap_email(url)

# Stamp every scraped row with the same "date added" value. It is computed
# once so all rows agree even if the run crosses midnight.
today = dt.datetime.today().strftime("%d-%b-%Y")
date_list.extend([today] * len(names_list))

# Column name -> values. Named `columns` (not `dict`) so the built-in `dict`
# type is not shadowed for the rest of the notebook session.
columns = {
    "Date added": date_list,
    "Activities date": activities_date_list,
    "Name": names_list,
    "Website": websites_list,
    "Location": locations_list,
    "Buying stage": buying_stage_list,
    "Profile fit": profile_fit_list,
    "Account reach": account_reach_list,
    "Keyword1": keyword1_list,
    "Keyword2": keyword2_list,
    "Keyword3": keyword3_list,
    "Keyword4": keyword4_list,
    "Keyword5": keyword5_list,
    "Comment": comments_list,
}

df = pd.DataFrame(columns)
# os.path.join builds the output path with the platform's own separator.
df.to_excel(os.path.join(path, "6sense_master_file.xlsx"), index=False)
print("File saved")
df

Found 52 HTML files
File saved


Unnamed: 0,Date added,Activities date,Name,Website,Location,Buying stage,Profile fit,Account reach,Keyword1,Keyword2,Keyword3,Keyword4,Keyword5,Comment
0,26-Jan-2023,"Dec 07 - Dec 13, 2022",Aramark Corporation,aramark.com,United States,Decision,Moderate,Medium,McAfee,Okta,Aruba,palo alto networks,,"3 Web Visits - 0 known contact , 1 anonymous"
1,26-Jan-2023,"Dec 07 - Dec 13, 2022","Adobe, Inc.",adobe.com,United States,Decision,Moderate,Low,Okta,Aruba,palo alto networks,McAfee,vasco,"1 Web Visit - 0 known contact , 1 anonymous"
2,26-Jan-2023,"Dec 07 - Dec 13, 2022",Ankura,ankura.com,United States,Decision,Moderate,Low,McAfee,sophos,Carbon Black,SentinelOne,Aruba,"1 Web Visit - 0 known contact , 1 anonymous"
3,26-Jan-2023,"Dec 07 - Dec 13, 2022",Elmsford Union Free School District,eufsd.org,United States,Decision,Strong,High,system security,firewall,,,,"1 Web Visit - 0 known contact , 1 anonymous"
4,26-Jan-2023,"Dec 07 - Dec 13, 2022",Improving,improving.com,United States,Decision,Moderate,Medium,system security,network security,,,,1 Active Contact
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
862,26-Jan-2023,"[[[[if gte vml 1]><v:shape id=""_x0000_i1037""\r...",Mount Olive Board Of Education,motsd.org,United States,Consideration,Strong,Low,mfa,,,,,1 Keyword - 2 times researched by 1 anonymous ...
863,26-Jan-2023,"[[[[if gte vml 1]><v:shape id=""_x0000_i1037""\r...",Stratford Public Schools,stratfordk12.org,United States,Consideration,Moderate,Low,,,,,,Partner Intent Activities
864,26-Jan-2023,"[[[[if gte vml 1]><v:shape id=""_x0000_i1037""\r...",Garfield School District,gboe.org,United States,Consideration,Strong,Low,Wi-Fi 6 Routers,Wi-Fi 6,,,,2 Keywords - 2 times researched by 1 anonymous...
865,26-Jan-2023,"[[[[if gte vml 1]><v:shape id=""_x0000_i1037""\r...",Pennsauken Public Schools,pennsauken.net,United States,Consideration,Strong,Low,system security,,,,,1 Keyword - 1 time researched by 1 anonymous u...


In [49]:
# Scratch check: pull the "Activities from" period out of the first *.html file
# in the working directory and append it to activities_date_list.
# NOTE: activities_date_list is not defined in this cell; the cell that defines
# it must have been run first.
path = os.getcwd()
html_files = glob.glob(os.path.join(path, "*.html"))

if html_files: # nothing to inspect when the directory holds no *.html file
    # `with` closes the file handle once the content has been read.
    with open(html_files[0], 'rb') as html_file:
        soup = BeautifulSoup(html_file.read())
    sense_table = soup.find('table', {"style": re.compile(r"^max-width:565.5pt;border-collapse:collapse;mso-yfti-tbllook:1184")}) # Fetching 6sense email table
    # The paragraph list gets its own name so the loop below does not rebind
    # the collection it is iterating over.
    paragraphs = sense_table.find_all("p", {"class": "MsoNormal"}) if sense_table is not None else []
    for p in paragraphs:
        # Splitting on the full delimiter and checking the result avoids an
        # IndexError when the text holds "Activities from" without " : ".
        parts = p.text.split("Activities from : ")
        if len(parts) > 1:
            activities_date = parts[1]
            activities_date_list.append(activities_date)


Dec 07 - Dec 13, 2022 
