# Exploratory Data Analysis 

I'd like to build a model that predicts a company's category based on the website text. Therefore, my EDA will focus on assessing the text data available. 

- Perform detailed EDA on CompanyMerged
- Visualize key aspects of data with notes relevant to model building
- Findings and hypotheses outlined below

In [1]:
import sys
import os
import pkgutil
from inspect import getmembers, isfunction
import pandas as pd

import plotly.express as px

# Locate the project's utils/ directory, assumed to sit one level above the
# current working directory (the notebook's folder).
current_dir = os.getcwd()
utils_path = os.path.abspath(os.path.join(current_dir, '..', 'utils'))

# Re-running this cell should not stack duplicate entries on sys.path.
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Verify that the utils path is correctly added
print(f"Utils path added: {utils_path}")

# Check that the modules in the utils directory are found
print(f"Modules in utils directory: {[name for _, name, _ in pkgutil.iter_modules([utils_path])]}")

import db_utils as db

# Import helper_functions only after utils_path is importable. Everything
# below (and the rest of the notebook) needs `hf`, so report the failure and
# re-raise instead of continuing into a confusing NameError.
try:
    import helper_functions as hf
except ImportError as e:
    print(f"Failed to import helper_functions: {e}")
    raise
else:
    print("Successfully imported helper_functions.")

# Inspect and list all functions in helper_functions module
helper_funcs = getmembers(hf, isfunction)
print(f"Functions in helper_functions: {helper_funcs}")

# If no functions are found, print a warning message
if not helper_funcs:
    print("Warning: No functions found in helper_functions.py")

# Example: call a function from helper_functions, only if it actually exists
if hasattr(hf, 'example_function_1'):
    result = hf.example_function_1()
    print(f"Result from 'example_function_1': {result}")




Utils path added: c:\Users\megan\OneDrive\Documents\GitHub\sqlite_to_analysis_app\utils
Modules in utils directory: ['db_utils', 'helper_functions', 'markdown_writer']
helper_functions.py has been loaded
Successfully imported helper_functions.
Functions in helper_functions: [('expandContractions', <function expandContractions at 0x000002078A841EE0>), ('join_text_columns', <function join_text_columns at 0x000002078A841E50>), ('process_text', <function process_text at 0x000002078A841F70>), ('remove_html', <function remove_html at 0x000002078A841DC0>), ('word_count', <function word_count at 0x00000207873CECA0>), ('word_freq', <function word_freq at 0x000002078A841CA0>), ('word_tokenize', <function word_tokenize at 0x0000020788A4C5E0>)]


In [2]:
# Resolve the SQLite database relative to the working directory instead of a
# machine-specific absolute path. This mirrors how utils_path is derived: the
# data/ folder is a sibling of utils/, one level above the notebook's folder.
db_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'combined_data.db'))
conn = db.connect_to_db(db_path)

In [8]:
# List every table registered in the SQLite schema catalogue
table_names_query = "SELECT name FROM sqlite_master WHERE type='table'"
db.run_query(conn, table_names_query)

[('CompanyClassification',), ('CompanyDataset',), ('CompanyMerged',)]

In [3]:
# Pull the whole CompanyMerged table into a DataFrame
company_merged = pd.read_sql_query("SELECT * FROM CompanyMerged", conn)

# Word count of homepage_text per company; a missing (None) text counts as 0 words
company_merged['len_homepage_text'] = company_merged['homepage_text'].map(
    lambda text: 0 if text is None else hf.word_count(text)
)
company_merged.head()

Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,"ardchonell, argyll and bute, united kingdom",united kingdom,1,3,Corporate Services,01546 830261 Crinan · by Lochgilp...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...,3897
1,222,"spot on productions, llc",spotonproductionsllc.com,entertainment,1 - 10,"jackson, mississippi, united states",united states,2,3,"Media, Marketing & Sales",...,Storytelling Brought to Life.,,,,,"We're Philip Scarborough and Tom Beck, the for...",200
2,535,akhand jyoti eye hospital,akhandjyoti.in,hospital & health care,11 - 50,"saran, bihar, india",india,8,11,Healthcare,Donate ...,Eradicate Curable Blindness,"12,600,000#sep#In Low-Income States Of India",Our Girls Help#sep#Donate In Specific Programs...,"why blindness,women empowerment,our impact,abo...",Akhand Jyoti - the largest eye hospital in eas...,"Akhandjyoti, akhand jyoti eye hospital, non-pr...",1426
3,642,lasercare eye center,dfweyes.com,medical practice,1 - 10,"irving, texas, united states",united states,4,11,Healthcare,...,,,,"home,why choose us,new patient information,pat...",,Call 214.574.9600 TODAY for an appointment! Th...,2319
4,675,compumachine inc,compumachine.com,machinery,1 - 10,"danvers, massachusetts, united states",united states,4,9,Industrials,MACHINES & AUTOMATION HOME MACHINE...,,MACHINES & AUTOMATION,,"home,machines,automation,mastercam,services,ab...",,Compumachine is proud to offer CNC Machine Too...,242


In [4]:
# Report the table size, then count missing values in each column
total_rows = len(company_merged)
print(f"Total Rows: {total_rows}")
company_merged.isna().sum()

Total Rows: 73124


Company_ID                       0
CompanyName                      0
Website                          0
Industry                         0
Size_Range                       0
Locality                      1745
Country                          0
Current_Employee_Estimate        0
Total_Employee_Estimate          0
Category                         0
homepage_text                    0
h1                           26511
h2                           20055
h3                           28491
nav_link_text                25084
meta_keywords                49474
meta_description              6688
len_homepage_text                0
dtype: int64

In my sample, all companies have some website text. 
- Roughly one third don't have h1-h3 or nav_link_text. 
- meta_keywords is not available for most of my sample, but only about 10% are missing meta_description

It would make sense to join text from all available text fields to expand words available for predicting categories per company.

In [10]:
# Keep only companies whose homepage text has at least one word, then plot
# the distribution of homepage word counts
has_words = company_merged['len_homepage_text'] > 0
clean_text = company_merged.loc[has_words]
fig = px.histogram(
    clean_text,
    x='len_homepage_text',
    title="Distribution of Homepage Words",
)
fig.show()

## Categories Summary

In [11]:
# Enumerate the distinct target labels present in the Category column
categories = company_merged['Category'].unique()
category_count = len(categories)
print("There are {} categories in CompanyMerged".format(category_count))
print(categories)

There are 13 categories in CompanyMerged
['Corporate Services' 'Media, Marketing & Sales' 'Healthcare'
 'Industrials' 'Commercial Services & Supplies' 'Consumer Discretionary'
 'Transportation & Logistics' 'Energy & Utilities' 'Financials'
 'Professional Services' 'Consumer Staples' 'Materials'
 'Information Technology']


In [55]:
# Number of distinct company websites within each Category
grouped_df = (
    company_merged
    .groupby('Category', as_index=False)['Website']
    .nunique()
)

# One bar per Category, coloured by Category
fig = px.bar(
    grouped_df,
    title='Unique Company Websites by Category',
    x='Category',
    y='Website',
    color='Category',
    barmode='group',
)

# Leave the y-axes unlinked (matches=None) before rendering
fig.update_yaxes(matches=None)

fig.show()

In [56]:
# Total homepage word count summed over all companies in each Category
grouped_df = (
    company_merged
    .groupby('Category', as_index=False)['len_homepage_text']
    .sum()
)

# One bar per Category, coloured by Category
fig = px.bar(
    grouped_df,
    title='Words by Category',
    x='Category',
    y='len_homepage_text',
    color='Category',
    barmode='group',
)

# Leave the y-axes unlinked (matches=None) before rendering
fig.update_yaxes(matches=None)

fig.show()

In [12]:
# Visually inspect some examples of the homepage text:
# print the top 3 examples by word count.
longest_texts = (
    company_merged
    .sort_values(by='len_homepage_text', ascending=False)['homepage_text']
    .iloc[:3]
    .reset_index(drop=True)
)
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, row in longest_texts.items():
    print(i, row)

0                    LOADING                               Browse Events    Past Events    News    Event Alerts    Vendor    About Us    How it Works    FAQs    Contact        Publish Event   Publish Press Release                           Register for Dundalk Institute of Technology Admission Program 2019       Blarose Lifestyle & Fashion Expo       Blarose Winter Edit- Lifestyle & Fashion Expo       Blarose Lifestyle and Fashion Expo- Season 3       Global Educators Fest 2018 , 3 AUG 2018 - 4 AUG 2018                    -- Select Sector --  Automobiles  Healthcare  IT & ITeS  Engineering  Services  Cement  Aviation  Startups  Food Industry  Education and Training  Science and Technology  Government  Real Estate  Pharmaceuticals  Media and Entertainment  Financial Services  Consumer Markets  Urban Market  Auto Components  Tourism and Hospitality  Agriculture  Textiles  Manufacturing  Gems and Jewellery  Food & Beverage  Consultancy  Not for Profit  Business Services  Environment  Infr

In [13]:
# Print the bottom 10 examples by word count (the shortest homepage texts).
shortest_texts = (
    company_merged
    .sort_values(by='len_homepage_text', ascending=True)['homepage_text']
    .iloc[:10]
    .reset_index(drop=True)
)
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, row in shortest_texts.items():
    print(i, row)

0 RackCorp.com
1 ÍøÕ¾·ÃÎÊÈÏÖ¤£¬µã»÷Á´½Óºó½«Ìø×ªµ½·ÃÎÊÒ³Ãæ
2 welcome
3 Skip
4 www.gs-co.eu
5 Loading
6 welcome
7 ...
8 welcome
9 Skip


In [14]:
# Show the companies whose homepage text has fewer than 20 words
company_merged[company_merged['len_homepage_text'] < 20]

Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text
17,1809,guelph medical laser skin centre,guelphlaser.com,medical practice,1 - 10,"guelph, ontario, canada",canada,1,1,Healthcare,,,,,,"Laser Hair REmoval, CoolSculpting, Baby Belly,...",Guelph Medical Laser & Skin Centre offer Laser...,5
55,5175,new era debt solutions,neweradebtsolutions.com,financial services,1 - 10,"camarillo, california, united states",united states,1,2,Financials,,,,,,,,2
111,10819,"live edge media, llc",live-edge-media.com,photography,1 - 10,"cleona, pennsylvania, united states",united states,1,1,"Media, Marketing & Sales",,,,,,,Live Edge Media photography and videography. S...,7
196,21723,dominion lending centres clearmortgage.ca,clearmortgage.ca,financial services,1 - 10,"penticton, british columbia, canada",canada,2,2,Financials,,,,,,"mortgages, rates, broker, mortgage, lender, ba...",Mortgage Brokers,5
210,22706,7 accounts - xero accountants in london and ch...,7accounts.uk,accounting,1 - 10,"chichester, west sussex, united kingdom",united kingdom,2,2,Professional Services,,,,,,,Get online with Website Builder! Create a free...,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72727,7128791,institute for energy and sustainability,energyandsustainability.com,renewables & environment,1 - 10,"worcester, massachusetts, united states",united states,1,11,Energy & Utilities,,,,,,"Worcester, Incubator, Business, Startup, Clean...",Worcester CleanTech Incubator,5
72750,7130805,kerry erb computer services,kerryerb.com,information technology and services,1 - 10,"joliet, illinois, united states",united states,0,1,Information Technology,,,,,,"joliet,service,repair,computer,pc,mac,services...",,12
72868,7143064,dlee transportation,dleetransport.com,transportation/trucking/railroad,1 - 10,"kansas city, missouri, united states",united states,1,2,Transportation & Logistics,Skip,,,,,"ltl, truckload, freight, shipping, intermodal,...",Driving Logistics Solutions,1
72983,7158497,bouzies bakery,bouziesbakery.com,food production,1 - 10,"spokane, washington, united states",united states,0,2,Consumer Staples,,,,,,"Bouzies, bouzie, bakery, artisan bread, Spokan...","Bouzies Bakery opened on a great, cold day in ...",5


While cases where homepage_text was null have been removed, there are still examples where the text will be empty or have too few words to use.

Will need to clean:
- punctuation
- unicode
- html formatting
- stopwords
- contractions
- indentations, paragraphs etc

In [23]:
# Distribution of Total_Employee_Estimate per Category, restricted to the
# 5th-95th percentile range so extreme values do not dominate the charts
total_estimate = company_merged['Total_Employee_Estimate']
lower_bound = total_estimate.quantile(0.05)
upper_bound = total_estimate.quantile(0.95)

print("upper bound: {}".format(upper_bound))
print("lower_bound: {}".format(lower_bound))

# Keep rows whose estimate lies within [lower_bound, upper_bound], both ends inclusive
filtered_df = company_merged[total_estimate.between(lower_bound, upper_bound)]

# One histogram facet (column) per Category
fig = px.histogram(
    filtered_df,
    title='Distribution of Total Employee Estimate per Category',
    x='Total_Employee_Estimate',
    facet_col='Category',
    color='Category',
)

fig.show()

upper bound: 127.0
lower_bound: 1.0


In [24]:
# Distribution of Current_Employee_Estimate per Category, restricted to the
# 5th-95th percentile range so extreme values do not dominate the charts
current_estimate = company_merged['Current_Employee_Estimate']
lower_bound = current_estimate.quantile(0.05)
upper_bound = current_estimate.quantile(0.95)

print("upper bound: {}".format(upper_bound))
print("lower_bound: {}".format(lower_bound))

# Keep rows whose estimate lies within [lower_bound, upper_bound], both ends inclusive
filtered_df = company_merged[current_estimate.between(lower_bound, upper_bound)]

# One histogram facet (column) per Category
fig = px.histogram(
    filtered_df,
    title='Distribution of Current Employee Estimate per Category',
    x='Current_Employee_Estimate',
    facet_col='Category',
    color='Category',
)

fig.show()

upper bound: 57.0
lower_bound: 0.0


Consumer Discretionary, Industrials, and Materials are the least represented categories in this dataset. However, the skew among the existing categories isn't large enough for me to significantly alter the distribution. I will test classification with the data as is for the time being. 

## Text Cleaning for Further Analysis

1. Merge text from all text fields into one extended string
2. Remove HTML tags
3. Update contractions
4. Remove punctuation
5. Remove stopwords

In [11]:
# Combine every text-bearing column into one Full_Text string per company
columns_to_merge = [
    'homepage_text',
    'h1',
    'h2',
    'h3',
    'nav_link_text',
    'meta_keywords',
    'meta_description',
]
company_merged['Full_Text'] = hf.join_text_columns(company_merged, columns_to_merge, separator=' ')

# Word count of the combined Full_Text; a missing (None) value counts as 0 words
company_merged['len_Full_Text'] = company_merged['Full_Text'].map(
    lambda text: 0 if text is None else hf.word_count(text)
)
company_merged.head()


Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text,Full_Text
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,"ardchonell, argyll and bute, united kingdom",united kingdom,1,3,Corporate Services,01546 830261 Crinan · by Lochgilp...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...,3897,01546 830261 Crinan · by Lochgilp...
1,222,"spot on productions, llc",spotonproductionsllc.com,entertainment,1 - 10,"jackson, mississippi, united states",united states,2,3,"Media, Marketing & Sales",...,Storytelling Brought to Life.,,,,,"We're Philip Scarborough and Tom Beck, the for...",200,...
2,535,akhand jyoti eye hospital,akhandjyoti.in,hospital & health care,11 - 50,"saran, bihar, india",india,8,11,Healthcare,Donate ...,Eradicate Curable Blindness,"12,600,000#sep#In Low-Income States Of India",Our Girls Help#sep#Donate In Specific Programs...,"why blindness,women empowerment,our impact,abo...",Akhand Jyoti - the largest eye hospital in eas...,"Akhandjyoti, akhand jyoti eye hospital, non-pr...",1426,Donate ...
3,642,lasercare eye center,dfweyes.com,medical practice,1 - 10,"irving, texas, united states",united states,4,11,Healthcare,...,,,,"home,why choose us,new patient information,pat...",,Call 214.574.9600 TODAY for an appointment! Th...,2319,...
4,675,compumachine inc,compumachine.com,machinery,1 - 10,"danvers, massachusetts, united states",united states,4,9,Industrials,MACHINES & AUTOMATION HOME MACHINE...,,MACHINES & AUTOMATION,,"home,machines,automation,mastercam,services,ab...",,Compumachine is proud to offer CNC Machine Too...,242,MACHINES & AUTOMATION HOME MACHINE...


In [12]:
# Inspect the combined text of the row labelled 0
company_merged.loc[0, 'Full_Text']

'            01546 830261  Crinan\xa0·\xa0by Lochgilphead\xa0·\xa0PA31 8SR                 Home Hotel History The Ryan Family Awards Reviews Crinan from the air Accommodation Rooms at Crinan Classic Double Balcony Twin / Double Superior Twin / Double Rates and Reservations Yours Exclusively Dogs are welcome Facilities and Services Food & Drink Lock 16 The Westward Crinan Seafood Bar The Pub Crinan Coffee Shop Sample Menus & Wine List Weddings Romantic Breaks Our Secret Garden Crinan Fine Art Art and Music weekends Fine Art Prints For Sale Crinan Gallery Exhibitions Frances Macdonald Ross Ryan Painting Holidays Sleep with the Art Activities & Boat Trips Boat trips on the Sgarbh The Corryvreckan Whirlpool Golf near Crinan Health & Beauty Heart of Argyll Wildlife Organisation History and Heritage Knapdale Beavers at Barnluasgan Kilmartin Glen and Kilmartin Museum Tarbert on Loch Fyne Visitor Attractions Walking at Crinan Whisky Distilleries Upcoming events Special offers Gift vouchers Tra

In [13]:
# Planned cleaning steps for the combined text:
#   - remove HTML tags
#   - expand contractions
#   - remove punctuation and numbers
#   - remove stopwords
# hf.process_text is applied to every Full_Text value. It is presumed to
# tokenize, lowercase, expand contractions, lemmatize, and strip punctuation,
# numbers and stop words -- TODO confirm against helper_functions.py.
df_clean = company_merged.copy()
# The combined column is named 'Full_Text' (with an underscore); indexing
# 'Full Text' raises a KeyError.
df_clean['clean_text'] = df_clean['Full_Text'].apply(hf.process_text)
df_clean.head()

'this is a sample text '