# Exploratory Data Analysis 

I'd like to build a model that predicts a company's category based on the website text. Therefore, my EDA will focus on assessing the text data available. 

- Perform detailed EDA on CompanyMerged
- Visualize key aspects of data with notes relevant to model building
- Findings and hypotheses outlined below

In [1]:
import sys
import os
import pkgutil
from inspect import getmembers, isfunction
import pandas as pd

import plotly.express as px

# Dynamically get the current working directory
current_dir = os.getcwd()

# Add the path to utils/ directory, assuming it's one level up from the current working directory
utils_path = os.path.abspath(os.path.join(current_dir, '..', 'utils'))
sys.path.append(utils_path)

# Verify that the utils path is correctly added
print(f"Utils path added: {utils_path}")

# Check that the modules in the utils directory are found
print(f"Modules in utils directory: {[name for _, name, _ in pkgutil.iter_modules([utils_path])]}")

import db_utils as db

# Import helper_functions module after appending the correct path
try:
    import helper_functions as hf
    print("Successfully imported helper_functions.")
except ImportError as e:
    print(f"Failed to import helper_functions: {e}")

# Inspect and list all functions in helper_functions module
helper_funcs = getmembers(hf, isfunction)
print(f"Functions in helper_functions: {helper_funcs}")

# If no functions are found, print a warning message
if not helper_funcs:
    print("Warning: No functions found in helper_functions.py")

# Example: Call a function from helper_functions
if hasattr(hf, 'example_function_1'):
    result = hf.example_function_1()
    print(f"Result from 'example_function_1': {result}")




Utils path added: c:\Users\megan\OneDrive\Documents\GitHub\sqlite_to_analysis_app\utils
Modules in utils directory: ['db_utils', 'helper_functions', 'markdown_writer']
helper_functions.py has been loaded
Successfully imported helper_functions.
Functions in helper_functions: [('expandContractions', <function expandContractions at 0x000001CB1659F820>), ('get_word_net_pos', <function get_word_net_pos at 0x000001CB1659F8B0>), ('join_text_columns', <function join_text_columns at 0x000001CB1659F790>), ('lemmatize_text', <function lemmatize_text at 0x000001CB1659F940>), ('process_text', <function process_text at 0x000001CB1659F9D0>), ('remove_html', <function remove_html at 0x000001CB1659F700>), ('word_count', <function word_count at 0x000001CB132D7310>), ('word_freq', <function word_freq at 0x000001CB1659F5E0>), ('word_tokenize', <function word_tokenize at 0x000001CB147A4670>)]


In [2]:
db_path = "C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/data/combined_data.db"
conn = db.connect_to_db(db_path)

In [3]:
# identify names of tables in the database
db.run_query(conn,"SELECT name FROM sqlite_master WHERE type='table'")

[('CompanyClassification',), ('CompanyDataset',), ('CompanyMerged',)]

In [3]:
# extract data to pandas dataframe
company_merged = pd.read_sql_query("SELECT * FROM CompanyMerged",conn)
# count the words within homepage_text
company_merged['len_homepage_text'] = company_merged['homepage_text'].apply(lambda x: hf.word_count(x) if x is not None else 0)
company_merged.head()

Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,"ardchonell, argyll and bute, united kingdom",united kingdom,1,3,Corporate Services,01546 830261 Crinan · by Lochgilp...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...,3897
1,222,"spot on productions, llc",spotonproductionsllc.com,entertainment,1 - 10,"jackson, mississippi, united states",united states,2,3,"Media, Marketing & Sales",...,Storytelling Brought to Life.,,,,,"We're Philip Scarborough and Tom Beck, the for...",200
2,535,akhand jyoti eye hospital,akhandjyoti.in,hospital & health care,11 - 50,"saran, bihar, india",india,8,11,Healthcare,Donate ...,Eradicate Curable Blindness,"12,600,000#sep#In Low-Income States Of India",Our Girls Help#sep#Donate In Specific Programs...,"why blindness,women empowerment,our impact,abo...",Akhand Jyoti - the largest eye hospital in eas...,"Akhandjyoti, akhand jyoti eye hospital, non-pr...",1426
3,642,lasercare eye center,dfweyes.com,medical practice,1 - 10,"irving, texas, united states",united states,4,11,Healthcare,...,,,,"home,why choose us,new patient information,pat...",,Call 214.574.9600 TODAY for an appointment! Th...,2319
4,675,compumachine inc,compumachine.com,machinery,1 - 10,"danvers, massachusetts, united states",united states,4,9,Industrials,MACHINES & AUTOMATION HOME MACHINE...,,MACHINES & AUTOMATION,,"home,machines,automation,mastercam,services,ab...",,Compumachine is proud to offer CNC Machine Too...,242


In [6]:
# check nulls per row of the merged table
print(f"Total Rows: {len(company_merged)}")
company_merged.isnull().sum(axis=0)

Total Rows: 73124


Company_ID                       0
CompanyName                      0
Website                          0
Industry                         0
Size_Range                       0
Locality                      1745
Country                          0
Current_Employee_Estimate        0
Total_Employee_Estimate          0
Category                         0
homepage_text                    0
h1                           26511
h2                           20055
h3                           28491
nav_link_text                25084
meta_keywords                49474
meta_description              6688
len_homepage_text                0
dtype: int64

In my sample, all companies have some website text. 
- Roughly one third don't have h1-h3 or nav_link_text. 
- meta_keywords is not available for most of my sample, but only about 10% are missing meta_description

It would make sense to join text from all available text fields to expand words available for predicting categories per company.

In [10]:
clean_text = company_merged.loc[company_merged['len_homepage_text']>0]
fig = px.histogram(clean_text, x='len_homepage_text', title="Distribution of Homepage Words")
fig.show()

## Categories Summary

In [11]:
# understand categories available
categories = company_merged['Category'].unique()
print("There are {} categories in CompanyMerged".format(len(categories)))
print(categories)

There are 13 categories in CompanyMerged
['Corporate Services' 'Media, Marketing & Sales' 'Healthcare'
 'Industrials' 'Commercial Services & Supplies' 'Consumer Discretionary'
 'Transportation & Logistics' 'Energy & Utilities' 'Financials'
 'Professional Services' 'Consumer Staples' 'Materials'
 'Information Technology']


In [55]:
# Group by Category 
grouped_df = company_merged.groupby(['Category'], as_index=False)['Website'].nunique()

# Create bar plot
fig = px.bar(
    grouped_df,
    x='Category',
    y='Website',
    color='Category',
    title='Unique Company Websites by Category',
    barmode='group'  # Group bars by industry
)

# Adjust the axes to scale automatically per group
fig.update_yaxes(matches=None)  # This ensures y-axes are independent

# Show the plot
fig.show()

In [56]:
# Group by Category for total word count
grouped_df = company_merged.groupby(['Category'], as_index=False)['len_homepage_text'].sum()

# Create bar plot
fig = px.bar(
    grouped_df,
    x='Category',
    y='len_homepage_text',
    color='Category',
    title='Words by Category',
    barmode='group'  # Group bars by industry
)

# Adjust the axes to scale automatically per group
fig.update_yaxes(matches=None)  # This ensures y-axes are independent

# Show the plot
fig.show()

In [12]:
# visually inspect some examples of the homepage text
# print top 3 examples by word count
for i,row in company_merged.sort_values(by='len_homepage_text',ascending=False)['homepage_text'].iloc[:3].reset_index(drop=True).iteritems():
    print(i,row)

0                    LOADING                               Browse Events    Past Events    News    Event Alerts    Vendor    About Us    How it Works    FAQs    Contact        Publish Event   Publish Press Release                           Register for Dundalk Institute of Technology Admission Program 2019       Blarose Lifestyle & Fashion Expo       Blarose Winter Edit- Lifestyle & Fashion Expo       Blarose Lifestyle and Fashion Expo- Season 3       Global Educators Fest 2018 , 3 AUG 2018 - 4 AUG 2018                    -- Select Sector --  Automobiles  Healthcare  IT & ITeS  Engineering  Services  Cement  Aviation  Startups  Food Industry  Education and Training  Science and Technology  Government  Real Estate  Pharmaceuticals  Media and Entertainment  Financial Services  Consumer Markets  Urban Market  Auto Components  Tourism and Hospitality  Agriculture  Textiles  Manufacturing  Gems and Jewellery  Food & Beverage  Consultancy  Not for Profit  Business Services  Environment  Infr

In [13]:
# print bottom 3 examples by word count
for i,row in company_merged.sort_values(by='len_homepage_text',ascending=True)['homepage_text'].iloc[:10].reset_index(drop=True).iteritems():
    print(i,row)

0 RackCorp.com
1 ÍøÕ¾·ÃÎÊÈÏÖ¤£¬µã»÷Á´½Óºó½«Ìø×ªµ½·ÃÎÊÒ³Ãæ
2 welcome
3 Skip
4 www.gs-co.eu
5 Loading
6 welcome
7 ...
8 welcome
9 Skip


In [5]:
print("Row count with less than 50 words: {}".format(len(company_merged.loc[company_merged['len_homepage_text']<50])))
company_merged.loc[company_merged['len_homepage_text']<20].head()

Row count with less than 50 words: 1516


Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text
17,1809,guelph medical laser skin centre,guelphlaser.com,medical practice,1 - 10,"guelph, ontario, canada",canada,1,1,Healthcare,,,,,,"Laser Hair REmoval, CoolSculpting, Baby Belly,...",Guelph Medical Laser & Skin Centre offer Laser...,5
55,5175,new era debt solutions,neweradebtsolutions.com,financial services,1 - 10,"camarillo, california, united states",united states,1,2,Financials,,,,,,,,2
111,10819,"live edge media, llc",live-edge-media.com,photography,1 - 10,"cleona, pennsylvania, united states",united states,1,1,"Media, Marketing & Sales",,,,,,,Live Edge Media photography and videography. S...,7
196,21723,dominion lending centres clearmortgage.ca,clearmortgage.ca,financial services,1 - 10,"penticton, british columbia, canada",canada,2,2,Financials,,,,,,"mortgages, rates, broker, mortgage, lender, ba...",Mortgage Brokers,5
210,22706,7 accounts - xero accountants in london and ch...,7accounts.uk,accounting,1 - 10,"chichester, west sussex, united kingdom",united kingdom,2,2,Professional Services,,,,,,,Get online with Website Builder! Create a free...,5


While cases where homepage_text was null have been removed, there are still examples where the text will be empty or have too few words to use.

Will need to clean:
- punctuation
- unicode
- html formatting
- stopwords
- contractions
- indentations, paragraphs etc

In [23]:
# Check the distribution of Total_Employee_Estimate per Category
# Calculate the 5th and 95th percentiles
lower_bound = company_merged['Total_Employee_Estimate'].quantile(0.05)
upper_bound = company_merged['Total_Employee_Estimate'].quantile(0.95)

print("upper bound: {}".format(upper_bound))
print("lower_bound: {}".format(lower_bound))

# Filter the DataFrame to remove outliers
filtered_df = company_merged[(company_merged['Total_Employee_Estimate'] >= lower_bound) & 
                  (company_merged['Total_Employee_Estimate'] <= upper_bound)]

# Create faceted charts
fig = px.histogram(
    filtered_df,
    x='Total_Employee_Estimate',
    color='Category',
    facet_col='Category',
    title='Distribution of Total Employee Estimate per Category'
)

# Show the plot
fig.show()

upper bound: 127.0
lower_bound: 1.0


In [24]:
# Check the distribution of Total_Employee_Estimate per Category
# Calculate the 5th and 95th percentiles
lower_bound = company_merged['Current_Employee_Estimate'].quantile(0.05)
upper_bound = company_merged['Current_Employee_Estimate'].quantile(0.95)

print("upper bound: {}".format(upper_bound))
print("lower_bound: {}".format(lower_bound))

# Filter the DataFrame to remove outliers
filtered_df = company_merged[(company_merged['Current_Employee_Estimate'] >= lower_bound) & 
                  (company_merged['Current_Employee_Estimate'] <= upper_bound)]

# Create faceted charts
fig = px.histogram(
    filtered_df,
    x='Current_Employee_Estimate',
    color='Category',
    facet_col='Category',
    title='Distribution of Current Employee Estimate per Category'
)

# Show the plot
fig.show()

upper bound: 57.0
lower_bound: 0.0


Consumer Discretionary, Industrials, and Materials are least represented in this dataset. However, there isn't a large enough skew amongst existing categories to make me significantly alter the distribution. I will test classificaiton with data as is for the time being. 

## Text Cleaning for Further Analysis

1. Merge text from all text fields into one extended string
2. Remove HTML tags
3. Update contractions
4. Remove punctuation
5. Remove stopwords

In [4]:
# Join text from specified columns
columns_to_merge = ['homepage_text', 'h1', 'h2','h3','nav_link_text','meta_keywords','meta_description']
company_merged['Full_Text'] = hf.join_text_columns(company_merged, columns_to_merge, separator=' ')
# count the words within homepage_text
company_merged['len_Full_Text'] = company_merged['Full_Text'].apply(lambda x: hf.word_count(x) if x is not None else 0)
print('Rows with less than 50 words of full text: {}'.format(len(company_merged.loc[company_merged['len_Full_Text']<50])))
company_merged.head()


Rows with less than 50 words of full text: 958


Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text,Full_Text,len_Full_Text
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,"ardchonell, argyll and bute, united kingdom",united kingdom,1,3,Corporate Services,01546 830261 Crinan · by Lochgilp...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...,3897,01546 830261 Crinan · by Lochgilp...,4095
1,222,"spot on productions, llc",spotonproductionsllc.com,entertainment,1 - 10,"jackson, mississippi, united states",united states,2,3,"Media, Marketing & Sales",...,Storytelling Brought to Life.,,,,,"We're Philip Scarborough and Tom Beck, the for...",200,...,230
2,535,akhand jyoti eye hospital,akhandjyoti.in,hospital & health care,11 - 50,"saran, bihar, india",india,8,11,Healthcare,Donate ...,Eradicate Curable Blindness,"12,600,000#sep#In Low-Income States Of India",Our Girls Help#sep#Donate In Specific Programs...,"why blindness,women empowerment,our impact,abo...",Akhand Jyoti - the largest eye hospital in eas...,"Akhandjyoti, akhand jyoti eye hospital, non-pr...",1426,Donate ...,1532
3,642,lasercare eye center,dfweyes.com,medical practice,1 - 10,"irving, texas, united states",united states,4,11,Healthcare,...,,,,"home,why choose us,new patient information,pat...",,Call 214.574.9600 TODAY for an appointment! Th...,2319,...,2504
4,675,compumachine inc,compumachine.com,machinery,1 - 10,"danvers, massachusetts, united states",united states,4,9,Industrials,MACHINES & AUTOMATION HOME MACHINE...,,MACHINES & AUTOMATION,,"home,machines,automation,mastercam,services,ab...",,Compumachine is proud to offer CNC Machine Too...,242,MACHINES & AUTOMATION HOME MACHINE...,278


In [5]:
company_merged['Full_Text'][0]

'            01546 830261  Crinan\xa0·\xa0by Lochgilphead\xa0·\xa0PA31 8SR                 Home Hotel History The Ryan Family Awards Reviews Crinan from the air Accommodation Rooms at Crinan Classic Double Balcony Twin / Double Superior Twin / Double Rates and Reservations Yours Exclusively Dogs are welcome Facilities and Services Food & Drink Lock 16 The Westward Crinan Seafood Bar The Pub Crinan Coffee Shop Sample Menus & Wine List Weddings Romantic Breaks Our Secret Garden Crinan Fine Art Art and Music weekends Fine Art Prints For Sale Crinan Gallery Exhibitions Frances Macdonald Ross Ryan Painting Holidays Sleep with the Art Activities & Boat Trips Boat trips on the Sgarbh The Corryvreckan Whirlpool Golf near Crinan Health & Beauty Heart of Argyll Wildlife Organisation History and Heritage Knapdale Beavers at Barnluasgan Kilmartin Glen and Kilmartin Museum Tarbert on Loch Fyne Visitor Attractions Walking at Crinan Whisky Distilleries Upcoming events Special offers Gift vouchers Tra

In [6]:
# remove cases that have less than 50 words to classify them with across available website text
df_clean = company_merged.loc[company_merged['len_Full_Text']>50] # must have at least 50 words

# remove HTML tags
# expand contractions
# remove punctuation and numbers
# remove stopwords
df_clean['clean_text'] = df_clean['Full_Text'].apply(hf.process_text)
df_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['clean_text'] = df_clean['Full_Text'].apply(hf.process_text)


Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,...,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text,Full_Text,len_Full_Text,clean_text
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,"ardchonell, argyll and bute, united kingdom",united kingdom,1,3,Corporate Services,...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...,3897,01546 830261 Crinan · by Lochgilp...,4095,"[crinan, ·, lochgilphead, ·, pa, sr, hotel, hi..."
1,222,"spot on productions, llc",spotonproductionsllc.com,entertainment,1 - 10,"jackson, mississippi, united states",united states,2,3,"Media, Marketing & Sales",...,Storytelling Brought to Life.,,,,,"We're Philip Scarborough and Tom Beck, the for...",200,...,230,"[reels, work, storytelling, brought, life, phi..."
2,535,akhand jyoti eye hospital,akhandjyoti.in,hospital & health care,11 - 50,"saran, bihar, india",india,8,11,Healthcare,...,Eradicate Curable Blindness,"12,600,000#sep#In Low-Income States Of India",Our Girls Help#sep#Donate In Specific Programs...,"why blindness,women empowerment,our impact,abo...",Akhand Jyoti - the largest eye hospital in eas...,"Akhandjyoti, akhand jyoti eye hospital, non-pr...",1426,Donate ...,1532,"[donate, gift, someone, sight, support, girl, ..."
3,642,lasercare eye center,dfweyes.com,medical practice,1 - 10,"irving, texas, united states",united states,4,11,Healthcare,...,,,,"home,why choose us,new patient information,pat...",,Call 214.574.9600 TODAY for an appointment! Th...,2319,...,2504,"[lasik, hotline, main, number, toll, free, irv..."
4,675,compumachine inc,compumachine.com,machinery,1 - 10,"danvers, massachusetts, united states",united states,4,9,Industrials,...,,MACHINES & AUTOMATION,,"home,machines,automation,mastercam,services,ab...",,Compumachine is proud to offer CNC Machine Too...,242,MACHINES & AUTOMATION HOME MACHINE...,278,"[machines, automation, machines, automation, m..."


In [17]:
df_test = df_clean.copy()
# ' '.join(df_test['clean_text'][0])
df_clean['len_clean_text'] = df_clean['clean_text'].apply(lambda x: hf.word_count(' '.join(x)) if x is not None else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['len_clean_text'] = df_clean['clean_text'].apply(lambda x: hf.word_count(' '.join(x)) if x is not None else 0)


In [7]:
#Top 20 most frequent words for all websites
# review stopwords as needed
cl_text_list = df_clean['clean_text'].tolist()
wf = hf.word_freq(cl_text_list, 20)
wf.head(20)

Unnamed: 0,0,1
0,business,170490
1,insurance,147385
2,solutions,101822
3,team,101329
4,management,97645
5,read,93514
6,products,91372
7,learn,84561
8,–,84323
9,care,82428


In [12]:
# check through top words and output results to excel file
# update punctuation list and stopwords after reviewing these words
# want to capture all punctuation and general "website" words that don't provide insight into the actual industry
df_list = []
categories = df_clean['Category'].unique()

for i in categories:
    df_cat = df_clean.loc[df_clean['Category']==i]
    cl_text_list = df_cat['clean_text'].tolist()
    wf = hf.word_freq(cl_text_list, 50)
    wf = wf.rename(columns={'0':'word','1':'count'})
    wf['Category']=i
    df_list.append(wf)

df_result = pd.concat(df_list)

In [13]:
# df_result.to_excel("C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/output/top_word.xlsx")

## website EDA part 2

In [20]:
print("Average Words in Cleaned Text:")
avg_word_count = df_clean.groupby('Category')['len_clean_text'].mean().reset_index()
avg_word_count

Average Words in Cleaned Text:


Unnamed: 0,Category,len_clean_text
0,Commercial Services & Supplies,450.570882
1,Consumer Discretionary,567.355332
2,Consumer Staples,442.50019
3,Corporate Services,535.352504
4,Energy & Utilities,444.343559
5,Financials,516.874588
6,Healthcare,525.013366
7,Industrials,451.610912
8,Information Technology,515.83247
9,Materials,449.16597


In [22]:
# Group by Category for total word count
tot_word_count = df_clean.groupby(['Category'], as_index=False)['len_clean_text'].sum()

# Create bar plot
fig = px.bar(
    tot_word_count,
    x='Category',
    y='len_clean_text',
    color='Category',
    title='Total Clean Words by Category',
    barmode='group'  # Group bars by industry
)

# Adjust the axes to scale automatically per group
fig.update_yaxes(matches=None)  # This ensures y-axes are independent

# Show the plot
fig.show()

In [29]:
# distribution of clean word count by category
# Calculate the 5th and 95th percentiles
# lower_bound = df_clean['len_clean_text'].quantile(0.1)
# upper_bound = df_clean['len_clean_text'].quantile(0.90)

# print("upper bound: {}".format(upper_bound))
# print("lower_bound: {}".format(lower_bound))

# # Filter the DataFrame to remove outliers
# filtered_df_clean = df_clean[(df_clean['len_clean_text'] >= lower_bound) & 
#                   (df_clean['len_clean_text']  <= upper_bound)]

# Create faceted charts
fig = px.histogram(
    df_clean.loc[df_clean['len_clean_text']<5000],
    x='len_clean_text',
    color='Category',
    facet_col='Category',
    title='Distribution of Total Word Count per Category'
)

# Show the plot
fig.show()

In [8]:
def split_dataframe_by_category(df, split_column='Category'):
    """
    Splits a DataFrame into multiple DataFrames based on unique values in a category column.
    
    Args:
    - df (df): pandas DataFrame
    - split_column (str): the column name to split the DataFrame on (default is 'Category')
    
    Returns:
    - A dictionary where the keys are unique categories, and the values are DataFrames
    """
    unique_categories = df[split_column].unique()  # Get unique categories
    category_dfs = {category: df[df[split_column] == category].copy() for category in unique_categories}
    
    return category_dfs

In [11]:
category_dfs = split_dataframe_by_category(df_clean, split_column='Category')
category_dfs['Corporate Services']

# ['Corporate Services' 'Media, Marketing & Sales' 'Healthcare'
#  'Industrials' 'Commercial Services & Supplies' 'Consumer Discretionary'
#  'Transportation & Logistics' 'Energy & Utilities' 'Financials'
#  'Professional Services' 'Consumer Staples' 'Materials'
#  'Information Technology']

Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Locality,Country,Current_Employee_Estimate,Total_Employee_Estimate,Category,...,h1,h2,h3,nav_link_text,meta_keywords,meta_description,len_homepage_text,Full_Text,len_Full_Text,clean_text
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,"ardchonell, argyll and bute, united kingdom",united kingdom,1,3,Corporate Services,...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...,3897,01546 830261 Crinan · by Lochgilp...,4095,"[crinan, ·, lochgilphead, ·, pa, sr, hotel, hi..."
9,1034,oyster bay seafood restaurant,oysterbayseafoodrestaurant.com,restaurants,1 - 10,"winston salem, north carolina, united states",united states,0,4,Corporate Services,...,,,Monthly Specials#sep#Follow us on,,"Oyster bay, Oysterbay, Oyster, Seafoods, Great...",Cebu Seafood Restaurant is all about Great foo...,365,Contact us Gallery B...,405,"[gallery, banquets, buffet, menu, breakfast, b..."
35,3302,market share property,marketshareproperty.com.au,real estate,1 - 10,"croydon north, victoria, australia",australia,1,1,Corporate Services,...,,Dave speaks about Social media in Elite Agent ...,Recently Sold#sep#The Online Offer Management ...,"home,buy,residential,open for inspection,prior...",,Market Share Property offer real estate for sa...,1837,Home Buy ...,1910,"[buy, browse, properties, residential, open, i..."
47,4656,farmhouse inn and restaurant,farmhouseinn.com,hospitality,11 - 50,"forestville, california, united states",united states,23,57,Corporate Services,...,Everything You've Heard is True,Wine Country Events#sep#A DINING EXPERIENCE WR...,,"rooms,specials,gift cards,wine country,restaur...",,Farmhouse Inn is a romantic and rustic boutiqu...,1161,Farmhouse Inn Rooms Spec...,1254,"[farmhouse, inn, rooms, specials, gift, cards,..."
66,6012,park avenue villas hotel,parkavenuevillas.com,hospitality,1 - 10,"tallahassee, florida, united states",united states,1,1,Corporate Services,...,,Latest News & Updates#sep#Book Directly with U...,Welcome to Park Avenue Villas#sep#Reservation ...,"home,accommodations,1 bedroom condo suites,2 b...",,,1262,info@parkavenuevillas.com / 505-8837...,1371,"[info, parkavenuevillas.com, accommodations, b..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73037,7163750,oceana hotels,oceanahotels.co.uk,hospitality,11 - 50,,united kingdom,18,58,Corporate Services,...,BOOK DIRECT AND SAVE#sep#Oceana Hotels Bournem...,,Oceana Hotels,"home,careers,contact us,why book direct?",,Oceana Hotels is a group of resort-type hotels...,1145,BOOK DIRECT AND SAVE Save on y...,1193,"[book, direct, save, save, room, rate, spa, tr..."
73079,7169558,the georgian terrace hotel,thegeorgianterrace.com,hospitality,11 - 50,"atlanta, georgia, united states",united states,25,99,Corporate Services,...,Welcome to The Georgian Terrace,See what Guests are saying#sep#SpecialsStay 2 ...,Sotherly Radio#sep#Exceptional Extended Stays#...,"hotel,rooms + suites,specials,photos,drink + e...",,The Georgian Terrace is a modern luxury hotel ...,1288,The Georgian Terrace (40...,1383,"[georgian, terrace, rate, guarantee, checkin, ..."
73107,7172369,key property realty,kprhomes.com,real estate,1 - 10,"kennesaw, georgia, united states",united states,5,29,Corporate Services,...,The domain name \n kprhomes.com\n...,,Safe Payments By Adyen#sep#Fast Domain Transfe...,,,The domain name kprhomes.com is for sale. Make...,671,The domain name kprhomes.com is for sale! ...,742,"[domain, name, kprhomes.com, sale, ns, listed,..."
73118,7172834,hands-on staffing ltd,hands-on-staffing.co.uk,staffing and recruiting,1 - 10,"orlando, florida, united states",united states,1,1,Corporate Services,...,Welcome!,Why Do I Need,Follow Us#sep#Twitter#sep#Contacts#sep#London’...,"home,about,staff,contacts","london, event management, catering, staffing, ...",London’s newest events staffing agency.,340,London’s newest events staffing agency ...,374,"[london, newest, events, staffing, agency, men..."


## Model Development

In [30]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [40]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [42]:
# Combine the list of words in 'clean_text' back into a string for TF-IDF
df_clean['clean_text_str'] = df_clean['clean_text'].apply(lambda x: ' '.join(x))

#Turning the labels into numbers
label_encoder = LabelEncoder()
df_clean['Category_Num'] = label_encoder.fit_transform(df_clean['Category'])
print(df_clean['Category'].unique())
print(df_clean['Category_Num'].unique())

# Split the data into features (X) and labels (y)
X = df_clean['clean_text_str']
y = df_clean['Category_Num']

print (X.shape)
print(y.shape)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit TF-IDF on the training data
tfidf = TfidfVectorizer(max_features=100, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



['Corporate Services' 'Media, Marketing & Sales' 'Healthcare'
 'Industrials' 'Commercial Services & Supplies' 'Consumer Discretionary'
 'Transportation & Logistics' 'Energy & Utilities' 'Financials'
 'Professional Services' 'Consumer Staples' 'Materials'
 'Information Technology']
[ 3 10  6  7  0  1 12  4  5 11  2  9  8]
(72141,)
(72141,)


In [43]:
# Define multiple models to test
models = {
    'Naive Bayes': MultinomialNB(),
    'Support Vector Classifier': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}


# Evaluate each model
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    
    print(f"\n=== {model_name} ===")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Cross-validate the models
for model_name, model in models.items():
    scores = cross_val_score(model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
    print(f"{model_name} Cross-Validation Accuracy: {scores.mean():.4f}")



=== Naive Bayes ===
                                precision    recall  f1-score   support

Commercial Services & Supplies       0.53      0.58      0.56      1574
        Consumer Discretionary       0.67      0.04      0.08       693
              Consumer Staples       0.53      0.68      0.60      1341
            Corporate Services       0.52      0.57      0.54      1702
            Energy & Utilities       0.81      0.66      0.73      1385
                    Financials       0.81      0.72      0.76      1641
                    Healthcare       0.72      0.84      0.77      1765
                   Industrials       0.49      0.37      0.42       839
        Information Technology       0.61      0.77      0.68      1502
                     Materials       0.54      0.15      0.23       663
      Media, Marketing & Sales       0.63      0.67      0.65      1559
         Professional Services       0.74      0.67      0.70      1820
    Transportation & Logistics       0.40 

In [None]:


X_train, X_test, y_train, y_test = train_test_split(subjects['question_text'], subjects['actual_subject'], 
                                                    random_state = 0)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)