# SQLite Database Exploration

This application takes a SQLite database and outputs results to markdown report. 

Steps:
This notebook inspects the database to identify tables, data structure and optimize through indexing. 

In [1]:
import pandas as pd
import sys
import os

# Add the path to utils/ directory, which is one level up from the /data directory
sys.path.append(os.path.abspath(os.path.join('..', 'utils')))


# Now you can import the db_utils module
import db_utils as db


In [2]:
db_path = "C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/data/combined_data.db"
conn = db.connect_to_db(db_path)

In [3]:
# identify names of tables in the database
db.run_query(conn,"SELECT name FROM sqlite_master WHERE type='table'")

[('CompanyClassification',), ('CompanyDataset',)]

In [4]:
# identify if database is optimized with indexes for CompanyDataset
print(db.run_query(conn,"PRAGMA index_list(CompanyDataset);"))

# identify if database is optimized with indexes for CompanyClassification
print(db.run_query(conn,"PRAGMA index_list(CompanyClassification);"))

[]
[]


In [5]:
print(db.time_query(conn, "SELECT count(*) FROM CompanyDataset"))
print(db.time_query(conn, "SELECT count(*) FROM CompanyClassification"))

(0.12378454208374023, [(7173426,)])
(0.0039997100830078125, [(73974,)])


In [6]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyDataset');")

[(0, 'Unnamed: 0', 'INTEGER', 0, None, 0),
 (1, 'CompanyName', 'TEXT', 0, None, 0),
 (2, 'Website', 'TEXT', 0, None, 0),
 (3, 'year founded', 'REAL', 0, None, 0),
 (4, 'industry', 'TEXT', 0, None, 0),
 (5, 'size range', 'TEXT', 0, None, 0),
 (6, 'locality', 'TEXT', 0, None, 0),
 (7, 'country', 'TEXT', 0, None, 0),
 (8, 'linkedin url', 'TEXT', 0, None, 0),
 (9, 'current employee estimate', 'INTEGER', 0, None, 0),
 (10, 'total employee estimate', 'INTEGER', 0, None, 0)]

In [7]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyClassification');")

[(0, 'Category', 'TEXT', 0, None, 0),
 (1, 'Website', 'TEXT', 0, None, 0),
 (2, 'CompanyName', 'TEXT', 0, None, 0),
 (3, 'homepage_text', 'TEXT', 0, None, 0),
 (4, 'h1', 'TEXT', 0, None, 0),
 (5, 'h2', 'TEXT', 0, None, 0),
 (6, 'h3', 'TEXT', 0, None, 0),
 (7, 'nav_link_text', 'TEXT', 0, None, 0),
 (8, 'meta_keywords', 'TEXT', 0, None, 0),
 (9, 'meta_description', 'TEXT', 0, None, 0)]

In [8]:
# read tables into pandas dataframe
company_dataset = pd.read_sql_query("SELECT * FROM CompanyDataset",conn)
company_classification = pd.read_sql_query("SELECT * FROM CompanyClassification",conn)

In [16]:
print(company_dataset.columns)
print(company_dataset.shape)
company_dataset.head()

Index(['Unnamed: 0', 'CompanyName', 'Website', 'year founded', 'industry',
       'size range', 'locality', 'country', 'linkedin url',
       'current employee estimate', 'total employee estimate'],
      dtype='object')
(7173426, 11)


Unnamed: 0.1,Unnamed: 0,CompanyName,Website,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,4425416,tata consultancy services,tcs.com,1968.0,information technology and services,10001+,"bombay, maharashtra, india",india,linkedin.com/company/tata-consultancy-services,190771,341369
2,21074,accenture,accenture.com,1989.0,information technology and services,10001+,"dublin, dublin, ireland",ireland,linkedin.com/company/accenture,190689,455768
3,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958
4,1558607,ey,ey.com,1989.0,accounting,10001+,"london, greater london, united kingdom",united kingdom,linkedin.com/company/ernstandyoung,158363,428960


In [17]:
print(company_classification.columns)
print(company_classification.shape)
company_classification.head()

Index(['Category', 'Website', 'CompanyName', 'homepage_text', 'h1', 'h2', 'h3',
       'nav_link_text', 'meta_keywords', 'meta_description'],
      dtype='object')
(73974, 10)


Unnamed: 0,Category,Website,CompanyName,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description
0,Commercial Services & Supplies,bipelectric.com,bip dipietro electric inc,Electrici...,,,,,"electricians vero beach, vero beach electrical...","Providing quality, reliable full service resid..."
1,Healthcare,eliasmedical.com,elias medical,site map | en español Elias Medical h...,Offering Bakersfield family medical care from ...,Welcome to ELIAS MEDICAL#sep#Family Medical Pr...,Get To Know Elias Medical#sep#Family Medical P...,,Elias Medical bakersfield ca family doctor med...,For the best value in Bakersfield skin care tr...
2,Commercial Services & Supplies,koopsoverheaddoors.com,koops overhead doors,Home About Us Garage Door Repair & Servi...,,Customer Reviews#sep#Welcome to Koops Overhead...,,,"Koops Overhead Doors, Albany Garage Doors, Tro...","Koops Overhead Doors specializes in the sales,..."
3,Healthcare,midtowneyes.com,midtown eyecare,918-599-0202 Type Size...,,Welcome to our practice!,,,,We would like to welcome you to Midtown Eyecar...
4,Commercial Services & Supplies,reprosecurity.co.uk,repro security ltd,Simply fill out our form below...,,Welcome to REPRO SECURITY Ltd,,,,Repro Security provide a range of tailor made ...


# Optimize tables 
### rename columns for ease in development
There are columns with spaces and one column in CompanyDataset has the nondescript name of "Unnamed: 0". In this case, the column appears to be some kind of company ID.

You can reference a column with spaces in sqlite using a table alias and '' but I'd prefer to clean it up.


### indexing
Columns chosen for indexing based on what I expect to use for filtering, joining, sorting or aggregation. 

That is columns used in:
- WHERE
- JOIN
- ORDER BY
- GROUP BY

There are no indexes set for either CompanyDataset or CompanyClassification, which is really impacting query performance.

In [None]:
# cursor = conn.cursor()
# cursor.execute("ALTER TABLE CompanyDataset RENAME COLUMN 'Unnamed: 0' TO 'Company_ID'")


In [19]:
columns_to_rename = {'Unnamed: 0':'Company_ID',
                     'year founded':'Year_Founded',
                     'industry':'Industry',
                     'size range':'Size_Range',
                     'locality':'Locality',
                     'country':'Country',
                     'linkedin url': 'Linkedin_URL',
                     'current employee estimate':'Current_Employee_Estimate',
                     'total employee estimate':'Total_Employee_Estimate'
                     }

db.rename_columns(conn, 'CompanyDataset', columns_to_rename)

Unnamed: 0 --> Company_ID
year founded --> Year_Founded
industry --> Industry
size range --> Size_Range
locality --> Locality
country --> Country
linkedin url --> Linkedin_URL
current employee estimate --> Current_Employee_Estimate
total employee estimate --> Total_Employee_Estimate


In the cell above, I ran into an issue where the database became locked during development. I was able to resolve this easily by copying the database file to a different location and then Copy/Replace the database file in the /data directory. 

In [20]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyDataset');")

[(0, 'Company_ID', 'INTEGER', 0, None, 0),
 (1, 'CompanyName', 'TEXT', 0, None, 0),
 (2, 'Website', 'TEXT', 0, None, 0),
 (3, 'Year_Founded', 'REAL', 0, None, 0),
 (4, 'Industry', 'TEXT', 0, None, 0),
 (5, 'Size_Range', 'TEXT', 0, None, 0),
 (6, 'Locality', 'TEXT', 0, None, 0),
 (7, 'Country', 'TEXT', 0, None, 0),
 (8, 'Linkedin_URL', 'TEXT', 0, None, 0),
 (9, 'Current_Employee_Estimate', 'INTEGER', 0, None, 0),
 (10, 'Total_Employee_Estimate', 'INTEGER', 0, None, 0)]

In [6]:
# create indexes for both tables
indexes_to_create = {
    'CompanyDataset': ['Company_ID','CompanyName', 'Website','Industry','Size_Range', 'Country','Current_Employee_Estimate','Total_Employee_Estimate']
    ,'CompanyClassification': ['Category', 'CompanyName', 'Website']
}

db.create_indexes(conn, indexes_to_create)


Created index: idx_companydataset_company_id on CompanyDataset(Company_ID)
Created index: idx_companydataset_companyname on CompanyDataset(CompanyName)
Created index: idx_companydataset_website on CompanyDataset(Website)
Created index: idx_companydataset_industry on CompanyDataset(Industry)
Created index: idx_companydataset_size_range on CompanyDataset(Size_Range)
Created index: idx_companydataset_country on CompanyDataset(Country)
Created index: idx_companydataset_current_employee_estimate on CompanyDataset(Current_Employee_Estimate)
Created index: idx_companydataset_total_employee_estimate on CompanyDataset(Total_Employee_Estimate)
Created index: idx_companyclassification_category on CompanyClassification(Category)
Created index: idx_companyclassification_companyname on CompanyClassification(CompanyName)
Created index: idx_companyclassification_website on CompanyClassification(Website)


In [13]:
indexes = pd.DataFrame(db.run_query(conn,"PRAGMA index_list(CompanyDataset);"))
indexes

Unnamed: 0,0,1,2,3,4
0,0,idx_companydataset_total_employee_estimate,0,c,0
1,1,idx_companydataset_current_employee_estimate,0,c,0
2,2,idx_companydataset_country,0,c,0
3,3,idx_companydataset_size_range,0,c,0
4,4,idx_companydataset_industry,0,c,0
5,5,idx_companydataset_website,0,c,0
6,6,idx_companydataset_companyname,0,c,0
7,7,idx_companydataset_company_id,0,c,0


In [7]:
# check that indexes have optimized queries
print(db.time_query(conn, "SELECT count(*) FROM CompanyDataset"))
print(db.time_query(conn, "SELECT count(*) FROM CompanyClassification"))

(0.07013940811157227, [(7173426,)])
(0.002001047134399414, [(73974,)])


# EDA

In [24]:
df_hp = pd.read_sql_query( "SELECT * from CompanyDataset as cd WHERE CompanyName='hewlett-packard'",conn)
df_hp

Unnamed: 0,Company_ID,CompanyName,Website,Year_Founded,Industry,Size_Range,Locality,Country,Linkedin_URL,Current_Employee_Estimate,Total_Employee_Estimate
0,3844889,hewlett-packard,hpe.com,1939.0,information technology and services,10001+,"palo alto, california, united states",united states,linkedin.com/company/hewlett-packard-enterprise,127952,412952
1,1428341,hewlett-packard,ngppr.ru,,real estate,201 - 500,,,linkedin.com/company/нгппр,93,707
2,5805348,hewlett-packard,,,facilities services,51 - 200,"glendale, california, united states",united states,linkedin.com/company/h-and-p-inc-,39,136
3,6195736,hewlett-packard,globalsoftuk.com,,information technology and services,1 - 10,"didcot, oxfordshire, united kingdom",united kingdom,linkedin.com/company/globalsoft-uk-ltd,0,4


In [50]:
df_ibm = pd.read_sql_query( "SELECT * from CompanyDataset as cd WHERE CompanyName='ibm'",conn)
df_ibm

Unnamed: 0,Company_ID,CompanyName,Website,Year_Founded,Industry,Size_Range,Locality,Country,Linkedin_URL,Current_Employee_Estimate,Total_Employee_Estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,1537496,ibm,,,information technology and services,501 - 1000,,,linkedin.com/company/global-value,280,931
2,6903843,ibm,,1912.0,information technology and services,11 - 50,,,linkedin.com/company/ibm-united-kingdom-ltd,7,50
3,2567154,ibm,,1914.0,information technology and services,1 - 10,"colombes, ile-de-france, france",france,linkedin.com/company/ibm-france,3,8
4,939747,ibm,mcgeemarketingconsultants.com,,computer software,1 - 10,,,linkedin.com/company/mindflow,2,6
5,5479744,ibm,ezsource.com,2003.0,computer software,1 - 10,"mevo modi`im, hamerkaz, israel",israel,linkedin.com/company/ezlegacy,1,12
6,5733310,ibm,,,,1 - 10,,,linkedin.com/company/ibmdaf,0,1
7,2519019,ibm,ibmconsult.com,2012.0,marketing and advertising,1 - 10,,,linkedin.com/company/ibmconsult-com,0,2
8,6161271,ibm,,,,1 - 10,,,linkedin.com/company/fiokware,0,1


In [25]:
company_dataset['diff'] = company_dataset['total employee estimate'] - company_dataset['current employee estimate']


In [23]:
company_dataset.sort_values(by='diff', ascending=False).head()

Unnamed: 0.1,Unnamed: 0,CompanyName,Website,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate,diff
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906,442859
5,3844889,hewlett-packard,hpe.com,1939.0,information technology and services,10001+,"palo alto, california, united states",united states,linkedin.com/company/hewlett-packard-enterprise,127952,412952,285000
3,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958,283795
4,1558607,ey,ey.com,1989.0,accounting,10001+,"london, greater london, united kingdom",united kingdom,linkedin.com/company/ernstandyoung,158363,428960,270597
11,2780814,pwc,pwc.com,1998.0,accounting,10001+,"new york, new york, united states",united states,linkedin.com/company/pwc,111372,379447,268075


In [27]:
company_dataset['size range'].unique()

array(['10001+', '5001 - 10000', '1001 - 5000', '501 - 1000', '201 - 500',
       '51 - 200', '11 - 50', '1 - 10'], dtype=object)

# Query Questions

## Output results to excel files under output folder

In [43]:
# def query_to_excel(conn, output_dir, query, output_filename='query_results.xlsx'):
#     """
#     Runs a query on the SQLite database and saves the results to an Excel file in the output folder.
    
#     Parameters:
#     - conn: str, created from connect_to_db function
#     - output_dir: str, path to output file directory
#     - query: str, the SQL query to run.
#     - output_filename: str, the filename for the output Excel file (default is 'query_results.xlsx').
    
#     Returns:
#     - str, path to the saved Excel file.
#     """
#     # Construct the path to the output folder
#     # base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
#     # output_dir = os.path.join(base_dir, 'output')
#     # print(output_dir)
    
#     # Run the query and fetch the result into a DataFrame
#     df = pd.read_sql_query(query, conn)
    
#     # Ensure the output folder exists
#     os.makedirs(output_dir, exist_ok=True)
    
#     # Construct the full path for the output Excel file
#     output_path = os.path.join(output_dir, output_filename)
    
#     # Write the DataFrame to an Excel file
#     df.to_excel(output_path, index=False)
    
#     print(f"Query results successfully saved to: {output_path}")
#     return output_path

## Find the top 10 industries with the highest average number of employees

** only considering companies founded after 2000 that more than 10 employees

Data has multiple rows per company, suggesting each row could represent a different location entity of the corporation. 

Note - need to confirm if the top 10 are by current or total employee count

In [39]:
df_industry= pd.read_sql_query( '''
                                    SELECT 
                                        cd.Industry
                                        ,AVG(cd.Current_Employee_Estimate)      as 'AVG_Current_Employee'
                                        ,AVG(cd.Total_Employee_Estimate)        as 'AVG_Total_Employee'
                                    FROM CompanyDataset as cd
                                    WHERE cd.Year_Founded > 2000
                                    AND cd.'Size_Range' <> '1 - 10'
                                    GROUP BY cd.Industry
                                    ORDER BY 3 desc
                                    LIMIT 10
                           ''',conn)
df_industry

Unnamed: 0,Industry,AVG_Current_Employee,AVG_Total_Employee
0,tobacco,368.017241,906.913793
1,government administration,132.592317,257.164564
2,banking,104.675101,208.69585
3,supermarkets,86.938462,207.730769
4,dairy,103.022222,206.044444
5,legislative office,48.043478,164.086957
6,semiconductors,63.942731,157.331498
7,defense & space,68.672566,151.281416
8,airlines/aviation,67.569304,126.086258
9,telecommunications,54.69171,125.991436


In [47]:
top_total_query = '''
            SELECT 
                cd.Industry
                ,AVG(cd.Current_Employee_Estimate)      as 'AVG_Current_Employee'
                ,AVG(cd.Total_Employee_Estimate)        as 'AVG_Total_Employee'
            FROM CompanyDataset as cd
            WHERE cd.Year_Founded > 2000
            AND cd.Size_Range <> '1 - 10'
            GROUP BY cd.Industry
            ORDER BY 3 desc
            LIMIT 10
        '''
output_dir = 'C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/output'
query_to_excel(conn, output_dir, top_total_query, output_filename='top_industries_by_total_avg_employees.xlsx')

Query results successfully saved to: C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/output\top_industries_by_total_avg_employees.xlsx


'C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/output\\top_industries_by_total_avg_employees.xlsx'

In [34]:
# df_industry.sort_values(by='AVG_Current_Employee', ascending=False)
df_industry.nlargest(10,'AVG_Current_Employee')

Unnamed: 0,Industry,AVG_Current_Employee,AVG_Total_Employee
0,tobacco,368.017241,906.913793
1,government administration,132.592317,257.164564
2,banking,104.675101,208.69585
4,dairy,103.022222,206.044444
3,supermarkets,86.938462,207.730769
7,defense & space,68.672566,151.281416
8,airlines/aviation,67.569304,126.086258
6,semiconductors,63.942731,157.331498
9,telecommunications,54.69171,125.991436
5,legislative office,48.043478,164.086957


## Identify companies in the Technology-like industries 
- limit to companies that do not have an effective homepage_text AND less than 100 employees
- "effective" homepage_text cannot be null and must have at least 50 words
--> I found that 669 websites had no homepage text and most cases have between 200-400 words. 

Because there are only text fields in common between the two tables CompanyDataset and CompanyClassification, I first tested merging the two datasets on 'Website' as a key. 

This results in 322 cases where the CompanyName did not match between the tables. On deeper inspection this appears to be because of character differences (UNICODE, punctuation, etc.). I could clean the CompanyNames on the join clause so that I use both fields as my join criteria. However, I'm satisfied that website can be used as a unique key here



In [89]:
# show companies that DO NOT have an effective homepage AND have less than 100 employees within the Technology-like industry

df_tech = pd.read_sql_query( '''
                                SELECT DISTINCT
                                    cc.Category
                                    ,cd.Industry
                                    ,cd.CompanyName
                                    --,cd.Website
                                    --,cc.homepage_text
                                FROM CompanyDataset as cd
                                LEFT JOIN CompanyClassification as cc
                                    ON cd.Website=cc.Website
                                    and cc.Website is not null
                                WHERE cd.Website is not null
                                and cd.CompanyName is not null
                                and cd.Total_Employee_Estimate < 100
                                and cc.homepage_text is null
                                and cc.Category like '%technology%'
                            ''',conn)

df_tech

Unnamed: 0,Category,Industry,CompanyName
0,Information Technology,information technology and services,itrend business solutions cc
1,Information Technology,computer software,real control solutions ltd
2,Information Technology,computer software,elite value solutions
3,Information Technology,information technology and services,risen solutions
4,Information Technology,information technology and services,wireless 1 apps inc.
5,Information Technology,information technology and services,kuplasolutions
6,Information Technology,information technology and services,w3cloud llc
7,Information Technology,information technology and services,cdn tech solutions
8,Information Technology,computer software,datum global solutions
9,Information Technology,computer software,cloudstream


In [10]:
# read tables into pandas dataframe with new column names
company_dataset = pd.read_sql_query("SELECT * FROM CompanyDataset",conn)
company_classification = pd.read_sql_query("SELECT * FROM CompanyClassification",conn)

In [50]:
print("Number of rows with empty homepage text:", len(company_classification.loc[company_classification['homepage_text'].isna()]))
print(len(company_dataset))
print("Number of companies with less than 100 total employees:", len(company_dataset.loc[company_dataset['Total_Employee_Estimate']<100]))
print("Number of companies with less than 100 current employees:", len(company_dataset.loc[company_dataset['Current_Employee_Estimate']<100]))

Number of rows with empty homepage text: 669
7173426
Number of companies with less than 100 employees: 6925840
Number of companies with less than 100 employees: 7062530


In [72]:
company_classification['Category'].unique()

array(['Commercial Services & Supplies', 'Healthcare', 'Materials',
       'Financials', 'Energy & Utilities', 'Professional Services',
       'Corporate Services', 'Media, Marketing & Sales',
       'Information Technology', 'Consumer Discretionary', 'Industrials',
       'Transportation & Logistics', 'Consumer Staples'], dtype=object)

## Rank companies within each country by their total employee estimate
- descending order by total employee estimate
- show only companies that rank in the top 5 per country


In [48]:
df_top5 = pd.read_sql_query( '''
                                with country_rank as (SELECT 
                                                        cd.Country
                                                        ,cd.CompanyName
                                                        ,cd.Total_Employee_Estimate
                                                        ,row_number() over( partition by cd.Country order by cd.Total_Employee_Estimate desc)   RowNum
                                                    FROM CompanyDataset as cd
                                                    WHERE cd.Country is not null
                                                    and cd.CompanyName is not null
                                                    )
                            SELECT *
                            FROM country_rank 
                            where RowNum <6
                                                        
                            ''',conn)

df_top5

Unnamed: 0,Country,CompanyName,Total_Employee_Estimate,RowNum
0,afghanistan,roshan,986,1
1,afghanistan,awcc,823,2
2,afghanistan,etisalat afghanistan,818,3
3,afghanistan,"ministry of agriculture, irrigation and livestock",508,4
4,afghanistan,mtn afghanistan,423,5
...,...,...,...,...
1144,Åland islands,crosskey banking solutions,277,1
1145,Åland islands,ålands landskapsregering,196,2
1146,Åland islands,rederiaktiebolaget eckerö,116,3
1147,Åland islands,posten åland,94,4


In [49]:
top5_country = '''
                with country_rank as (SELECT 
                                        cd.Country
                                        ,cd.CompanyName
                                        ,cd.Total_Employee_Estimate
                                        ,row_number() over( partition by cd.Country order by cd.Total_Employee_Estimate desc)   RowNum
                                    FROM CompanyDataset as cd
                                    WHERE cd.Country is not null
                                    and cd.CompanyName is not null
                                    )
                SELECT *
                FROM country_rank 
                where RowNum <6                       
                '''
db.run_query(conn, top5_country)

[('afghanistan', 'roshan', 986, 1),
 ('afghanistan', 'awcc', 823, 2),
 ('afghanistan', 'etisalat afghanistan', 818, 3),
 ('afghanistan', 'ministry of agriculture, irrigation and livestock', 508, 4),
 ('afghanistan', 'mtn afghanistan', 423, 5),
 ('albania', 'albtelecom albania', 892, 1),
 ('albania', 'raiffeisen bank albania', 779, 2),
 ('albania', 'telekom albania', 671, 3),
 ('albania', 'intesa sanpaolo bank albania', 447, 4),
 ('albania', 'national food authority', 399, 5),
 ('algeria', 'sonatrach', 17062, 1),
 ('algeria', 'algerie telecom', 3017, 2),
 ('algeria', 'ooredoo algérie', 2318, 3),
 ('algeria', 'groupe cevital', 2298, 4),
 ('algeria', 'naftal spa', 1879, 5),
 ('american samoa', 'american samoa government', 268, 1),
 ('american samoa', 'blue sky communications', 96, 2),
 ('american samoa', 'american samoa community college', 32, 3),
 ('american samoa', 'genexy company limited', 6, 4),
 ('american samoa', 'rda law firm', 3, 5),
 ('andorra', 'crèdit andorrà', 472, 1),
 ('ando

# Merge CompanyDataset and CompanyClassification

- merge both tables into one table
- ensure data is clean and ready for analysis
- load merged dataset to new table in combined_data.db 

In [14]:
company_classification.isnull().sum(axis=0)

Category                0
Website                 0
CompanyName             0
homepage_text         669
h1                  27321
h2                  20762
h3                  29315
nav_link_text       25924
meta_keywords       50302
meta_description     7088
dtype: int64

In [15]:
company_dataset.isnull().sum(axis=0)

Company_ID                         0
CompanyName                        3
Website                      1650621
Year_Founded                 3606980
Industry                      290003
Size_Range                         0
Locality                     2508825
Country                      2349207
Linkedin_URL                       0
Current_Employee_Estimate          0
Total_Employee_Estimate            0
dtype: int64

In [16]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyDataset');")

[(0, 'Company_ID', 'INTEGER', 0, None, 0),
 (1, 'CompanyName', 'TEXT', 0, None, 0),
 (2, 'Website', 'TEXT', 0, None, 0),
 (3, 'Year_Founded', 'REAL', 0, None, 0),
 (4, 'Industry', 'TEXT', 0, None, 0),
 (5, 'Size_Range', 'TEXT', 0, None, 0),
 (6, 'Locality', 'TEXT', 0, None, 0),
 (7, 'Country', 'TEXT', 0, None, 0),
 (8, 'Linkedin_URL', 'TEXT', 0, None, 0),
 (9, 'Current_Employee_Estimate', 'INTEGER', 0, None, 0),
 (10, 'Total_Employee_Estimate', 'INTEGER', 0, None, 0)]

In [17]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyClassification');")

[(0, 'Category', 'TEXT', 0, None, 0),
 (1, 'Website', 'TEXT', 0, None, 0),
 (2, 'CompanyName', 'TEXT', 0, None, 0),
 (3, 'homepage_text', 'TEXT', 0, None, 0),
 (4, 'h1', 'TEXT', 0, None, 0),
 (5, 'h2', 'TEXT', 0, None, 0),
 (6, 'h3', 'TEXT', 0, None, 0),
 (7, 'nav_link_text', 'TEXT', 0, None, 0),
 (8, 'meta_keywords', 'TEXT', 0, None, 0),
 (9, 'meta_description', 'TEXT', 0, None, 0)]

## create new table to insert into

In [23]:
db.run_query(conn, '''
                    CREATE TABLE IF NOT EXISTS CompanyMerged (
                     Company_ID INTEGER PRIMARY KEY
                     ,CompanyName TEXT NOT NULL
                     ,Website TEXT NOT NULL
                     ,Industry TEXT
                     ,Size_Range TEXT
                     ,Locality TEXT
                     ,Country TEXT
                     ,Current_Employee_Estimate INTEGER
                     ,Total_Employee_Estimate INTEGER
                     ,Category TEXT NOT NULL
                     ,homepage_text TEXT NOT NULL
                     ,h1 TEXT
                     ,h2 TEXT
                     ,h3 TEXT
                     ,nav_link_text TEXT
                     ,meta_keywords TEXT
                     ,meta_description TEXT

                    )
                    '''
             )


[]

In [24]:
# identify names of tables in the database
db.run_query(conn,"SELECT name FROM sqlite_master WHERE type='table'")

[('CompanyClassification',), ('CompanyDataset',), ('CompanyMerged',)]

In [31]:
db.run_query(conn, '''
                    INSERT INTO CompanyMerged
                    SELECT 
                        cd.Company_ID
                        ,cd.CompanyName
                        ,cd.Website
                        ,cd.Industry
                        ,cd.Size_Range
                        ,cd.Locality
                        ,cd.Country
                        ,cd.Current_Employee_Estimate
                        ,cd.Total_Employee_Estimate
                        ,cc.Category
                        ,cc.homepage_text
                        ,cc.h1
                        ,cc.h2
                        ,cc.h3
                        ,cc.nav_link_text
                        ,cc.meta_keywords
                        ,cc.meta_description
                    FROM CompanyDataset as cd
                    INNER JOIN CompanyClassification as cc
                        ON cd.Website=cc.Website
                        and cc.Website is not null
                    WHERE cd.Website is not null
                    and cd.CompanyName is not null
                    and cc.homepage_text is not null
                    ''')

[]

In [33]:
# create indexes for both tables
indexes_to_create = {
    'CompanyMerged': ['Company_ID','CompanyName', 'Website','Industry','Category']
}

db.create_indexes(conn, indexes_to_create)


Created index: idx_companymerged_company_id on CompanyMerged(Company_ID)
Created index: idx_companymerged_companyname on CompanyMerged(CompanyName)
Created index: idx_companymerged_website on CompanyMerged(Website)
Created index: idx_companymerged_industry on CompanyMerged(Industry)
Created index: idx_companymerged_category on CompanyMerged(Category)


In [32]:
company_merged = pd.read_sql_query("select * from CompanyMerged",conn)
print(len(company_merged))
company_merged.head()

73124


Unnamed: 0,Company_ID,CompanyName,Website,Industry,Size_Range,Category,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description
0,99,crinan hotel,crinanhotel.com,hospitality,1 - 10,Corporate Services,01546 830261 Crinan · by Lochgilp...,Latest News#sep#Website Privacy Statement#sep#...,How we use cookies#sep#Security#sep#Let's be S...,Accommodation#sep#Activities#sep#Experience Cr...,,"Crinan hotel, country house hotel, boutique ho...",Crinan Hotel - on waterfront overlooking Loch ...
1,222,"spot on productions, llc",spotonproductionsllc.com,entertainment,1 - 10,"Media, Marketing & Sales",...,Storytelling Brought to Life.,,,,,"We're Philip Scarborough and Tom Beck, the for..."
2,535,akhand jyoti eye hospital,akhandjyoti.in,hospital & health care,11 - 50,Healthcare,Donate ...,Eradicate Curable Blindness,"12,600,000#sep#In Low-Income States Of India",Our Girls Help#sep#Donate In Specific Programs...,"why blindness,women empowerment,our impact,abo...",Akhand Jyoti - the largest eye hospital in eas...,"Akhandjyoti, akhand jyoti eye hospital, non-pr..."
3,642,lasercare eye center,dfweyes.com,medical practice,1 - 10,Healthcare,...,,,,"home,why choose us,new patient information,pat...",,Call 214.574.9600 TODAY for an appointment! Th...
4,675,compumachine inc,compumachine.com,machinery,1 - 10,Industrials,MACHINES & AUTOMATION HOME MACHINE...,,MACHINES & AUTOMATION,,"home,machines,automation,mastercam,services,ab...",,Compumachine is proud to offer CNC Machine Too...
