# SQLite Database Exploration

This application takes a SQLite database and outputs results to markdown report. 

Steps:
This notebook inspects the database to identify tables, data structure and optimize through indexing. 

In [1]:
import pandas as pd
import sys
import os

# Add the path to utils/ directory, which is one level up from the /data directory
sys.path.append(os.path.abspath(os.path.join('..', 'utils')))

# Now you can import the db_utils module
import db_utils as db


In [2]:
db_path = "C:/Users/megan/OneDrive/Documents/GitHub/sqlite_to_analysis_app/data/combined_data.db"
conn = db.connect_to_db(db_path)

In [3]:
# identify names of tables in the database
db.run_query(conn,"SELECT name FROM sqlite_master WHERE type='table'")

[('CompanyClassification',), ('CompanyDataset',)]

In [4]:
# identify if database is optimized with indexes for CompanyDataset
print(db.run_query(conn,"SELECT * FROM sqlite_master WHERE type='index' and name='CompanyDataset'"))

# identify if database is optimized with indexes for CompanyClassification
print(db.run_query(conn,"SELECT * FROM sqlite_master WHERE type='index' and name='CompanyClassification'"))

[]
[]


In [5]:
print(db.time_query(conn, "SELECT count(*) FROM CompanyDataset"))
print(db.time_query(conn, "SELECT count(*) FROM CompanyClassification"))

(1.0531792640686035, [(7173426,)])
(0.1920790672302246, [(73974,)])


In [6]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyDataset');")

[(0, 'Unnamed: 0', 'INTEGER', 0, None, 0),
 (1, 'CompanyName', 'TEXT', 0, None, 0),
 (2, 'Website', 'TEXT', 0, None, 0),
 (3, 'year founded', 'REAL', 0, None, 0),
 (4, 'industry', 'TEXT', 0, None, 0),
 (5, 'size range', 'TEXT', 0, None, 0),
 (6, 'locality', 'TEXT', 0, None, 0),
 (7, 'country', 'TEXT', 0, None, 0),
 (8, 'linkedin url', 'TEXT', 0, None, 0),
 (9, 'current employee estimate', 'INTEGER', 0, None, 0),
 (10, 'total employee estimate', 'INTEGER', 0, None, 0)]

In [7]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyClassification');")

[(0, 'Category', 'TEXT', 0, None, 0),
 (1, 'Website', 'TEXT', 0, None, 0),
 (2, 'CompanyName', 'TEXT', 0, None, 0),
 (3, 'homepage_text', 'TEXT', 0, None, 0),
 (4, 'h1', 'TEXT', 0, None, 0),
 (5, 'h2', 'TEXT', 0, None, 0),
 (6, 'h3', 'TEXT', 0, None, 0),
 (7, 'nav_link_text', 'TEXT', 0, None, 0),
 (8, 'meta_keywords', 'TEXT', 0, None, 0),
 (9, 'meta_description', 'TEXT', 0, None, 0)]

In [8]:
# read tables 
company_dataset = pd.read_sql_query("SELECT * FROM CompanyDataset",conn)
company_classification = pd.read_sql_query("SELECT * FROM CompanyClassification",conn)

In [9]:
print(company_dataset.columns)
company_dataset.head()

Index(['Unnamed: 0', 'CompanyName', 'Website', 'year founded', 'industry',
       'size range', 'locality', 'country', 'linkedin url',
       'current employee estimate', 'total employee estimate'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,CompanyName,Website,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,4425416,tata consultancy services,tcs.com,1968.0,information technology and services,10001+,"bombay, maharashtra, india",india,linkedin.com/company/tata-consultancy-services,190771,341369
2,21074,accenture,accenture.com,1989.0,information technology and services,10001+,"dublin, dublin, ireland",ireland,linkedin.com/company/accenture,190689,455768
3,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958
4,1558607,ey,ey.com,1989.0,accounting,10001+,"london, greater london, united kingdom",united kingdom,linkedin.com/company/ernstandyoung,158363,428960


In [10]:
print(company_classification.columns)
company_classification.head()

Index(['Category', 'Website', 'CompanyName', 'homepage_text', 'h1', 'h2', 'h3',
       'nav_link_text', 'meta_keywords', 'meta_description'],
      dtype='object')


Unnamed: 0,Category,Website,CompanyName,homepage_text,h1,h2,h3,nav_link_text,meta_keywords,meta_description
0,Commercial Services & Supplies,bipelectric.com,bip dipietro electric inc,Electrici...,,,,,"electricians vero beach, vero beach electrical...","Providing quality, reliable full service resid..."
1,Healthcare,eliasmedical.com,elias medical,site map | en español Elias Medical h...,Offering Bakersfield family medical care from ...,Welcome to ELIAS MEDICAL#sep#Family Medical Pr...,Get To Know Elias Medical#sep#Family Medical P...,,Elias Medical bakersfield ca family doctor med...,For the best value in Bakersfield skin care tr...
2,Commercial Services & Supplies,koopsoverheaddoors.com,koops overhead doors,Home About Us Garage Door Repair & Servi...,,Customer Reviews#sep#Welcome to Koops Overhead...,,,"Koops Overhead Doors, Albany Garage Doors, Tro...","Koops Overhead Doors specializes in the sales,..."
3,Healthcare,midtowneyes.com,midtown eyecare,918-599-0202 Type Size...,,Welcome to our practice!,,,,We would like to welcome you to Midtown Eyecar...
4,Commercial Services & Supplies,reprosecurity.co.uk,repro security ltd,Simply fill out our form below...,,Welcome to REPRO SECURITY Ltd,,,,Repro Security provide a range of tailor made ...


# Optimize tables by adding indexes

Columns chosen for indexing based on what I expect to use for filtering, joining, sorting or aggregation. 

That is columns used in:
- WHERE
- JOIN
- ORDER BY
- GROUP BY

There are no indexes set for either CompanyDataset or CompanyClassification. Also there is a column in the Company dataset that has no name. This should cause an error if it happens in future cases. In this case, the column appears to be some kind of company ID.

In [11]:
print(db.check_for_unnamed_columns(conn, 'CompanyClassification'))
print(db.check_for_unnamed_columns(conn, 'CompanyDataset'))

[]
[]


In [12]:
cursor = conn.cursor()
cursor.execute("ALTER TABLE CompanyDataset RENAME COLUMN 'Unnamed: 0' TO 'Company_ID'")


<sqlite3.Cursor at 0x286bdb0dce0>

In [56]:
# rename columns in table for development ease

def rename_columns(conn, table_name, columns_to_rename):
    """
    Create rename columns on SQLite tables based on the dictionary input.
    
    Args:
    - conn (str): connection to SQLite database file.
    - table_name (str): name of table to update
    - indexes_to_create (dict): A dictionary where keys are table names and values are lists of column names to index.
    
    Example:
    columns_to_rename = {
        'Unnamed: 0':'Company_ID",
        'size range':'size_range'
    }
    """
    cursor = conn.cursor()
    table = table_name
    
    # Iterate over the dictionary of current and updated column names
    for key,value in columns_to_rename.items():
        print(key, "-->", value)
        if ' ' in key:
        # SQL query to update the column name
            query = "ALTER TABLE {} RENAME COLUMN '{}'TO {}".format(table, key, value)
        else:
            # SQL query to update the column name
            query = "ALTER TABLE {} RENAME COLUMN {} TO {}".format(table, key, value)
            
        # Execute the query
        cursor.execute(query)
        # print(f"Updated name: {table} column changed from {key} to {value}")
    
    # Commit and close the connection
    conn.commit()

In [59]:
columns_to_rename = {
                     'country':'Country'}

rename_columns(conn, 'CompanyDataset', columns_to_rename)

country --> Country


In the cell above, I ran into an issue where the database became locked during development. I was able to resolve this easily by copying the database file to a different location and then Copy/Replace the database file in the /data directory. 

In [60]:
# check table structure and column names
db.run_query(conn,"PRAGMA table_info('CompanyDataset');")

[(0, 'Company_ID', 'INTEGER', 0, None, 0),
 (1, 'CompanyName', 'TEXT', 0, None, 0),
 (2, 'Website', 'TEXT', 0, None, 0),
 (3, 'Year_Founded', 'REAL', 0, None, 0),
 (4, 'industry', 'TEXT', 0, None, 0),
 (5, 'Size_Range', 'TEXT', 0, None, 0),
 (6, 'locality', 'TEXT', 0, None, 0),
 (7, 'Country', 'TEXT', 0, None, 0),
 (8, 'linkedin url', 'TEXT', 0, None, 0),
 (9, 'current employee estimate', 'INTEGER', 0, None, 0),
 (10, 'total employee estimate', 'INTEGER', 0, None, 0)]

In [38]:
# Example usage
indexes_to_create = {
    'CompanyDataset': ['Company_ID','CompanyName', 'Website','industry','size range', 'country']
    ,'CompanyClassification': ['Category', 'CompanyName', 'Website']
}

db.create_indexes(conn, indexes_to_create)


Created index: idx_companydataset_company_id on CompanyDataset(Company_ID)
Created index: idx_companydataset_companyname on CompanyDataset(CompanyName)
Created index: idx_companydataset_website on CompanyDataset(Website)
Created index: idx_companydataset_industry on CompanyDataset(industry)


OperationalError: near "range": syntax error

# EDA

In [28]:
df_ibm = pd.read_sql_query( "SELECT * from CompanyDataset as cd WHERE CompanyName='ibm'",conn)
df_ibm

Unnamed: 0,Company_ID,CompanyName,Website,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,1537496,ibm,,,information technology and services,501 - 1000,,,linkedin.com/company/global-value,280,931
2,6903843,ibm,,1912.0,information technology and services,11 - 50,,,linkedin.com/company/ibm-united-kingdom-ltd,7,50
3,2567154,ibm,,1914.0,information technology and services,1 - 10,"colombes, ile-de-france, france",france,linkedin.com/company/ibm-france,3,8
4,939747,ibm,mcgeemarketingconsultants.com,,computer software,1 - 10,,,linkedin.com/company/mindflow,2,6
5,5479744,ibm,ezsource.com,2003.0,computer software,1 - 10,"mevo modi`im, hamerkaz, israel",israel,linkedin.com/company/ezlegacy,1,12
6,5733310,ibm,,,,1 - 10,,,linkedin.com/company/ibmdaf,0,1
7,2519019,ibm,ibmconsult.com,2012.0,marketing and advertising,1 - 10,,,linkedin.com/company/ibmconsult-com,0,2
8,6161271,ibm,,,,1 - 10,,,linkedin.com/company/fiokware,0,1


## Find the top 10 industries with the highest average number of employees

** only considering companies founded after 2000 that more than 10 employees

Data has multiple rows per company, suggesting each row could represent a different entity of the corporation. The average # of employees per CompanyName before the filtering to top 10

In [32]:
df_company = pd.read_sql_query( '''
                    SELECT 
                        cd.industry
                        ,cd.Company_ID
                        ,cd.CompanyName
                        ,cd.'size range'
                        ,cd.'current employee estimate'
                        ,cd.'total employee estimate'
                    FROM CompanyDataset as cd
                    WHERE cd.'year founded' > 2000
                    ORDER BY cd.'CompanyName',cd.'size range' 
                           ''',conn)
df_company

Unnamed: 0,industry,Company_ID,CompanyName,size range,current employee estimate,total employee estimate
0,information technology and services,256899,,51 - 200,18,29
1,education management,3896268,! boost-your-sales !,1 - 10,5,5
2,real estate,6115557,! cb repossessions !,5001 - 10000,1879,2040
3,marketing and advertising,962323,! design e comunicação,1 - 10,3,3
4,information technology and services,6829416,"!80kb limited, uk",1 - 10,5,5
...,...,...,...,...,...,...
2566134,internet,2232733,💡 @1871chicago,201 - 500,150,308
2566135,internet,1542285,💡 myhappyidea.com,11 - 50,4,4
2566136,marketing and advertising,2601979,📲 takcam social media/digital marketing | info...,1 - 10,1,1
2566137,photography,4296555,📷 cm2b photography & design,1 - 10,0,1


In [24]:
# Find the top 10 industries with the highest average number of employees, only considering companies founded after 2000 that more than 10 employees

db.run_query(conn,'''
                    SELECT 
                        cd.industry
                        ,cd.'size range'
                    FROM CompanyDataset as cd
                    WHERE cd.'year founded' > 2000
                    ORDER BY cd.'total employee estimate' desc 
                    LIMIT 10''')

[('information technology and services', '10001+'),
 ('telecommunications', '10001+'),
 ('aviation & aerospace', '10001+'),
 ('financial services', '10001+'),
 ('retail', '10001+'),
 ('defense & space', '10001+'),
 ('hospitality', '10001+'),
 ('information technology and services', '10001+'),
 ('internet', '10001+'),
 ('hospital & health care', '10001+')]