# Part 2 -- Data Integration and Database Insertion

In [1]:
import os
import pandas as pd
import sqlite3

In [2]:
# Open the SQLite database stored under ./data and create one cursor that the
# raw-SQL cells of this notebook share.
db_path = os.path.join('data', 'combined_data.db')
con = sqlite3.connect(db_path)
cur = con.cursor()

In [3]:
# List every object (tables, indexes, views, ...) recorded in the database
# catalogue so the starting schema is visible before anything is changed.
# The cursor returned by execute() is iterated directly: the rows are only
# needed once, so there is no reason to bind the result to an unused name or
# to build the full list with fetchall() first.
for schema_row in cur.execute("""
    SELECT * FROM sqlite_master
"""):
    print(schema_row)

('table', 'CompanyClassification', 'CompanyClassification', 2, 'CREATE TABLE "CompanyClassification" (\n"Category" TEXT,\n  "Website" TEXT,\n  "CompanyName" TEXT,\n  "homepage_text" TEXT,\n  "h1" TEXT,\n  "h2" TEXT,\n  "h3" TEXT,\n  "nav_link_text" TEXT,\n  "meta_keywords" TEXT,\n  "meta_description" TEXT\n)')
('table', 'CompanyDataset', 'CompanyDataset', 110723, 'CREATE TABLE "CompanyDataset" (\n"Unnamed: 0" INTEGER,\n  "CompanyName" TEXT,\n  "Website" TEXT,\n  "year founded" REAL,\n  "industry" TEXT,\n  "size range" TEXT,\n  "locality" TEXT,\n  "country" TEXT,\n  "linkedin url" TEXT,\n  "current employee estimate" INTEGER,\n  "total employee estimate" INTEGER\n)')
('index', 'employees', 'CompanyDataset', 416224, 'CREATE INDEX employees ON CompanyDataset (`current employee estimate`)')
('index', 'countries', 'CompanyDataset', 409796, 'CREATE INDEX countries ON CompanyDataset (`country`)')
('table', 'Company', 'Company', 397909, 'CREATE TABLE "Company" (\n"Unnamed: 0" INTEGER,\n  "Comp

## Method 1: Create a View to merge Tables

In [4]:
# Company can be left over from an earlier run either as a table (written by
# Method 2) or as a view (created by Method 1). SQLite refuses to run
# DROP TABLE against a view, even with IF EXISTS, so a plain
# "DROP TABLE IF EXISTS Company" aborts the notebook when the leftover object
# is a view. Look the object up in the catalogue and only drop it here when it
# really is a table.
company_entry = cur.execute("""
    SELECT type FROM sqlite_master
    WHERE type IN ('table', 'view') AND name = 'Company' COLLATE NOCASE
""").fetchone()

if company_entry is not None and company_entry[0] == 'table':
    cur.execute("""
        DROP TABLE Company;
    """)

con.commit()

In [5]:
# Remove a view named Company if one exists, so that the CREATE VIEW statement
# executed next does not collide with it.
drop_company_view_sql = """
    DROP VIEW IF EXISTS Company;
"""
cur.execute(drop_company_view_sql)

con.commit()

In [6]:
# Define Company as a view that left-joins the classification data onto every
# CompanyDataset row.
#
# The join uses USING (CompanyName, Website) rather than
# ON ... CompanyName = ... CompanyName for two reasons:
#   * both tables contain CompanyName and Website, and with USING the
#     "SELECT *" exposes each of those columns once instead of twice;
#   * matching on both shared columns is the same key set that the pandas
#     merge in Method 2 uses, so the two methods pair rows the same way.
# Note that, unlike Method 2, the view keeps CompanyDataset rows whose key
# columns are NULL; they simply get no classification data.
cur.execute("""
    CREATE VIEW Company
    AS SELECT * FROM CompanyDataset
    LEFT JOIN CompanyClassification
    USING (CompanyName, Website)
""")

con.commit()

## Method 2: Use Pandas to merge Tables as DataFrames

In [7]:
# Read the whole CompanyDataset table into a DataFrame, then report its row
# count and the number of missing values in each column.
dataset_query = "SELECT * FROM CompanyDataset"
CompanyDataset = pd.read_sql_query(dataset_query, con)
dataset_missing_counts = CompanyDataset.isna().sum()

print('Total rows in the CompanyDataset table:', CompanyDataset.shape[0], end='\n\n')
print('Rows with missing values:')

display(dataset_missing_counts)

Total rows in the CompanyDataset table: 7173426

Rows with missing values:


Unnamed: 0                         0
CompanyName                        3
Website                      1650621
year founded                 3606980
industry                      290003
size range                         0
locality                     2508825
country                      2349207
linkedin url                       0
current employee estimate          0
total employee estimate            0
dtype: int64

In [8]:
# Read the whole CompanyClassification table into a DataFrame, then report its
# row count and the number of missing values in each column.
classification_query = "SELECT * FROM CompanyClassification"
CompanyClassification = pd.read_sql_query(classification_query, con)
classification_missing_counts = CompanyClassification.isna().sum()

print('Total rows in the CompanyClassification table:', CompanyClassification.shape[0], end='\n\n')
print('Rows with missing values:')

display(classification_missing_counts)

Total rows in the CompanyClassification table: 73974

Rows with missing values:


Category                0
Website                 0
CompanyName             0
homepage_text         669
h1                  27321
h2                  20762
h3                  29315
nav_link_text       25924
meta_keywords       50302
meta_description     7088
dtype: int64

In [9]:
CompanyDataset_columns = CompanyDataset.columns
CompanyClassification_columns = CompanyClassification.columns

# Columns present in both frames; these are used as the join keys below.
# The list is built by walking CompanyDataset's columns in order instead of
# going through set.intersection(): a set has no defined order, so the key
# order could differ from one kernel session to the next.
common_columns = [
    column
    for column in CompanyDataset_columns
    if column in CompanyClassification_columns
]

In [10]:
# In both frames, keep only the rows that have a value in every shared column
# (the columns the merge below joins on).
dataset_keys_present = CompanyDataset[common_columns].notna().all(axis=1)
classification_keys_present = CompanyClassification[common_columns].notna().all(axis=1)

CompanyDataset = CompanyDataset[dataset_keys_present]
CompanyClassification = CompanyClassification[classification_keys_present]

In [11]:
# Left-join the classification data onto the company data using the shared
# columns as keys: every CompanyDataset row is kept, and rows without a match
# get missing values in the classification columns.
Company = CompanyDataset.merge(CompanyClassification, how='left', on=common_columns)
merged_missing_counts = Company.isna().sum()

print('Total rows in the merged "Company" table:', Company.shape[0], end='\n\n')
print('Rows with missing values:')

display(merged_missing_counts)

Total rows in the merged "Company" table: 5522803

Rows with missing values:


Unnamed: 0                         0
CompanyName                        0
Website                            0
year founded                 2175920
industry                      112775
size range                         0
locality                     1693997
country                      1566173
linkedin url                       0
current employee estimate          0
total employee estimate            0
Category                     5449355
homepage_text                5450023
h1                           5476420
h2                           5469988
h3                           5478389
nav_link_text                5475006
meta_keywords                5499219
meta_description             5456364
dtype: int64

In [12]:
# Remove the Company view (if present) so that the name is free for the table
# that is written from the merged DataFrame further down.
drop_view_statement = """
    DROP VIEW IF EXISTS Company;
"""
cur.execute(drop_view_statement)

con.commit()

In [13]:
# Remove a Company table (if present) before the merged DataFrame is written
# back to the database under that name.
drop_table_statement = """
    DROP TABLE IF EXISTS Company;
"""
cur.execute(drop_table_statement)

con.commit()

In [14]:
# Persist the merged DataFrame as the Company table, replacing any existing
# table of that name and leaving out the DataFrame index.
# The former schema='Tables' argument was removed: pandas ignores `schema` when
# it is handed a plain sqlite3 connection, so it had no effect and only
# suggested that the table lives in a schema called "Tables".
Company.to_sql(name='Company', con=con, if_exists='replace', index=False)

5522803

## Verify result

In [15]:
# Spot-check the new Company table: list ten distinct companies whose industry
# contains "technology", that have fewer than 100 current employees and no
# homepage text.
#
# Changes from the previous version of this query:
#   * the LIKE pattern is a single-quoted string literal. Double quotes denote
#     identifiers in SQL; SQLite only accepts them as strings through a legacy
#     fallback that can be disabled.
#   * TRIM(homepage_text, ' ') IS NULL was simplified to homepage_text IS NULL.
#     TRIM only yields NULL for NULL input, so the two conditions select
#     exactly the same rows.
cur.execute("""
    SELECT DISTINCT CompanyName, industry, `current employee estimate`, homepage_text
    FROM Company
    WHERE industry LIKE '%Technology%' AND
          `current employee estimate` < 100 AND
          homepage_text IS NULL
    ORDER BY CompanyName
    LIMIT 10;
""")

for c in cur.fetchall():
    print(c)

('!bien ict', 'information technology and services', 3, None)
('!qcl solutions', 'information technology and services', 12, None)
('"40-02" search advertisement agency', 'information technology and services', 8, None)
('"abideweb technologies" llc', 'information technology and services', 0, None)
('"acorn systems"\u200b s.r.o.', 'information technology and services', 1, None)
('"apply" intelligent technologies', 'information technology and services', 10, None)
('"arte" azerbaijan gps vehicle tracking systems', 'information technology and services', 0, None)
('"ato id", uab', 'nanotechnology', 1, None)
('"atom"\u200b web design company', 'information technology and services', 0, None)
('"august ir ko" uab', 'biotechnology', 4, None)


## Finalize

In [16]:
# Release the database handles: the cursor first, then the connection.
for db_handle in (cur, con):
    db_handle.close()