## Data_anomalies_1NF


In [1]:
# Load the `sql` IPython extension. It provides the %sql / %%sql magics that the
# cells below use to run SQL statements directly from this notebook.
%load_ext sql

In [2]:

# Connect to the SoftDevEmployees SQLite database stored on your local machine.
# The connection string uses a relative path, so SoftDevEmployees.db must be saved
# in the same folder as this notebook.
# NOTE(review): the "no such table" error recorded further down was raised after a
# successful connection, which suggests the .db file was not found at this path
# on that run - confirm the file location before running the remaining cells.
%sql sqlite:///SoftDevEmployees.db

'Connected: @SoftDevEmployees.db'

In [None]:
%%sql
-- Scan the raw table for 1NF violations. A comma inside Role or Department
-- means that a single cell is storing a list of values instead of one value.
SELECT
    e.*
FROM
    employees AS e
WHERE
    e.Department LIKE '%,%'   -- Department cell holds a comma-separated list
    OR e.Role LIKE '%,%'      -- Role cell holds a comma-separated list

In [None]:
%%sql

-- Remove any earlier copy of the table so that this cell can be re-run safely.
DROP TABLE IF EXISTS Employees_1NF;

-- Target table for the First Normal Form data, with one Role and one Department per row.
-- The composite key lets one person appear on several rows, as long as each row
-- carries a different Role / Department combination.
CREATE TABLE Employees_1NF (
    Name            VARCHAR NOT NULL,
    Surname         VARCHAR NOT NULL,
    Role            VARCHAR NOT NULL,
    Department      VARCHAR NOT NULL,
    Title           VARCHAR,
    OccupationBand  VARCHAR,
    Salary          REAL,
    PRIMARY KEY (Name, Surname, Role, Department)
);

In [5]:
%%sql
-- Check the name split and the Title clean-up on a sample of five rows.
-- The raw FullName stays in the output so each derived value can be compared with its source.
WITH located AS (
    SELECT
        FullName,
        Title,
        INSTR(FullName, ',') AS CommaPos    -- 1-based position of the first comma in FullName
    FROM
        Employees
)
SELECT
    FullName,
    TRIM(SUBSTR(FullName, 1, CommaPos - 1)) AS Name,                   -- text before the comma
    TRIM(SUBSTR(FullName, CommaPos + 1)) AS Surname,                   -- text after the comma
    UPPER(SUBSTR(Title, 1, 1)) || LOWER(SUBSTR(Title, 2)) AS Title     -- capital first letter, lower-case remainder
FROM
    located
LIMIT 5;

 * sqlite:///SoftDevEmployees.db
(sqlite3.OperationalError) no such table: Employees
[SQL: SELECT 
    FullName,
    TRIM(SUBSTR(FullName,1,INSTR(FullName,',')-1)) AS Name, --Get substring before comma
    TRIM(SUBSTR(FullName,INSTR(FullName,',')+1)) AS Surname, --Get substring after comma
    UPPER(SUBSTR(Title,1,1)) ||LOWER(SUBSTR(Title,2)) AS Title --Standardising all Titles to start with a capital letter
FROM 
    Employees
LIMIT 5;]
(Background on this error at: http://sqlalche.me/e/14/e3q8)


In [None]:
%%sql
-- List every row that still breaks 1NF, with the name and title columns already cleaned.
-- Role and Department are returned exactly as stored so the comma-separated lists stay visible.
SELECT
    TRIM(SUBSTR(e.FullName, 1, INSTR(e.FullName, ',') - 1))   AS Name,            -- part of FullName before the comma
    TRIM(SUBSTR(e.FullName, INSTR(e.FullName, ',') + 1))      AS Surname,         -- part of FullName after the comma
    UPPER(SUBSTR(e.Title, 1, 1)) || LOWER(SUBSTR(e.Title, 2)) AS Title,           -- capital first letter, lower-case remainder
    e.Role                                                    AS Role,
    e.OccupationBand                                          AS OccupationBand,
    e.Salary                                                  AS Salary,
    e.Department                                              AS Department
FROM
    Employees AS e
WHERE
    e.Role LIKE '%,%'
    OR e.Department LIKE '%,%'    -- keep only the rows with a multi-valued Role or Department

Now that we have fixed the name columns, we can move on to the `Role` and `Department` columns. Here we get a view of all the rows in which the `Role` or `Department` column contains **non-atomic** values. These are the rows we want to focus on.

In [None]:
%%sql
-- First half of the split. For every non-atomic row, keep only the value that
-- appears before the first comma in Role and in Department.
WITH multi AS (
    SELECT
        TRIM(SUBSTR(FullName, 1, INSTR(FullName, ',') - 1))     AS Name,            -- FullName before the comma
        TRIM(SUBSTR(FullName, INSTR(FullName, ',') + 1))        AS Surname,         -- FullName after the comma
        UPPER(SUBSTR(Title, 1, 1)) || LOWER(SUBSTR(Title, 2))   AS Title,           -- capital first letter, lower-case remainder
        Role                                                    AS RawRole,         -- Role exactly as stored
        TRIM(SUBSTR(Role, 1, INSTR(Role, ',') - 1))             AS RoleHead,        -- text before the first comma
        OccupationBand,
        Salary,
        Department                                              AS RawDepartment,   -- Department exactly as stored
        TRIM(SUBSTR(Department, 1, INSTR(Department, ',') - 1)) AS DepartmentHead   -- text before the first comma
    FROM
        Employees
    WHERE
        Role LIKE '%,%' OR Department LIKE '%,%'    -- only rows with a multi-valued Role or Department
)
SELECT
    Name,
    Surname,
    Title,
    -- An empty head means nothing precedes a comma (INSTR returns 0 when the cell
    -- has no comma), so the stored value is returned unchanged.
    CASE RoleHead
        WHEN '' THEN RawRole
        ELSE RoleHead
    END AS Role,
    OccupationBand,
    Salary,
    -- Same rule for Department.
    CASE DepartmentHead
        WHEN '' THEN RawDepartment
        ELSE DepartmentHead
    END AS Department
FROM
    multi

In [None]:
%%sql
-- Second half of the split. For every non-atomic row, keep only the text that
-- follows the first comma in Role and in Department.
-- NOTE(review): only the first comma is used as the split point, so a cell holding
-- three or more values would still leave a comma-separated remainder in this result.
WITH multi AS (
    SELECT
        TRIM(SUBSTR(FullName, 1, INSTR(FullName, ',') - 1))     AS Name,            -- FullName before the comma
        TRIM(SUBSTR(FullName, INSTR(FullName, ',') + 1))        AS Surname,         -- FullName after the comma
        UPPER(SUBSTR(Title, 1, 1)) || LOWER(SUBSTR(Title, 2))   AS Title,           -- capital first letter, lower-case remainder
        Role                                                    AS RawRole,         -- Role exactly as stored
        TRIM(SUBSTR(Role, 1, INSTR(Role, ',') - 1))             AS RoleHead,        -- text before the first comma
        TRIM(SUBSTR(Role, INSTR(Role, ',') + 1))                AS RoleTail,        -- text after the first comma
        OccupationBand,
        Salary,
        Department                                              AS RawDepartment,   -- Department exactly as stored
        TRIM(SUBSTR(Department, 1, INSTR(Department, ',') - 1)) AS DepartmentHead,  -- text before the first comma
        TRIM(SUBSTR(Department, INSTR(Department, ',') + 1))    AS DepartmentTail   -- text after the first comma
    FROM
        Employees
    WHERE
        Role LIKE '%,%' OR Department LIKE '%,%'    -- only rows with a multi-valued Role or Department
)
SELECT
    Name,
    Surname,
    Title,
    -- An empty head means nothing precedes a comma (INSTR returns 0 when the cell
    -- has no comma), so the stored value is returned unchanged. Otherwise the
    -- text after the comma is used.
    CASE RoleHead
        WHEN '' THEN RawRole
        ELSE RoleTail
    END AS Role,
    OccupationBand,
    Salary,
    -- Same rule for Department.
    CASE DepartmentHead
        WHEN '' THEN RawDepartment
        ELSE DepartmentTail
    END AS Department
FROM
    multi

In [None]:
%%sql

-- Rows that already satisfy 1NF, meaning neither Role nor Department contains a comma.
-- Only the name split and the Title clean-up are applied. Role and Department pass through as stored.
SELECT
    TRIM(SUBSTR(e.FullName, 1, INSTR(e.FullName, ',') - 1))   AS Name,            -- part of FullName before the comma
    TRIM(SUBSTR(e.FullName, INSTR(e.FullName, ',') + 1))      AS Surname,         -- part of FullName after the comma
    UPPER(SUBSTR(e.Title, 1, 1)) || LOWER(SUBSTR(e.Title, 2)) AS Title,           -- capital first letter, lower-case remainder
    e.Role                                                    AS Role,
    e.OccupationBand                                          AS OccupationBand,
    e.Salary                                                  AS Salary,
    e.Department                                              AS Department
FROM
    Employees AS e
WHERE
    NOT (e.Role LIKE '%,%' OR e.Department LIKE '%,%')    -- exclude every row that holds a comma-separated list

In [None]:
%%sql

/* Preview of the complete 1NF result, built as the UNION of three row sets.
   The shared clean-up (name split, Title normalisation, Role / Department head and
   tail around the first comma) is computed once in the `prepared` CTE and reused
   by each set. UNION also removes any duplicate rows across the sets.
   NOTE(review): only the first comma is used as the split point, so a cell holding
   three or more values would still leave a comma-separated remainder in SET #2. */
WITH prepared AS (
    SELECT
        TRIM(SUBSTR(FullName, 1, INSTR(FullName, ',') - 1))     AS Name,            -- FullName before the comma
        TRIM(SUBSTR(FullName, INSTR(FullName, ',') + 1))        AS Surname,         -- FullName after the comma
        UPPER(SUBSTR(Title, 1, 1)) || LOWER(SUBSTR(Title, 2))   AS Title,           -- capital first letter, lower-case remainder
        Role                                                    AS RawRole,         -- Role exactly as stored
        TRIM(SUBSTR(Role, 1, INSTR(Role, ',') - 1))             AS RoleHead,        -- text before the first comma
        TRIM(SUBSTR(Role, INSTR(Role, ',') + 1))                AS RoleTail,        -- text after the first comma
        OccupationBand,
        Salary,
        Department                                              AS RawDepartment,   -- Department exactly as stored
        TRIM(SUBSTR(Department, 1, INSTR(Department, ',') - 1)) AS DepartmentHead,  -- text before the first comma
        TRIM(SUBSTR(Department, INSTR(Department, ',') + 1))    AS DepartmentTail,  -- text after the first comma
        (Role LIKE '%,%' OR Department LIKE '%,%')              AS HasList          -- true when either cell is non-atomic
    FROM
        Employees
)

/* SET #1 -------------------------------------------------------------------------------------
   Non-atomic rows, keeping the FIRST value of Role / Department.
   An empty head means the cell has no comma, so the stored value is kept as it is.
---------------------------------------------------------------------------------------------- */
SELECT
    Name,
    Surname,
    Title,
    CASE RoleHead
        WHEN '' THEN RawRole
        ELSE RoleHead
    END AS Role,
    OccupationBand,
    Salary,
    CASE DepartmentHead
        WHEN '' THEN RawDepartment
        ELSE DepartmentHead
    END AS Department
FROM
    prepared
WHERE
    HasList

UNION

/* SET #2 -------------------------------------------------------------------------------------
   Non-atomic rows, keeping the text AFTER the first comma of Role / Department.
   An empty head means the cell has no comma, so the stored value is kept as it is.
---------------------------------------------------------------------------------------------- */
SELECT
    Name,
    Surname,
    Title,
    CASE RoleHead
        WHEN '' THEN RawRole
        ELSE RoleTail
    END AS Role,
    OccupationBand,
    Salary,
    CASE DepartmentHead
        WHEN '' THEN RawDepartment
        ELSE DepartmentTail
    END AS Department
FROM
    prepared
WHERE
    HasList

UNION

/* SET #3 -------------------------------------------------------------------------------------
   Rows whose Role and Department are both atomic already. They pass through unchanged.
---------------------------------------------------------------------------------------------- */
SELECT
    Name,
    Surname,
    Title,
    RawRole AS Role,
    OccupationBand,
    Salary,
    RawDepartment AS Department
FROM
    prepared
WHERE
    NOT HasList

In [None]:
%%sql
-- Populate Employees_1NF with the First Normal Form rows (UNION of the three sets below).

-- Empty the target table first so that this cell can be re-run from a clean state.
DELETE FROM Employees_1NF;

INSERT INTO Employees_1NF (Name, Surname, Title, Role, OccupationBand, Salary, Department)

/* The shared clean-up (name split, Title normalisation, Role / Department head and
   tail around the first comma) is computed once in the `prepared` CTE and reused
   by each set. UNION also removes any duplicate rows across the sets.
   NOTE(review): only the first comma is used as the split point, so a cell holding
   three or more values would still leave a comma-separated remainder in SET #2. */
WITH prepared AS (
    SELECT
        TRIM(SUBSTR(FullName, 1, INSTR(FullName, ',') - 1))     AS Name,            -- FullName before the comma
        TRIM(SUBSTR(FullName, INSTR(FullName, ',') + 1))        AS Surname,         -- FullName after the comma
        UPPER(SUBSTR(Title, 1, 1)) || LOWER(SUBSTR(Title, 2))   AS Title,           -- capital first letter, lower-case remainder
        Role                                                    AS RawRole,         -- Role exactly as stored
        TRIM(SUBSTR(Role, 1, INSTR(Role, ',') - 1))             AS RoleHead,        -- text before the first comma
        TRIM(SUBSTR(Role, INSTR(Role, ',') + 1))                AS RoleTail,        -- text after the first comma
        OccupationBand,
        Salary,
        Department                                              AS RawDepartment,   -- Department exactly as stored
        TRIM(SUBSTR(Department, 1, INSTR(Department, ',') - 1)) AS DepartmentHead,  -- text before the first comma
        TRIM(SUBSTR(Department, INSTR(Department, ',') + 1))    AS DepartmentTail,  -- text after the first comma
        (Role LIKE '%,%' OR Department LIKE '%,%')              AS HasList          -- true when either cell is non-atomic
    FROM
        Employees
)

/* SET #1 -------------------------------------------------------------------------------------
   Non-atomic rows, keeping the FIRST value of Role / Department.
   An empty head means the cell has no comma, so the stored value is kept as it is.
---------------------------------------------------------------------------------------------- */
SELECT
    Name,
    Surname,
    Title,
    CASE RoleHead
        WHEN '' THEN RawRole
        ELSE RoleHead
    END AS Role,
    OccupationBand,
    Salary,
    CASE DepartmentHead
        WHEN '' THEN RawDepartment
        ELSE DepartmentHead
    END AS Department
FROM
    prepared
WHERE
    HasList

UNION

/* SET #2 -------------------------------------------------------------------------------------
   Non-atomic rows, keeping the text AFTER the first comma of Role / Department.
   An empty head means the cell has no comma, so the stored value is kept as it is.
---------------------------------------------------------------------------------------------- */
SELECT
    Name,
    Surname,
    Title,
    CASE RoleHead
        WHEN '' THEN RawRole
        ELSE RoleTail
    END AS Role,
    OccupationBand,
    Salary,
    CASE DepartmentHead
        WHEN '' THEN RawDepartment
        ELSE DepartmentTail
    END AS Department
FROM
    prepared
WHERE
    HasList

UNION

/* SET #3 -------------------------------------------------------------------------------------
   Rows whose Role and Department are both atomic already. They pass through unchanged.
---------------------------------------------------------------------------------------------- */
SELECT
    Name,
    Surname,
    Title,
    RawRole AS Role,
    OccupationBand,
    Salary,
    RawDepartment AS Department
FROM
    prepared
WHERE
    NOT HasList