## SQL cleaning

In [None]:
#Use concat, replicate, len
-- Build a "<carrier> - <zero-padded registration>, <airport>" label for every
-- flight_statistics row whose delayed count is above 100.
SELECT
    CONCAT(
        carrier_code,
        ' - ',
        -- Left-pad the code with zeros up to a width of 9 characters
        REPLICATE('0', 9 - LEN(registration_code)),
        registration_code,
        ', ',
        airport_code
    ) AS registration_code
FROM flight_statistics
-- Keep only the rows with more than 100 delays
WHERE delayed > 100

In [None]:
#use concat, format
-- Build a "<carrier> - <registration>, <airport>" label, zero-padding the
-- registration code to 7 digits with FORMAT, for rows with more than 100 delays.
SELECT
    CONCAT(
        carrier_code,
        ' - ',
        -- Cast to INT so the numeric format string '0000000' can pad the value
        FORMAT(CAST(registration_code AS INT), '0000000'),
        ', ',
        airport_code
    ) AS registration_code
FROM flight_statistics
-- Keep only the rows with more than 100 delays
WHERE delayed > 100

In [None]:
#soundex() difference()
-- List the distinct statistician names that have a sound-alike variant:
-- some other row whose first name and surname produce the same SOUNDEX codes
-- while the spelling of the first name or of the surname differs.
SELECT DISTINCT
    s1.statistician_name,
    s1.statistician_surname
FROM flight_statistics AS s1
WHERE EXISTS (
    SELECT 1
    FROM flight_statistics AS s2
    -- Both parts of the name must sound the same ...
    WHERE SOUNDEX(s2.statistician_name) = SOUNDEX(s1.statistician_name)
        AND SOUNDEX(s2.statistician_surname) = SOUNDEX(s1.statistician_surname)
        -- ... but at least one of them must be written differently
        AND (
            s2.statistician_name <> s1.statistician_name
            OR s2.statistician_surname <> s1.statistician_surname
        )
)

In [None]:
-- List the distinct statistician names for which another row has a first name
-- and a surname that both score 4 with DIFFERENCE, yet is not spelled the same.
SELECT DISTINCT
    s1.statistician_name,
    s1.statistician_surname
FROM flight_statistics AS s1
INNER JOIN flight_statistics AS s2
    -- DIFFERENCE compares the SOUNDEX codes of its arguments; both the first
    -- name and the surname must reach a score of 4
    ON DIFFERENCE(s1.statistician_name, s2.statistician_name) = 4
    AND DIFFERENCE(s1.statistician_surname, s2.statistician_surname) = 4
-- Discard the pairs whose first name and surname are both spelled identically
WHERE NOT (
    s1.statistician_name = s2.statistician_name
    AND s1.statistician_surname = s2.statistician_surname
)

In [None]:
#COALESCE returns its first non-NULL argument: if the first 2 are NULL, the 3rd value is displayed
-- For every airport, report the city as its location; fall back to the state
-- when the city is NULL, and to the literal 'Unknown' when both are NULL.
SELECT
    airport_code,
    airport_name,
    COALESCE(airport_city, airport_state, 'Unknown') AS location
FROM airports

In [None]:
#row_number()
-- De-duplicate flight_statistics: number the rows inside every
-- (airport_code, carrier_code, registration_date) group and keep only the
-- row numbered 1. The row_num column is part of the output.
SELECT numbered.*
FROM (
    SELECT
        fs.*,
        ROW_NUMBER() OVER (
            PARTITION BY fs.airport_code, fs.carrier_code, fs.registration_date
            -- The ordering repeats the partition keys, so this query does not
            -- determine which row of a group receives number 1.
            ORDER BY fs.airport_code, fs.carrier_code, fs.registration_date
        ) AS row_num
    FROM flight_statistics AS fs
) AS numbered
-- Exclude duplicates
WHERE numbered.row_num = 1;

In [None]:
#Use convert to convert date
-- Flights registered in the first semester of 2014, with the registration
-- date printed in mm/dd/yyyy format (CONVERT style 101).
SELECT
    airport_code,
    carrier_code,
    canceled,
    -- NOTE(review): airport_code is selected a second time, exactly as in the
    -- original query; kept so the result shape is unchanged. Confirm it is needed.
    airport_code,
    -- Convert the registration_date to a DATE and print it in mm/dd/yyyy format
    CONVERT(VARCHAR(10), CAST(registration_date AS DATE), 101) AS registration_date
FROM flight_statistics
-- Filter on the DATE value, not on its mm/dd/yyyy text. Comparing the
-- formatted strings is lexicographic, so a value such as '03/15/2013' sorts
-- between '01/01/2014' and '06/30/2014' and would wrongly pass the filter.
-- The half-open range covers 2014-01-01 through 2014-06-30 inclusive;
-- 'yyyymmdd' literals are parsed the same way under every language setting.
WHERE CAST(registration_date AS DATE) >= '20140101'
    AND CAST(registration_date AS DATE) < '20140701'

In [None]:
#use format to convert date
-- Pilot roster with entry_date rendered as dd/MM/yyyy text.
SELECT
    pilot_code,
    pilot_name,
    pilot_surname,
    carrier_code,
    -- Cast to DATE first, then let FORMAT produce the dd/MM/yyyy string
    FORMAT(CAST(entry_date AS DATE), 'dd/MM/yyyy') AS entry_date
FROM pilots

In [None]:
# substring and charindex
-- Split the "City, State" text stored in city_state into two columns.
-- Rows that do not contain the ', ' separator yield NULL for both city and
-- state: CHARINDEX returns 0 for them, and NULLIF turns that 0 into NULL so
-- SUBSTRING is never called with a negative length (which raises an error).
SELECT
    client_name,
    client_surname,
    -- City: everything before the ', ' separator
    SUBSTRING(
        city_state,
        1,
        NULLIF(CHARINDEX(', ', city_state), 0) - 1
    ) AS city,
    -- State: everything after the separator. The separator is two characters
    -- long (comma + space), so start 2 positions past where it begins;
    -- starting at + 1 would leave a leading space in the state.
    SUBSTRING(
        city_state,
        NULLIF(CHARINDEX(', ', city_state), 0) + 2,
        LEN(city_state)
    ) AS state
FROM clients_split

In [None]:
#pivot and unpivot
#pivot turns rows to columns
#unpivot turns columns to rows
-- Total units per year_of_sale, with one column per product
-- (notebooks, pencils, crayons).
WITH sales AS (
    -- product_name_units is split at its first '-': the text before it is the
    -- product name, the text after it is cast to INT as the unit count
    SELECT
        SUBSTRING(
            product_name_units,
            1,
            CHARINDEX('-', product_name_units) - 1
        ) AS product_name,
        CAST(
            SUBSTRING(
                product_name_units,
                CHARINDEX('-', product_name_units) + 1,
                LEN(product_name_units)
            ) AS INT
        ) AS units,
        year_of_sale
    FROM paper_shop_monthly_sales
)
SELECT
    year_of_sale,
    -- The pivoted columns
    notebooks,
    pencils,
    crayons
FROM sales
-- Sum the units; the product_name values listed here become column headers
PIVOT (
    SUM(units)
    FOR product_name IN (notebooks, pencils, crayons)
) AS paper_shop_pivot

In [None]:
-- Turn the notebooks, pencils and crayons columns of pivot_sales back into
-- rows: the column name goes to product_name and its value goes to units.
SELECT *
FROM pivot_sales
UNPIVOT (
    units FOR product_name IN (notebooks, pencils, crayons)
) AS unpivot_sales