## Natural Language Inverted Index from Scratch

We will apply three optimizations: **stop-word removal**, **making everything lowercase**, and **stemming**.

Keep in mind that both the stemming rules and the stop words depend very much on the language.

In [1]:
# Load the IPython `sql` extension, which provides the %sql and %%sql magics used below.
%load_ext sql
import os
# Read the connection details from the environment (raises KeyError if DATABASE_URL is unset).
connection_string = os.environ["DATABASE_URL"]
# Open the connection. The postgresql:// scheme is prepended here, so DATABASE_URL
# presumably holds only the user:password@host/database part - TODO confirm.
%sql postgresql://$connection_string

This shows the languages for which a default Postgres installation ships precomputed stop words, stemming rules, and other text-search settings.

In [2]:
%%sql
-- List the name of every text search configuration installed in this database.
SELECT cfg.cfgname
FROM pg_ts_config AS cfg;

 * postgresql://postgres:***@localhost/pg4e
29 rows affected.


cfgname
simple
arabic
armenian
basque
catalan
danish
dutch
english
finnish
french


In [3]:
%%sql
-- Rebuild the sample corpus from scratch so this cell can be re-run safely.
-- CASCADE also drops objects that depend on docs, such as foreign keys pointing at it.
DROP TABLE IF EXISTS docs CASCADE;

-- One row per document, keyed by an auto-incrementing id.
CREATE TABLE docs (
  id  SERIAL PRIMARY KEY,
  doc TEXT
);

INSERT INTO docs (doc)
VALUES
  ('This is SQL and Python and other fun teaching stuff'),
  ('More people should learn SQL from UMSI'),
  ('UMSI also teaches Python and also SQL');

SELECT id, doc
FROM docs;

 * postgresql://postgres:***@localhost/pg4e
Done.
Done.
3 rows affected.
3 rows affected.


id,doc
1,This is SQL and Python and other fun teaching stuff
2,More people should learn SQL from UMSI
3,UMSI also teaches Python and also SQL


Break the document column into one row per word, keeping the document's primary key alongside each word

In [4]:
%%sql
-- Split each document on single spaces and emit one (id, keyword) row per word.
-- unnest turns the array of words into rows, LATERAL lets it read the current docs row.
SELECT
  D.id,
  w.keyword AS keyword
FROM docs AS D
CROSS JOIN LATERAL unnest(string_to_array(D.doc, ' ')) AS w(keyword)
ORDER BY D.id;

 * postgresql://postgres:***@localhost/pg4e
24 rows affected.


id,keyword
1,This
1,is
1,SQL
1,and
1,Python
1,and
1,other
1,fun
1,teaching
1,stuff


Lower case it all

In [5]:
%%sql
-- Same split as before, but the document is lowercased first so that
-- words differing only in case end up as the same keyword.
SELECT
  D.id,
  w.keyword AS keyword
FROM docs AS D
CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS w(keyword)
ORDER BY D.id;

 * postgresql://postgres:***@localhost/pg4e
24 rows affected.


id,keyword
1,this
1,is
1,sql
1,and
1,python
1,and
1,other
1,fun
1,teaching
1,stuff


#### Stop Words

In [6]:
%%sql
-- Rebuild the stop word list from scratch so this cell can be re-run safely.
DROP TABLE IF EXISTS stop_words;

-- word is NOT NULL as well as UNIQUE. A NULL entry has no meaning as a stop word,
-- and a single NULL would make every "keyword NOT IN (SELECT word FROM stop_words)"
-- filter evaluate to unknown, which silently rejects all keywords.
CREATE TABLE stop_words (
  word TEXT NOT NULL UNIQUE
);

-- Stop words are stored in lowercase because keywords are lowercased before comparison.
INSERT INTO stop_words (word)
VALUES ('is'), ('this'), ('and');

SELECT word
FROM stop_words;

 * postgresql://postgres:***@localhost/pg4e
Done.
Done.
3 rows affected.
3 rows affected.


word
is
this
and


Put the stop-word-free keyword list into the inverted index table (`docs_gin`)

In [7]:
%%sql
-- Rebuild the hand-made inverted index table so this cell can be re-run safely.
DROP TABLE IF EXISTS docs_gin;

-- Each row maps one keyword to one document that contains it.
-- Deleting a document also deletes its index rows (ON DELETE CASCADE).
CREATE TABLE docs_gin (
  keyword TEXT,
  doc_id  INTEGER,
  FOREIGN KEY (doc_id) REFERENCES docs (id) ON DELETE CASCADE
);

 * postgresql://postgres:***@localhost/pg4e
Done.
Done.


[]

In [8]:
%%sql 
-- Index every distinct lowercased word of every document, skipping stop words.
INSERT INTO docs_gin (doc_id, keyword)
SELECT DISTINCT
  D.id,
  s.keyword AS keyword
FROM docs AS D
CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS s(keyword)
-- NOT EXISTS is used instead of NOT IN. If stop_words.word ever held a NULL,
-- NOT IN would evaluate to unknown for every keyword and nothing would be indexed.
WHERE NOT EXISTS (
  SELECT 1
  FROM stop_words AS SW
  WHERE SW.word = s.keyword
)
ORDER BY D.id;

-- Show the resulting index (columns in table order).
SELECT keyword, doc_id
FROM docs_gin;

 * postgresql://postgres:***@localhost/pg4e
18 rows affected.
18 rows affected.


keyword,doc_id
fun,1
other,1
python,1
sql,1
stuff,1
teaching,1
from,2
learn,2
more,2
people,2


A one word query

In [9]:
%%sql
-- Look up one word through the index.
-- The search term is lowercased because the index stores lowercase keywords only.
SELECT DISTINCT D.doc
FROM docs AS D
INNER JOIN docs_gin AS G
  ON G.doc_id = D.id
WHERE G.keyword = lower('UMSI');

 * postgresql://postgres:***@localhost/pg4e
2 rows affected.


doc
More people should learn SQL from UMSI
UMSI also teaches Python and also SQL


A multi-word query

In [10]:
%%sql 
-- Look up several words at once.
-- A document matches when it contains at least one of the search words,
-- and DISTINCT keeps it from being listed once per matching word.
SELECT DISTINCT D.doc
FROM docs AS D
INNER JOIN docs_gin AS G
  ON G.doc_id = D.id
WHERE G.keyword = ANY (string_to_array(lower('Meet fun people'), ' '));

 * postgresql://postgres:***@localhost/pg4e
2 rows affected.


doc
More people should learn SQL from UMSI
This is SQL and Python and other fun teaching stuff


In [11]:
%%sql
-- Search for a stop word.
-- Stop words were left out when the index was filled, so this finds no documents.
SELECT DISTINCT D.doc
FROM docs AS D
INNER JOIN docs_gin AS G
  ON G.doc_id = D.id
WHERE G.keyword = lower('and');

 * postgresql://postgres:***@localhost/pg4e
0 rows affected.


doc


#### Stemming

In [12]:
%%sql
-- Rebuild the word-to-stem lookup table so this cell can be re-run safely.
DROP TABLE IF EXISTS docs_stem;

CREATE TABLE docs_stem (
  word TEXT,
  stem TEXT
);

-- Both inflected forms reduce to the same stem.
INSERT INTO docs_stem (word, stem)
VALUES
  ('teaching', 'teach'),
  ('teaches', 'teach');

SELECT word, stem
FROM docs_stem;

 * postgresql://postgres:***@localhost/pg4e
Done.
Done.
2 rows affected.
2 rows affected.


word,stem
teaching,teach
teaches,teach


Move the initial word extraction into a sub-query

In [13]:
%%sql
-- Same word extraction as before, now wrapped in a sub-query (X)
-- so that later steps can join other tables against its result.
SELECT
  X.id,
  X.keyword
FROM (
  -- DISTINCT keeps one row per (id, keyword) pair.
  SELECT DISTINCT
    D.id,
    w.keyword AS keyword
  FROM docs AS D
  CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS w(keyword)
) AS X
ORDER BY X.id;

 * postgresql://postgres:***@localhost/pg4e
22 rows affected.


id,keyword
1,and
1,fun
1,is
1,other
1,python
1,sql
1,stuff
1,teaching
1,this
2,from


Add the stem as a third column (it is NULL when the word has no entry in `docs_stem`)

In [14]:
%%sql
-- Attach the stem to each keyword.
-- LEFT JOIN keeps keywords without a docs_stem entry, their stem column is NULL.
SELECT
  K.id,
  K.keyword,
  S.stem
FROM (
  SELECT DISTINCT
    D.id,
    w.keyword AS keyword
  FROM docs AS D
  CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS w(keyword)
) AS K
LEFT JOIN docs_stem AS S
  ON S.word = K.keyword
ORDER BY K.id;

 * postgresql://postgres:***@localhost/pg4e
22 rows affected.


id,keyword,stem
1,fun,
1,teaching,teach
1,python,
1,is,
1,sql,
1,this,
1,stuff,
1,and,
1,other,
2,umsi,


If the stem is there, use it

In [15]:
%%sql
-- Pick the value to index for each keyword.
-- The awesome column holds the stem when one exists and the keyword otherwise.
-- keyword and stem are shown next to it for comparison.
SELECT
  K.id,
  CASE
    WHEN S.stem IS NULL THEN K.keyword
    ELSE S.stem
  END AS awesome,
  K.keyword,
  S.stem
FROM (
  -- Here each word is lowercased after the split instead of lowercasing the whole document.
  SELECT DISTINCT
    D.id,
    lower(w.keyword) AS keyword
  FROM docs AS D
  CROSS JOIN LATERAL unnest(string_to_array(D.doc, ' ')) AS w(keyword)
) AS K
LEFT JOIN docs_stem AS S
  ON S.word = K.keyword
ORDER BY K.id;

 * postgresql://postgres:***@localhost/pg4e
22 rows affected.


id,awesome,keyword,stem
1,fun,fun,
1,teach,teaching,teach
1,python,python,
1,is,is,
1,sql,sql,
1,this,this,
1,stuff,stuff,
1,and,and,
1,other,other,
2,umsi,umsi,


A more elegant way to write the previous query uses null coalescing:  
`COALESCE(stem, keyword)` returns the first non-null argument, i.e. the stem when it exists and the keyword otherwise

In [16]:
%%sql
-- Same result as the CASE version, written with COALESCE.
-- The stem is used when the LEFT JOIN found one, otherwise the keyword itself.
SELECT
  K.id,
  COALESCE(S.stem, K.keyword) AS keyword
FROM (
  SELECT DISTINCT
    D.id,
    w.keyword AS keyword
  FROM docs AS D
  CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS w(keyword)
) AS K
LEFT JOIN docs_stem AS S
  ON S.word = K.keyword
ORDER BY K.id;

 * postgresql://postgres:***@localhost/pg4e
22 rows affected.


id,keyword
1,fun
1,teach
1,python
1,is
1,sql
1,this
1,stuff
1,and
1,other
2,umsi


Insert the stemmed keywords (the stem when one exists, otherwise the word itself)

In [17]:
%%sql
-- Empty the index before refilling it. The unfiltered DELETE is intentional.
DELETE FROM docs_gin;

-- Refill the index with the stem where one exists and the word itself otherwise.
-- Stop words are not filtered out at this step.
INSERT INTO docs_gin (doc_id, keyword)
SELECT
  K.id,
  COALESCE(S.stem, K.keyword) AS keyword
FROM (
  SELECT DISTINCT
    D.id,
    w.keyword AS keyword
  FROM docs AS D
  CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS w(keyword)
) AS K
LEFT JOIN docs_stem AS S
  ON S.word = K.keyword
ORDER BY K.id;

SELECT doc_id, keyword
FROM docs_gin;

 * postgresql://postgres:***@localhost/pg4e
18 rows affected.
22 rows affected.
22 rows affected.


doc_id,keyword
1,fun
1,teach
1,python
1,is
1,sql
1,this
1,stuff
1,and
1,other
2,umsi


Now let's combine stop words and stems...

In [18]:
%%sql
-- Empty the index so it can be rebuilt. The unfiltered DELETE is intentional.
DELETE FROM docs_gin;

 * postgresql://postgres:***@localhost/pg4e
22 rows affected.


[]

In [19]:
%%sql
-- Build the final index. Stop words are dropped and every remaining word is
-- replaced by its stem when docs_stem has one.
INSERT INTO docs_gin (doc_id, keyword)
-- DISTINCT is applied after stemming. Two different words of one document can
-- share a stem, and deduplicating only the raw words would index that stem twice.
SELECT DISTINCT
  K.id,
  COALESCE(S.stem, K.keyword)
FROM (
  SELECT
    D.id,
    w.keyword AS keyword
  FROM docs AS D
  CROSS JOIN LATERAL unnest(string_to_array(lower(D.doc), ' ')) AS w(keyword)
  -- NOT EXISTS is used instead of NOT IN. If stop_words.word ever held a NULL,
  -- NOT IN would evaluate to unknown for every keyword and nothing would be indexed.
  WHERE NOT EXISTS (
    SELECT 1
    FROM stop_words AS SW
    WHERE SW.word = w.keyword
  )
) AS K
LEFT JOIN docs_stem AS S
  ON S.word = K.keyword;

-- Peek at the index. Without ORDER BY the ten rows shown are not guaranteed to be stable.
SELECT keyword, doc_id
FROM docs_gin
LIMIT 10;

 * postgresql://postgres:***@localhost/pg4e
18 rows affected.
10 rows affected.


keyword,doc_id
also,3
from,2
fun,1
learn,2
more,2
other,1
people,2
python,3
python,1
should,2


Let's do some queries. The key is when we're looking for a word, we also have to look for its stem, and we have to prefer the stem.

In [20]:
%%sql
-- Resolve a search term to the form stored in the index.
-- The stem is used when docs_stem has one, otherwise the lowercased term itself.
SELECT COALESCE(
  (SELECT S.stem FROM docs_stem AS S WHERE S.word = lower('SQL')),
  lower('SQL')
);

 * postgresql://postgres:***@localhost/pg4e
1 rows affected.


coalesce
sql


In [21]:
%%sql
-- Resolve a search term that does have a stem.
-- The scalar sub-query finds the stem, so COALESCE returns it instead of the term.
SELECT COALESCE(
  (SELECT S.stem FROM docs_stem AS S WHERE S.word = lower('teaching')),
  lower('teaching')
);

 * postgresql://postgres:***@localhost/pg4e
1 rows affected.


coalesce
teach


In [22]:
%%sql
-- Search the index for a term after resolving it to its stored form
-- (the stem when docs_stem has one, otherwise the lowercased term).
SELECT DISTINCT
  D.id,
  D.doc
FROM docs AS D
INNER JOIN docs_gin AS G
  ON G.doc_id = D.id
WHERE G.keyword = COALESCE(
  (SELECT S.stem FROM docs_stem AS S WHERE S.word = lower('SQL')),
  lower('SQL')
);

 * postgresql://postgres:***@localhost/pg4e
3 rows affected.


id,doc
1,This is SQL and Python and other fun teaching stuff
2,More people should learn SQL from UMSI
3,UMSI also teaches Python and also SQL


In [23]:
%%sql
-- Search for a term that has a stem.
-- The term is resolved to its stem first, so documents containing any word
-- that was indexed under the same stem are found as well.
SELECT DISTINCT
  D.id,
  D.doc
FROM docs AS D
INNER JOIN docs_gin AS G
  ON G.doc_id = D.id
WHERE G.keyword = COALESCE(
  (SELECT S.stem FROM docs_stem AS S WHERE S.word = lower('teaching')),
  lower('teaching')
);

 * postgresql://postgres:***@localhost/pg4e
2 rows affected.


id,doc
1,This is SQL and Python and other fun teaching stuff
3,UMSI also teaches Python and also SQL
