# Collecting Entities and Creating Entity Ruler

In [None]:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import ssl
import utils
# Replace the stdlib's default HTTPS context factory with the unverified one,
# so HTTPS connections that use the default context skip certificate checks.
# NOTE(review): this relies on private `ssl` attributes and turns off TLS
# verification for the whole process — presumably a workaround for local
# certificate errors; confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

# Collecting Entities

## Programming Languages (Wikipedia)

[List of programming languages](https://en.wikipedia.org/wiki/List_of_programming_languages)

In [None]:
# Helper

def text_cleaner(text: str) -> str:
    """Return ``text`` with non-ASCII characters removed and outer whitespace trimmed.

    Characters that cannot be encoded as ASCII are dropped (not replaced),
    then leading and trailing whitespace is stripped from what remains.
    """
    return text.encode("ascii", errors="ignore").decode().strip()

In [None]:
# Scrape the entries of Wikipedia's "List of programming languages" page and
# store them as single-column rows in data/languages.csv.
url = 'https://en.wikipedia.org/wiki/List_of_programming_languages'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing the error page and
# overwriting the CSV with an empty or bogus list.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# One row per <li> that sits directly in a <ul> child of a `.div-col` element.
data = [[lang.text.strip()] for lang in soup.select('.div-col > ul > li')]
utils.save_csv(path='data/languages.csv', columns=['name'], rows=data)
# Preview the first rows as the cell output.
data[0:5]

## Platforms (Cloud Service Providers From Wikipedia)

In [None]:
def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with BeautifulSoup.

    Raises a ``requests`` exception on timeout or on an HTTP error status, so
    an error page is never mistaken for real content.
    """
    page = requests.get(page_url, timeout=30)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')


def _category_rows(page_soup):
    """Return one ``[name]`` row per list item of the page's category listing.

    The listing is the first element with class ``mw-category-generated``;
    a page without that element yields an empty list instead of crashing.
    """
    listing = page_soup.find(attrs={'class': 'mw-category-generated'})
    if listing is None:
        return []
    return [[text_cleaner(item.text)] for item in listing.select('ul > li')]


# Collect the entries of Wikipedia's "Cloud platforms" category page, then the
# entries of every page linked from its subcategories section.
base_url = 'https://en.wikipedia.org'
url = f'{base_url}/wiki/Category:Cloud_platforms'
soup = _fetch_soup(url)
data = _category_rows(soup)

sub_categories = soup.find(attrs={'id': 'mw-subcategories'})
# The subcategories section may be absent; treat that as "nothing to follow".
anchors = sub_categories.find_all('a') if sub_categories is not None else []
for anchor in anchors:
    href = anchor.get("href")
    if not href:
        # An anchor without a target would otherwise produce a ".../None" URL.
        continue
    data.extend(_category_rows(_fetch_soup(f'{base_url}{href}')))

utils.save_csv(path='data/platforms.csv', columns=['name'], rows=data)
# Preview the first rows as the cell output.
data[0:5]

## Databases

* [Data Store](https://en.wikipedia.org/wiki/Data_store)
    * [Relational Database](https://en.wikipedia.org/wiki/Relational_database)
    * [Graph Database](https://en.wikipedia.org/wiki/Graph_database)
    * [Wide Column Store](https://en.wikipedia.org/wiki/Wide-column_store)
    * [NoSQL](https://en.wikipedia.org/wiki/NoSQL)
    * [Key-Value Database](https://en.wikipedia.org/wiki/Key%E2%80%93value_database)

In [None]:
# Collect database names from several Wikipedia articles into one list of
# single-column rows, then store them in data/databases.csv.
data = []

# Relational Databases
url = 'https://en.wikipedia.org/wiki/Relational_database#Relational_model'
# Timeout + status check: never hang, never parse an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# NOTE: this positional selector depends on the article's current layout and
# stops matching as soon as elements are added or removed before the list.
container = soup.select_one("#mw-content-text > div.mw-parser-output > ol:nth-child(89)")
if container is None:
    raise RuntimeError(f'Relational database list not found at {url}; the page layout may have changed')
for x in container.find_all('li'):
    data.append([text_cleaner(x.text)])

# Graph Databases
df = pd.read_html("https://en.wikipedia.org/wiki/Graph_database", attrs={"class": "wikitable"})[0]
# Drop empty cells (NaN) and force strings so text_cleaner always gets a str.
for x in df['Name'].dropna().astype(str).tolist():
    data.append([text_cleaner(x)])

# Wide Column Store
url = 'https://en.wikipedia.org/wiki/Wide-column_store'
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
container = soup.select_one("#mw-content-text > div.mw-parser-output > ul")
if container is None:
    raise RuntimeError(f'Wide-column store list not found at {url}; the page layout may have changed')
for x in container.find_all('li'):
    data.append([text_cleaner(x.text)])

# NoSQL
tables = pd.read_html("https://en.wikipedia.org/wiki/NoSQL")
# The first table packs several names into one comma-separated cell.
for x in tables[0]['Notable examples of this type'].dropna().tolist():
    for y in str(x).split(','):
        name = text_cleaner(y)
        if name:  # skip empty fragments left by stray commas
            data.append([name])

for x in tables[1]['Name'].dropna().astype(str).tolist():
    data.append([text_cleaner(x)])

for x in tables[3]['Database'].dropna().astype(str).tolist():
    data.append([text_cleaner(x)])

# Key-Value Databases
df = pd.read_html("https://en.wikipedia.org/wiki/Key%E2%80%93value_database")[1]
for x in df['Provider'].dropna().astype(str).tolist():
    data.append([text_cleaner(x)])


utils.save_csv(path='data/databases.csv', columns=['name'], rows=data)
# Preview the first rows as the cell output.
data[0:5]

## Frameworks & Tools

The data was entered manually from various tech blogs such as medium.com.

# Creating Entity Ruler

In [5]:
# Load the entity lists; each CSV is read through its `name` column.
languages = pd.read_csv('data/languages.csv')
platforms = pd.read_csv('data/platforms.csv')
databases = pd.read_csv('data/databases.csv')
frameworks_tools = pd.read_csv('data/frameworks_tools_etc.csv')

# Programming languages use the raw name as the pattern.
patterns = [
    {"label": "PROG_LANG", "pattern": name, "id": "SKILLS"}
    for name in languages.name.tolist()
]

# Every other category uses a token pattern: one lower-cased `lower` entry per
# whitespace-separated word of the name. Order matches the pattern file layout:
# databases, then platforms, then frameworks/tools.
token_sources = (
    ("DB", databases),
    ("PLATFORM", platforms),
    ("FRAMEWORKS", frameworks_tools),
)
for label, frame in token_sources:
    for name in frame.name.tolist():
        token_specs = [{"lower": word.lower()} for word in str(name).split()]
        patterns.append({"label": label, "pattern": token_specs, "id": "SKILLS"})

# Insert the ruler ahead of the model's "ner" component, register the
# patterns, and write the ruler out to data/patterns.jsonl.
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)
ruler.to_disk("data/patterns.jsonl")