In [12]:
import pandas as pd
import spacy
import time
import gc
import psutil
import ast

from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor


In [10]:
RAW_DATA_PATH = '../data/raw/'
PR_DATA_PATH = '../data/processed/'
data_raw = 'requirements_data_raw.csv'
cleaned_data_raw = 'requirements_data_raw_cleaned.csv'


In [3]:
data = pd.read_csv(f'{RAW_DATA_PATH}{data_raw}')

data['requirements'] = data['requirements'].replace(r'^\s*$', pd.NA, regex=True)
data = data.dropna(subset=['requirements'])

data.to_csv(f'{RAW_DATA_PATH}{cleaned_data_raw}', index=False)

In [4]:
nlp = spacy.load("en_core_web_lg")
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
nlp_used = skill_extractor.nlp 
print(nlp_used.pipe_names)
print("Vectors:", nlp_used.vocab.vectors_length) 

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Vectors: 300


In [7]:
def extract_hard_skills(text):
    time.sleep(0.2)
    print(f"Memory usage: {psutil.virtual_memory().percent}%")
    print(f"Processing text: {text[:100]}..." if len(text) > 100 else f"Processing text: {text}")
    
    if not isinstance(text, str) or text.strip() == '':
        print("  → Empty or invalid text, returning []")
        return []
    
    try:
        annotations = skill_extractor.annotate(text)
        print(f"  → Found annotations: keys={list(annotations.keys())}, results keys={list(annotations['results'].keys())}")

        ngram_scored = annotations['results']['ngram_scored']
        print(f"  → ngram_scored count: {len(ngram_scored)}")

        skills = [item['doc_node_value'] for item in ngram_scored]
        print(f"  → Extracted skills: {skills}")

        return skills

    except KeyError as e:
        print(f"KeyError: missing key {e}")
        return []
    except Exception as e:
        print(f"Unexpected error: {type(e).__name__}: {e}")
        return []

batch_size = 5
results = []
cleaned_data = pd.read_csv(f'{RAW_DATA_PATH}{cleaned_data_raw}')

for i in range(0, len(cleaned_data), batch_size):
    print(f"Processing batch {i}–{i+batch_size}...")
    
    batch = cleaned_data.iloc[i:i+batch_size].copy()
    batch['requirements'] = batch['requirements'].apply(extract_hard_skills)
    
    results.append(batch)
    
    time.sleep(1)
    gc.collect()   

final_data = pd.concat(results, ignore_index=True)
final_data.to_csv(f'{RAW_DATA_PATH}requirements_data_processed.csv', index=False)
print('Done!')

Processing batch 0–5...
Memory usage: 32.8%
Processing text: Data Science
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 0
  → Extracted skills: []
Memory usage: 32.8%
Processing text: Data Science
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 0
  → Extracted skills: []
Memory usage: 32.9%
Processing text: Python, RAG, LLM, Deep Learning
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 1
  → Extracted skills: ['python']
Memory usage: 32.9%
Processing text: Python, Clickhouse, Hadoop, PostgreSQL, хорошее знание математической статистики и методик статистич...
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 9
  → Extracted skills: ['python', 'postgresql', 'python', 'numpy', 'pandas', 'scipy', 'matplotlib', 

  vec_similarity = token1.similarity(token2)


Processing batch 305–310...
Memory usage: 33.5%
Processing text: SQL, Python, Power Query, MS Excel, pandas, Numpy, Matplotlib, Аналитический склад ума
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 5
  → Extracted skills: ['sql', 'python', 'pandas', 'numpy', 'matplotlib']
Memory usage: 33.5%
Processing text: WMS, ERP-системы на базе 1С, AutoCAD
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 2
  → Extracted skills: ['wms', 'autocad']
Memory usage: 33.5%
Processing text: Big Data, SQL, Amplitude, Визуализация данных, Power BI, A/B тесты, Опыт работы в геймдеве; Опыт раб...
  → Found annotations: keys=['text', 'results'], results keys=['full_matches', 'ngram_scored']
  → ngram_scored count: 8
  → Extracted skills: ['big data', 'big data', 'data', 'amplitude', 'b', 'bigquery', 'amplitude', 'tableau']
Memory usage: 33.5%
Processing text: Бизнес-ана

KeyboardInterrupt: 

In [13]:
def is_empty_array(val):
    try:
        lst = ast.literal_eval(val)
        return isinstance(lst, list) and len(lst) == 0
    except (ValueError, SyntaxError):
        return False 

data = pd.read_csv(f'{PR_DATA_PATH}requirements_data_processed.csv')
data['requirements'] = data['requirements'].replace(r'^\s*$', pd.NA, regex=True)
data = data.dropna(subset=['requirements'])

mask_not_empty = ~data['requirements'].apply(lambda x: is_empty_array(x))

data['requirements'] = data['requirements'].replace(r'^\s*$', pd.NA, regex=True)
data = data.dropna(subset=['requirements'])

data = data[mask_not_empty]

data.to_csv(f'{PR_DATA_PATH}requirements_data_processed_cleaned.csv', index=False)