In [57]:
import pandas as pd
import numpy as np
%pip install openpyxl -q

Note: you may need to restart the kernel to use updated packages.


In [58]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [59]:
df = pd.read_csv('data/Train_rev1.csv', index_col='Id')
df_test = pd.read_csv('data/Test_rev1.csv', index_col='Id')

In [60]:
print("Train data shape: ", df.shape)
print("Test data shape: ", df_test.shape)

Train data shape:  (244768, 11)
Test data shape:  (122463, 9)


In [61]:
df.head(3)

Unnamed: 0_level_0,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244768 entries, 12612628 to 72705235
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Title               244767 non-null  object
 1   FullDescription     244768 non-null  object
 2   LocationRaw         244768 non-null  object
 3   LocationNormalized  244768 non-null  object
 4   ContractType        65442 non-null   object
 5   ContractTime        180863 non-null  object
 6   Company             212338 non-null  object
 7   Category            244768 non-null  object
 8   SalaryRaw           244768 non-null  object
 9   SalaryNormalized    244768 non-null  int64 
 10  SourceName          244767 non-null  object
dtypes: int64(1), object(10)
memory usage: 22.4+ MB


In [63]:
df.describe(include='all')

Unnamed: 0,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
count,244767,244768,244768,244768,65442,180863,212338,244768,244768,244768.0,244767
unique,135435,242138,20986,2732,2,2,20812,29,97286,,167
top,Business Development Manager,What is expected of you as a Registered Nurse ...,London,UK,full_time,permanent,UKStaffsearch,IT Jobs,"50,000-74,999 yearly",,totaljobs.com
freq,921,18,15605,41093,57538,151521,4997,38483,1923,,48149
mean,,,,,,,,,,34122.577576,
std,,,,,,,,,,17640.543124,
min,,,,,,,,,,5000.0,
25%,,,,,,,,,,21500.0,
50%,,,,,,,,,,30000.0,
75%,,,,,,,,,,42500.0,


# 1. Drop columns

In [64]:
df.drop(columns=['SalaryRaw', 'LocationRaw'], inplace=True)
df_test.drop(columns=['LocationRaw'], inplace=True)

In [65]:
print("Train data shape: ", df.shape)
print("Test data shape: ", df_test.shape)

Train data shape:  (244768, 9)
Test data shape:  (122463, 8)


# 2. Fill missing values

In [66]:
print('Missing values:')
df.isna().sum()/df.shape[0]*100

Missing values:


Title                  0.000409
FullDescription        0.000000
LocationNormalized     0.000000
ContractType          73.263662
ContractTime          26.108397
Company               13.249281
Category               0.000000
SalaryNormalized       0.000000
SourceName             0.000409
dtype: float64

In [67]:
def fill_missing_train_test(train_df, test_df):
    # Store the fill values from train
    fill_values = {}
    
    for col in train_df.columns:
        if train_df[col].dtype == 'O':  # object/string
            fill_values[col] = train_df[col].mode()[0]
        else:  # numbers
            fill_values[col] = train_df[col].mean()
    
    # Fill train and test with the same values
    train_filled = train_df.fillna(fill_values)
    test_filled = test_df.fillna(fill_values)
    
    return train_filled, test_filled

In [68]:
df, df_test = fill_missing_train_test(df, df_test)

In [69]:
df.isna().sum()/df.shape[0]*100

Title                 0.0
FullDescription       0.0
LocationNormalized    0.0
ContractType          0.0
ContractTime          0.0
Company               0.0
Category              0.0
SalaryNormalized      0.0
SourceName            0.0
dtype: float64

# 3. Duplicates

In [70]:
print("Train dataset duplicates BEFORE removing: ", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Train dataset duplicates AFTER removing: ", df.duplicated().sum())

Train dataset duplicates BEFORE removing:  1
Train dataset duplicates AFTER removing:  0


# 4. Geostandarization - web scrapping

In [71]:
%pip install requests -q
%pip install beautifulsoup4 -q
import requests
from bs4 import BeautifulSoup as bs
import re
import time
import random
import json

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [72]:
def extract_number_from_text(text):
    match = re.search(r'(\d{1,3}(?:,\d{3})*)', text)
    return int(match.group(1).replace(',', '')) if match else None

In [73]:
def select_population_from_table(table):
    for header_row in table.select('tr:has(th)'):
        th = header_row.select_one('th')
        if 'Population' not in th.get_text():
            continue

        # next <tr> sibling (population data may be here)
        next_row = header_row.find_next_sibling('tr')


        # 1. population in the same row
        td = header_row.select_one('td')
        if td:
            val = extract_number_from_text(td.get_text())
            if val:
                return val
            
        # 2. population in the next row <td>
        if next_row and next_row.select_one('td'):
            val = extract_number_from_text(next_row.select_one('td').get_text())
            if val:
                return val

        # 3. multiple population years (bulleted list)
        if next_row and re.match(r'\s*â€¢\s*\d{4}', next_row.get_text()):
            # select all following <tr> until a break
            for tr in header_row.find_all_next('tr'):
                val = extract_number_from_text(tr.get_text())
                if val:
                    last_val = val
            return last_val
    return None

In [74]:
def get_page(url, headers, retries=3, delay_range=(1, 3)):
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
            if response.status_code == 404:
                return None
        except requests.exceptions.RequestException:
            pass
        time.sleep(random.uniform(*delay_range))
    return None

In [75]:
def fetch_infobox_table(url, headers, class_name='infobox'):
    page = get_page(url, headers)
    if not page:
        return None
    soup = bs(page.content, 'html.parser')
    return soup.find('table', class_=class_name)

In [76]:
def get_population_for_city(city, headers):
    urls = [
        f'https://en.wikipedia.org/wiki/{city}',
        f'https://en.wikipedia.org/wiki/{city}_(county)'
    ]
    for url in urls:
        table = fetch_infobox_table(url, headers)
        if table:
            pop = select_population_from_table(table)
            if pop:
                return pop
    return None

In [77]:
def get_population_for_location():
    headers = {"User-Agent": "LocationWebScrapper"}

    for city in cities:
        population = population_cache.get(city)
        if population is None:
            population = get_population_for_city(city, headers)
            if population:
                population_cache[city] = population
            else:
                not_working.add(city)
        print(f"{city}: {population}")

In [78]:
def print_missing_info(df):
    print(f"Missing data in population of location: {round(df[df['LocationPopulation'].isna()]['LocationNormalized'].count() / len(df) * 100, 2)}%, {df[df['LocationPopulation'].isna()]['LocationNormalized'].count()} cases")
    print()
    print(df[df['LocationPopulation'].isna()]['LocationNormalized'].value_counts()[:5])

In [79]:
# not_working = set()
# population_cache = {}

In [80]:
with open('data/population_cache.json', 'r', encoding='utf-8') as f:
    population_cache = json.load(f)

cities = df['LocationNormalized'].unique().tolist()
not_working = [city for city in cities if city not in population_cache]

df['LocationPopulation'] = df['LocationNormalized'].str.strip().map(lambda x: population_cache.get(x))

In [81]:
print_missing_info(df)
print()
print('not working cities: ', len(not_working))
print('cities in cache: ', len(population_cache))

Missing data in population of location: 17.59%, 43048 cases

LocationNormalized
South East London    11713
The City              6678
Central London        2607
Reading               2187
North West London     1104
Name: count, dtype: int64

not working cities:  999
cities in cache:  1733


In [82]:
# get_population_for_location()

In [83]:
# with open('data/population_cache.json', 'w', encoding='utf-8') as f:
#     json.dump(population_cache, f, ensure_ascii=False, indent=4)

# 4. Geostandarization

## 4.1. Get population data from geonames dataset

In [84]:
# selecting data only for GB - turn on once (long)
# cols = [
#     'geonameid','name','asciiname','alternatenames','lat','lon',
#     'feature_class','feature_code','country_code','cc2','admin1',
#     'admin2','admin3','admin4','population','elevation','dem','tz','moddate'
# ]

# geonames = pd.read_csv(
#     "allCountries.txt",
#     sep="\t",
#     names=cols,
#     usecols=['asciiname', 'alternatenames', 'country_code', 'feature_code', 'feature_class', 'admin1', 'admin2', 'admin3', 'lon', 'lat', 'population'],
#     dtype=str,
#     header=None
# )

# geonames_gb = geonames[geonames['country_code'] == 'GB'].copy().reset_index(drop=True)
# geonames_gb = geonames_gb[geonames_gb['feature_class'].isin(['P', 'A'])].reset_index()
# geonames_gb.loc[geonames_gb['feature_code'] == 'PCLI', 'asciiname'] = 'UK'
# geonames_gb.to_csv('geonames_gb.csv')

In [85]:
geonames_gb = pd.read_csv('geo_datasets/geonames_gb.csv')
geonames_gb.rename(columns={'asciiname': 'name'}, inplace=True)

## 4.2. Get population for all locations where it is directly possible

In [86]:
# get population for locations
pop_dict = geonames_gb['population'].copy()
pop_dict = geonames_gb.set_index(geonames_gb['name'].str.lower().str.strip())['population'].to_dict()

pop_dict_test = geonames_gb['population'].copy()
pop_dict_test = geonames_gb.set_index(geonames_gb['name'].str.lower().str.strip())['population'].to_dict()

df['LocationPopulation'] = df['LocationNormalized'].str.lower().str.strip().map(lambda x: pop_dict.get(x))
df_test['LocationPopulation'] = df_test['LocationNormalized'].str.lower().str.strip().map(lambda x: pop_dict_test.get(x))

In [87]:
def print_missing_info(df):
    print(f"Missing data in population of location: {round(df[df['LocationPopulation'].isna()]['LocationNormalized'].count() / len(df) * 100, 2)}%, {df[df['LocationPopulation'].isna()]['LocationNormalized'].count()} cases")
    print()
    print(df[df['LocationPopulation'].isna()]['LocationNormalized'].value_counts()[:5])

In [88]:
print_missing_info(df)

Missing data in population of location: 12.44%, 30460 cases

LocationNormalized
South East London    11713
Central London        2607
West Midlands         2540
Berkshire             1502
West Yorkshire        1072
Name: count, dtype: int64


In [89]:
print_missing_info(df_test)

Missing data in population of location: 12.33%, 15098 cases

LocationNormalized
South East London    5714
Central London       1347
West Midlands        1236
Berkshire             786
West Yorkshire        568
Name: count, dtype: int64


## 4.3. Remove directions and assign population to other fitting names

In [90]:
def fill_missing_population(df, location_col='LocationNormalized', pop_col='LocationPopulation', pop_dict=None):
    directions = ['North', 'South', 'East', 'West', 'Central']

    df[location_col] = df[location_col].replace(directions, '', regex=True).str.strip()

    missing_mask = df[pop_col].isna()
    missing_locations = df.loc[missing_mask, location_col].str.lower().str.strip()

    pop_dict_missing = {loc: pop_dict.get(loc, np.nan) for loc in missing_locations}
    df.loc[missing_mask, pop_col] = missing_locations.map(pop_dict_missing)

    return df

In [91]:
df = fill_missing_population(df, pop_dict=pop_dict)
df_test = fill_missing_population(df_test, pop_dict=pop_dict_test)

In [92]:
print_missing_info(df)

Missing data in population of location: 4.58%, 11216 cases

LocationNormalized
Midlands                    3456
Berkshire                   1502
Cheshire                     871
Yorkshire and Humberside     683
Bedfordshire                 544
Name: count, dtype: int64


In [93]:
print_missing_info(df_test)

Missing data in population of location: 4.55%, 5570 cases

LocationNormalized
Midlands                    1692
Berkshire                    786
Cheshire                     407
Yorkshire and Humberside     333
Bedfordshire                 272
Name: count, dtype: int64


## 4.4. Find population for Midlands in NUT regions

In [94]:
# remove locations out of GB
uk_lat_mask = (geonames_gb['lat'] >= 49) & (geonames_gb['lat'] <= 61)
uk_lon_mask = (geonames_gb['lon'] >= -10) & (geonames_gb['lon'] <= 2)
geonames_gb = geonames_gb[(geonames_gb['country_code'] == 'GB') & (uk_lat_mask) & (uk_lon_mask)]

In [95]:
nuts = pd.read_excel("geo_datasets/NUTS.xlsx")
nuts['NUTS118NM'] = nuts['NUTS118NM'].str.replace('(England)', '', regex=False).str.strip()
nuts = nuts.rename(columns={'NUTS118NM': 'name', 'LONG': 'lon', 'LAT': 'lat'})

In [96]:
# find the closest point in geonames in nuts
from scipy.spatial import cKDTree
tree = cKDTree(geonames_gb[['lat', 'lon']].values)
nuts_coords = nuts[['lat', 'lon']].values

distances, indices = tree.query(nuts_coords, k=1)  # k=1 -> 1 neighbour

nuts['population'] = geonames_gb.iloc[indices]['population'].values
nuts_population = dict(zip(nuts['name'], nuts['population']))

In [97]:
from typing import Counter

# combine West and East Midlands
nuts_population = {**{k: v for k, v in nuts_population.items() if 'Midlands' not in k},
                 **{'Midlands': sum(v for k, v in nuts_population.items() if 'Midlands' in k)}}

In [98]:
nuts_population

{'North East': 1126,
 'North West': 0,
 'Yorkshire and The Humber': 0,
 'East of England': 686,
 'London': 10750,
 'South East': 0,
 'South West': 0,
 'Wales': 0,
 'Scotland': 8830,
 'Northern Ireland': 0,
 'Midlands': 50878}

In [99]:
# impute nuts locations
def impute_nuts_location(df, nuts_population):
    population_from_dict = df['LocationNormalized'].map(nuts_population)
    mask = ((df['LocationPopulation'].isnull()) | (df['LocationPopulation'] == 0)) & population_from_dict.notnull()
    df.loc[mask, 'LocationPopulation'] = population_from_dict[mask]
    return df

In [100]:
df = impute_nuts_location(df, nuts_population)
df_test = impute_nuts_location(df_test, nuts_population)

In [101]:
print_missing_info(df)

Missing data in population of location: 3.17%, 7760 cases

LocationNormalized
Berkshire                   1502
Cheshire                     871
Yorkshire and Humberside     683
Bedfordshire                 544
Edinburgh Technopole         408
Name: count, dtype: int64


## 4.5. Cast rest of cases as 'UK'

In [102]:
def impute_uk_population(df):
    mask = df['LocationPopulation'].isna() | (df['LocationPopulation'] == 0)
    uk_pop = df.loc[df['LocationNormalized'].str.lower().eq('uk'), 'LocationPopulation'].dropna().iloc[0] if any(df['LocationNormalized'].str.lower().eq('uk')) else np.nan
    df.loc[mask, ['LocationNormalized', 'LocationPopulation']] = ['UK', uk_pop]
    return df

In [103]:
df = impute_uk_population(df)
df_test = impute_uk_population(df_test)

In [104]:
print_missing_info(df_test)

Missing data in population of location: 0.0%, 0 cases

Series([], Name: count, dtype: int64)


In [105]:
df['LocationPopulation'].value_counts().head()

LocationPopulation
66488991.0    109312
8961989.0      45511
541263.0        3516
50878.0         3456
1157603.0       3061
Name: count, dtype: int64

In [106]:
df.drop(columns=['LocationNormalized'], inplace=True)
df_test.drop(columns=['LocationNormalized'], inplace=True)

# Word2Vec

In [107]:
%pip install nltk -q
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
%pip install gensim -q
from gensim.models import Word2Vec
import multiprocessing

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Note: you may need to restart the kernel to use updated packages.


In [108]:
# tokenization
def tokenize_text(text):
    return word_tokenize(text.lower())

In [109]:
# average vectors
def document_vector(word_list, model, vector_size):
    # Initialize a zero vector
    vector = np.zeros(vector_size)
    count = 0
    
    # Sum the vectors of all words in the text
    for word in word_list:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
            
    # Return the average vector
    if count != 0:
        return vector / count
    else:
        # Return the zero vector if no words were found in the vocabulary
        return vector

In [None]:
titles = df['Title'].apply(tokenize_text).tolist()
descriptions = df['FullDescription'].apply(tokenize_text).tolist()
all_sentences = titles + descriptions

In [None]:
vector_size = 50  # output dim
window = 5        # context window
min_count = 5     # filter rare words
workers = multiprocessing.cpu_count() - 1

w2v_model = Word2Vec(
    sentences=all_sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=0
)

### Prepare ngrams

In [None]:
titles = df['Title'].apply(lambda x: document_vector(tokenize_text(x), w2v_model, vector_size))
descriptions = df['FullDescription'].apply(lambda x: document_vector(tokenize_text(x), w2v_model, vector_size))

In [None]:
title_df = pd.DataFrame(titles.tolist(), index=df.index).add_prefix('Title_vec_')
desc_df = pd.DataFrame(descriptions.tolist(), index=df.index).add_prefix('FullDescription_vec_')

In [None]:
texts = pd.concat([title_df, desc_df], axis=1)

# Transformer

In [None]:
%pip install transformers -q
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

In [None]:
class RobertaFeatureDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

In [None]:
def extract_cls_vectors(model, data_loader, device):
    model.eval()
    all_cls_vectors = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Extraction [CLS] RoBERTa"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            cls_vector = outputs.last_hidden_state[:, 0, :]
            
            all_cls_vectors.append(cls_vector.cpu().numpy())

    final_vector_array = np.concatenate(all_cls_vectors, axis=0)
    return final_vector_array

In [None]:
MAX_LEN = 128
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

In [None]:
# Prepare data

In [None]:
texts_concat = df['Title'] + ' [SEP] ' + df['FullDescription']
texts_list = texts_concat.tolist()

In [None]:
dataset = RobertaFeatureDataset(texts_list)
data_loader = DataLoader(dataset, batch_size=16, shuffle=False)

In [None]:
final_vector_array = extract_cls_vectors(model, data_loader, device)

texts_w2v_roberta = pd.DataFrame(
    final_vector_array,
    index=df.index
)

texts_w2v_roberta.columns = [f'cls_{i}' for i in range(final_vector_array.shape[1])]

In [None]:
texts_w2v_roberta.head()

# title-uni_gram & description-roberta

roberta

In [None]:
%pip install transformers -q
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

In [None]:
MAX_LEN = 128
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

In [None]:
description_ds = RobertaFeatureDataset(df['FullDescription'].tolist())
description_dl = DataLoader(description_ds, batch_size=16, shuffle=False)

In [None]:
final_vector_array = extract_cls_vectors(model, description_dl, device)

description_roberta = pd.DataFrame(
    final_vector_array,
    index=df.index
)

description_roberta.columns = [f'cls_{i}' for i in range(final_vector_array.shape[1])]

In [None]:
desc_roberta_train = description_roberta.loc[train.index]
desc_roberta_test  = description_roberta.loc[test.index]

tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
import joblib

In [None]:
# test submission
tfidf_title = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 1))
X_test_title  = tfidf_title.transform(test["Title"])

In [None]:
tfidf_title = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 1))
X_train_title = tfidf_title.fit_transform(train["Title"])
X_test_title  = tfidf_title.transform(test["Title"])

In [None]:
from sklearn.utils.extmath import randomized_svd

U, Sigma, VT = randomized_svd(X_train_title, n_components=50, n_iter=5, random_state=42)
X_train_title = U @ np.diag(Sigma)
X_test_title  = X_test_title.dot(VT.T)

In [None]:
X_train_title.shape

In [None]:
desc_roberta_train.shape

In [None]:
df.shape

In [None]:
train.shape

In [None]:
train_text_combined = np.concatenate([desc_roberta_train, X_train_title], axis=1)
test_text_combined = np.concatenate([desc_roberta_test, X_test_title], axis=1)

In [None]:
train_text_combined_df = pd.DataFrame(train_text_combined)
test_text_combined_df = pd.DataFrame(test_text_combined)

train_text_combined_df.to_parquet('data/texts_uni_roberta_train.parquet', index=True)
test_text_combined_df.to_parquet('data/texts_uni_roberta_test.parquet',  index=True)

# 5. Split data

In [110]:
# tabular
from sklearn.model_selection import train_test_split
train, val = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# word2vec
texts_train = texts.loc[train.index]
texts_val = texts.loc[val.index]
texts_train.to_pickle('data/texts_w2v_train.pkl')
texts_val.to_pickle('data/texts_w2v_val.pkl')

In [None]:
# roberta
texts_roberta_train = texts_w2v_roberta.loc[train.index]
texts_roberta_test  = texts_w2v_roberta.loc[val.index]
texts_roberta_train.to_parquet('data/texts_roberta_train.parquet', index=True)
texts_roberta_test.to_parquet('data/texts_roberta_test.parquet',  index=True)

# 6. One hot encoding

In [111]:
# select most common source in category group
category_to_source = train.groupby('Category')['SourceName'].agg(lambda x: x.mode()[0]).to_dict()
train['SourceName'] = train['Category'].map(category_to_source)
val['SourceName'] = val['Category'].map(category_to_source)
df_test['SourceName'] = df_test['Category'].map(category_to_source)

In [112]:
train = pd.get_dummies(train, columns = ['ContractType', 'ContractTime', 'Category', 'SourceName'], drop_first=True, dtype=int)
val = pd.get_dummies(val, columns = ['ContractType', 'ContractTime', 'Category', 'SourceName'], drop_first=True, dtype=int)
test = pd.get_dummies(df_test, columns = ['ContractType', 'ContractTime', 'Category', 'SourceName'], drop_first=True, dtype=int)

# 7. Target Encoding - mean salary of company instead of company name

In [113]:
# combining companies by two first words
train['CompanyPrefix'] = train['Company'].apply(lambda x: ' '.join(str(x).split()[:2]))
val['CompanyPrefix'] = val['Company'].apply(lambda x: ' '.join(str(x).split()[:2]))
test['CompanyPrefix'] = test['Company'].apply(lambda x: ' '.join(str(x).split()[:2]))

In [114]:
# mean salary by company
mean_company = train.groupby('CompanyPrefix')['SalaryNormalized'].mean()
train['CompanyEncoded'] = train['CompanyPrefix'].map(mean_company)
val['CompanyEncoded'] = val['CompanyPrefix'].map(mean_company)
test['CompanyEncoded'] = test['CompanyPrefix'].map(mean_company)

# filling not existing companies in test with global mean
global_mean = train['SalaryNormalized'].mean()
val['CompanyEncoded'] = val['CompanyEncoded'].fillna(global_mean)
test['CompanyEncoded'] = test['CompanyEncoded'].fillna(global_mean)

train.drop(columns=['Company', 'CompanyPrefix'], inplace=True)
val.drop(columns=['Company', 'CompanyPrefix'], inplace=True)
test.drop(columns=['Company', 'CompanyPrefix'], inplace=True)

In [115]:
mean_company.head()

CompanyPrefix
.Michael Page    77500.000000
1 1              24462.857143
10 TRINITY       45000.000000
100 percent      40500.000000
100% IT          40750.000000
Name: SalaryNormalized, dtype: float64

# 9. Tf-idf

In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
import joblib

In [117]:
def prepare_tfidf(df, n_grams):
    tfidf_description = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=n_grams)
    tfidf_title = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=n_grams)

    X_description = tfidf_description.fit_transform(df["FullDescription"])
    X_title = tfidf_title.fit_transform(df["Title"])

    return hstack([X_description, X_title]), tfidf_description, tfidf_title

In [118]:
def transform_tfidf(df, tfidf_description, tfidf_title):
    X_description = tfidf_description.transform(df["FullDescription"])
    X_title = tfidf_title.transform(df["Title"])

    return hstack([X_description, X_title])

In [None]:
X_train_text, tfidf_description, tfidf_title = prepare_tfidf(train, n_grams=(1, 1))
print('train completed')
X_val_text = transform_tfidf(val, tfidf_description, tfidf_title)
print('val completed')
X_test_text = transform_tfidf(test, tfidf_description, tfidf_title)
print('tdidf test completed')

# dimenshion reduction
svd = TruncatedSVD(n_components=50, random_state=42)

X_train_text = svd.fit_transform(X_train_text)
print('svd train completed')
X_val_text = svd.transform(X_val_text)
print('svd val completed')
X_test_text = svd.transform(X_test_text)

train completed
val completed
tdidf test completed


In [None]:
# saving
ngram = 'uni'  # 'uni', 'bi', 'tri'...
np.save(f"data/X_train_text_{ngram}.npy", X_train_text)
np.save(f"data/X_val_text_{ngram}.npy", X_val_text)
np.save(f"data/X_test_text_{ngram}.npy", X_test_text)

# 10. Tabular data saving

In [None]:
train_tab = train.drop(columns=['Title', 'FullDescription'])
val_tab = val.drop(columns=['Title', 'FullDescription'])
test_tab = test.drop(columns=['Title', 'FullDescription'])

In [None]:
train_tab.to_csv('data/train_preprocessed.csv', index=False)
val_tab.to_csv('data/val_preprocessed.csv', index=False)
test_tab.to_csv('data/test_preprocessed.csv', index=True)

In [None]:
train_tab.head(3)