In [4]:
import pandas as pd
import numpy as np
%pip install openpyxl -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

In [None]:
df = pd.read_csv('data/Train_rev1.csv', index_col='Id')
df_test = pd.read_csv('data/Test_rev1.csv', index_col='Id')

In [None]:
print("Train data shape: ", df.shape)
print("Test data shape: ", df_test.shape)

In [None]:
df.head(3)

In [None]:
df.info()

In [None]:
df.describe(include='all')

# 1. Drop columns

In [None]:
df.drop(columns=['SalaryRaw', 'LocationRaw'], inplace=True)
df_test.drop(columns=['LocationRaw'], inplace=True)

In [None]:
print("Train data shape: ", df.shape)
print("Test data shape: ", df_test.shape)

# 2. Fill missing values

In [None]:
print('Missing values:')
df.isna().sum()/df.shape[0]*100

In [None]:
def fill_missing_train_test(train_df, test_df):
    """Fill missing values in train and test with statistics learned from train only.

    Object/string columns are filled with the most frequent train value, every
    other column with the train mean. The same fill values are applied to both
    frames so no information from the test set is used.

    Args:
        train_df (pd.DataFrame): Frame the fill values are computed from.
        test_df (pd.DataFrame): Frame filled with the train-derived values.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: Filled copies of (train_df, test_df).
        Columns of train_df that contain no non-missing value at all are left
        unfilled, because there is nothing to derive a fill value from.
    """
    fill_values = {}

    for col in train_df.columns:
        series = train_df[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            modes = series.mode()
            # mode() is empty when the column has no non-missing value;
            # indexing it with [0] would raise, so the column is skipped.
            if not modes.empty:
                fill_values[col] = modes.iloc[0]
        else:
            mean_value = series.mean()
            # An all-missing numeric column has a NaN mean; filling with it is pointless.
            if pd.notna(mean_value):
                fill_values[col] = mean_value

    # Fill train and test with the same values
    train_filled = train_df.fillna(fill_values)
    test_filled = test_df.fillna(fill_values)

    return train_filled, test_filled

In [None]:
df, df_test = fill_missing_train_test(df, df_test)

In [None]:
df.isna().sum()/df.shape[0]*100

# 3. Duplicates

In [None]:
print("Train dataset duplicates BEFORE removing: ", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Train dataset duplicates AFTER removing: ", df.duplicated().sum())

# 4a. Geostandardization - web scraping

In [None]:
%pip install requests -q
%pip install beautifulsoup4 -q
import requests
from bs4 import BeautifulSoup as bs
import re
import time
import random
import json

In [None]:
def extract_number_from_text(text):
    """Return the first integer found in ``text``, or None if there is none.

    Both comma-grouped numbers ("8,799,728") and plain digit runs ("12345")
    are recognised; commas are stripped before conversion.

    Args:
        text (str): Text to search. Empty or None yields None.

    Returns:
        int | None: The first number in the text, or None when no digits occur.
    """
    if not text:
        return None
    # Try the comma-grouped form first; fall back to a plain digit run so that
    # an ungrouped number such as "12345" is not cut off after three digits.
    match = re.search(r'\d{1,3}(?:,\d{3})+|\d+', text)
    return int(match.group(0).replace(',', '')) if match else None

In [None]:
def select_population_from_table(table):
    """Extract a population figure from a parsed infobox table.

    Every row containing a <th> whose text includes 'Population' is inspected,
    and the first number found by one of these strategies is returned:

    1. a <td> in the header row itself,
    2. the first <td> of the next sibling row,
    3. if the next row's text starts with a bullet followed by a four-digit
       number, the last number found in any row after the header.

    Args:
        table: A BeautifulSoup element supporting ``select``/``select_one``.

    Returns:
        int | None: The extracted number, or None when no strategy yields one.
    """
    for header_row in table.select('tr:has(th)'):
        th = header_row.select_one('th')
        if th is None or 'Population' not in th.get_text():
            continue

        # 1. population in the same row
        td = header_row.select_one('td')
        if td:
            val = extract_number_from_text(td.get_text())
            if val:
                return val

        # next <tr> sibling (population data may be here)
        next_row = header_row.find_next_sibling('tr')
        if not next_row:
            continue

        # 2. population in the next row <td>
        next_td = next_row.select_one('td')
        if next_td:
            val = extract_number_from_text(next_td.get_text())
            if val:
                return val

        # 3. multiple population years (bulleted list)
        if re.match(r'\s*•\s*\d{4}', next_row.get_text()):
            # Reset per header: without this the name could be unbound, or
            # carry a value left over from an earlier 'Population' header.
            last_val = None
            # Scans every following row and keeps the last number found.
            for tr in header_row.find_all_next('tr'):
                val = extract_number_from_text(tr.get_text())
                if val:
                    last_val = val
            if last_val is not None:
                return last_val
    return None

In [None]:
def get_page(url, headers, retries=3, delay_range=(1, 3)):
    """Fetch ``url`` with a bounded number of attempts.

    Args:
        url (str): Address to request.
        headers (dict): HTTP headers sent with every attempt.
        retries (int): Maximum number of attempts.
        delay_range (tuple): (min, max) seconds for the random pause between attempts.

    Returns:
        requests.Response | None: The response on HTTP 200. None on HTTP 404
        (returned immediately, without retrying) or when every attempt fails.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            # Best-effort scraping: a network-level failure is treated like
            # any other retryable outcome.
            response = None

        if response is not None:
            if response.status_code == 200:
                return response
            if response.status_code == 404:
                return None

        # Pause before the next attempt only; sleeping after the last one
        # would just delay the caller.
        if attempt < retries - 1:
            time.sleep(random.uniform(*delay_range))
    return None

In [None]:
def fetch_infobox_table(url, headers, class_name='infobox'):
    """Download ``url`` and return its first <table> with the given CSS class.

    Returns None when the page could not be fetched; otherwise whatever
    BeautifulSoup's ``find`` returns (the table element, or None if absent).
    """
    response = get_page(url, headers)
    if response:
        parsed = bs(response.content, 'html.parser')
        return parsed.find('table', class_=class_name)
    return None

In [None]:
def get_population_for_city(city, headers):
    """Look up the population of ``city`` from its English Wikipedia infobox.

    The plain article is tried first, then the "<city>_(county)" article.

    Args:
        city (str): Location name as it appears in the data.
        headers (dict): HTTP headers passed on to the page fetcher.

    Returns:
        int | None: The first population found, or None if neither page yields one.
    """
    from urllib.parse import quote  # stdlib; imported here so the cell stays self-contained

    # Wikipedia titles use underscores for spaces. Percent-encode the rest so
    # characters such as '?', '#' or '%' in a name cannot alter the URL.
    title = quote(str(city).strip().replace(' ', '_'))
    urls = [
        f'https://en.wikipedia.org/wiki/{title}',
        f'https://en.wikipedia.org/wiki/{title}_(county)'
    ]
    for url in urls:
        table = fetch_infobox_table(url, headers)
        if table:
            pop = select_population_from_table(table)
            if pop:
                return pop
    return None

In [None]:
def get_population_for_location():
    """Resolve a population for every name in the notebook-level ``cities`` list.

    Reads and updates the notebook-level ``population_cache`` dict: names that
    are already cached are only printed, the others are scraped. Names for
    which scraping yields nothing are recorded in ``not_working``. One
    "name: population" line is printed per city.
    """
    request_headers = {"User-Agent": "LocationWebScrapper"}

    for name in cities:
        cached = population_cache.get(name)
        if cached is not None:
            print(f"{name}: {cached}")
            continue

        scraped = get_population_for_city(name, request_headers)
        if scraped:
            population_cache[name] = scraped
        else:
            not_working.add(name)
        print(f"{name}: {scraped}")

In [None]:
def print_missing_info(df):
    """Print how many rows lack a location population and the top affected locations.

    Prints the share and count of rows whose 'LocationPopulation' is missing,
    a blank line, then the five most frequent 'LocationNormalized' values
    among those rows.
    """
    unresolved = df[df['LocationPopulation'].isna()]['LocationNormalized']
    n_unresolved = unresolved.count()
    share = round(n_unresolved / len(df) * 100, 2)

    print(f"Missing data in population of location: {share}%, {n_unresolved} cases")
    print()
    print(unresolved.value_counts()[:5])

In [None]:
# not_working = set()
# population_cache = {}

In [None]:
with open('data/population_cache.json', 'r', encoding='utf-8') as f:
    population_cache = json.load(f)

# Strip names once so the city list, the "not working" bookkeeping and the
# cache lookup all use the same keys.
location_names = df['LocationNormalized'].str.strip()
cities = location_names.dropna().unique().tolist()

# Must be a set: get_population_for_location() records failures with .add().
not_working = {city for city in cities if city not in population_cache}

df['LocationPopulation'] = location_names.map(population_cache)

In [None]:
print_missing_info(df)
print()
print('not working cities: ', len(not_working))
print('cities in cache: ', len(population_cache))

In [None]:
# get_population_for_location()

In [None]:
# with open('data/population_cache.json', 'w', encoding='utf-8') as f:
#     json.dump(population_cache, f, ensure_ascii=False, indent=4)

# 4b. Geostandardization - using common datasets

## 4.1. Get population data from geonames dataset

In [None]:
# selecting data only for GB - turn on once (long)
# cols = [
#     'geonameid','name','asciiname','alternatenames','lat','lon',
#     'feature_class','feature_code','country_code','cc2','admin1',
#     'admin2','admin3','admin4','population','elevation','dem','tz','moddate'
# ]

# geonames = pd.read_csv(
#     "allCountries.txt",
#     sep="\t",
#     names=cols,
#     usecols=['asciiname', 'alternatenames', 'country_code', 'feature_code', 'feature_class', 'admin1', 'admin2', 'admin3', 'lon', 'lat', 'population'],
#     dtype=str,
#     header=None
# )

# geonames_gb = geonames[geonames['country_code'] == 'GB'].copy().reset_index(drop=True)
# geonames_gb = geonames_gb[geonames_gb['feature_class'].isin(['P', 'A'])].reset_index()
# geonames_gb.loc[geonames_gb['feature_code'] == 'PCLI', 'asciiname'] = 'UK'
# geonames_gb.to_csv('geonames_gb.csv')

In [None]:
geonames_gb = pd.read_csv('geo_datasets/geonames_gb.csv')
geonames_gb.rename(columns={'asciiname': 'name'}, inplace=True)

## 4.2. Get population for all locations where it is directly possible

In [None]:
# get population for locations
# Several geonames rows can share one name (populated place vs. admin area,
# same-named settlements). Keep the largest population per name instead of
# whichever row happens to come last in the file.
name_key = geonames_gb['name'].str.lower().str.strip()
pop_dict = geonames_gb.groupby(name_key)['population'].max().to_dict()

# Train and test are looked up against the same table; the separate name is
# kept because later cells refer to it.
pop_dict_test = dict(pop_dict)

df['LocationPopulation'] = df['LocationNormalized'].str.lower().str.strip().map(pop_dict)
df_test['LocationPopulation'] = df_test['LocationNormalized'].str.lower().str.strip().map(pop_dict_test)

In [None]:
def print_missing_info(df):
    """Report missing 'LocationPopulation' values for ``df``.

    Output: one summary line with the percentage and number of affected rows,
    an empty line, and the five most common 'LocationNormalized' values among
    the affected rows.
    """
    missing_mask = df['LocationPopulation'].isna()
    missing_names = df[missing_mask]['LocationNormalized']
    missing_count = missing_names.count()
    missing_pct = round(missing_count / len(df) * 100, 2)

    print(f"Missing data in population of location: {missing_pct}%, {missing_count} cases")
    print()
    top_missing = missing_names.value_counts()[:5]
    print(top_missing)

In [None]:
print_missing_info(df)

In [None]:
print_missing_info(df_test)

## 4.3. Remove directions and assign population to other fitting names

In [None]:
def fill_missing_population(df, location_col='LocationNormalized', pop_col='LocationPopulation', pop_dict=None):
    """Strip compass directions from location names and retry the population lookup.

    The direction words 'North', 'South', 'East', 'West' and 'Central' are
    removed from every value of ``location_col`` (in place, for all rows).
    Rows whose ``pop_col`` is missing are then looked up again in ``pop_dict``
    using the lower-cased, stripped name.

    Args:
        df (pd.DataFrame): Frame to update; it is modified and returned.
        location_col (str): Column holding the location names.
        pop_col (str): Column holding the population values.
        pop_dict (dict | None): Lower-cased name -> population. None is
            treated as an empty lookup.

    Returns:
        pd.DataFrame: The same frame, with cleaned names and filled populations.
    """
    if pop_dict is None:
        pop_dict = {}

    directions = ['North', 'South', 'East', 'West', 'Central']
    # \b limits the match to whole words, so names that merely start with a
    # direction (Northampton, Southampton, Westminster, ...) stay intact.
    direction_pattern = r'\b(?:' + '|'.join(directions) + r')\b'

    df[location_col] = (
        df[location_col]
        .str.replace(direction_pattern, ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)  # collapse gaps left by removed words
        .str.strip()
    )

    missing_mask = df[pop_col].isna()
    missing_locations = df.loc[missing_mask, location_col].str.lower().str.strip()

    df.loc[missing_mask, pop_col] = missing_locations.map(lambda loc: pop_dict.get(loc, np.nan))

    return df

In [None]:
df = fill_missing_population(df, pop_dict=pop_dict)
df_test = fill_missing_population(df_test, pop_dict=pop_dict_test)

In [None]:
print_missing_info(df)

In [None]:
print_missing_info(df_test)

## 4.4. Find population for Midlands in NUTS regions

In [None]:
# remove locations out of GB
uk_lat_mask = (geonames_gb['lat'] >= 49) & (geonames_gb['lat'] <= 61)
uk_lon_mask = (geonames_gb['lon'] >= -10) & (geonames_gb['lon'] <= 2)
geonames_gb = geonames_gb[(geonames_gb['country_code'] == 'GB') & (uk_lat_mask) & (uk_lon_mask)]

In [None]:
nuts = pd.read_excel("geo_datasets/NUTS.xlsx")
nuts['NUTS118NM'] = nuts['NUTS118NM'].str.replace('(England)', '', regex=False).str.strip()
nuts = nuts.rename(columns={'NUTS118NM': 'name', 'LONG': 'lon', 'LAT': 'lat'})

In [None]:
# For each NUTS region, find the closest geonames row and copy its population.
from scipy.spatial import cKDTree
# The tree is built on raw (lat, lon) pairs, so "closest" means Euclidean
# distance in degree space, not distance on the ground.
tree = cKDTree(geonames_gb[['lat', 'lon']].values)
nuts_coords = nuts[['lat', 'lon']].values

distances, indices = tree.query(nuts_coords, k=1)  # k=1 -> 1 neighbour

# `indices` are positions within geonames_gb as filtered above, hence .iloc.
# NOTE(review): the population of the single nearest geonames row is used as
# the region's population — confirm that row represents the whole region.
nuts['population'] = geonames_gb.iloc[indices]['population'].values
# Region name -> population lookup used by the following cells.
nuts_population = dict(zip(nuts['name'], nuts['population']))

In [None]:
from typing import Counter

# combine West and East Midlands
nuts_population = {**{k: v for k, v in nuts_population.items() if 'Midlands' not in k},
                 **{'Midlands': sum(v for k, v in nuts_population.items() if 'Midlands' in k)}}

In [None]:
nuts_population

In [None]:
# impute nuts locations
def impute_nuts_location(df, nuts_population):
    """Fill missing or zero populations from a region-name lookup.

    A row is updated when its 'LocationPopulation' is null or 0 and its
    'LocationNormalized' value is a key of ``nuts_population``. The frame is
    modified in place and returned.
    """
    candidates = df['LocationNormalized'].map(nuts_population)
    current = df['LocationPopulation']

    needs_value = current.isnull() | (current == 0)
    fillable = needs_value & candidates.notnull()

    df.loc[fillable, 'LocationPopulation'] = candidates[fillable]
    return df

In [None]:
df = impute_nuts_location(df, nuts_population)
df_test = impute_nuts_location(df_test, nuts_population)

In [None]:
print_missing_info(df)

## 4.5. Cast rest of cases as 'UK'

In [None]:
def impute_uk_population(df, uk_population=None):
    """Relabel rows without a usable population as 'UK' and give them the UK population.

    A row is affected when its 'LocationPopulation' is missing or 0.

    Args:
        df (pd.DataFrame): Frame to update; it is modified in place and returned.
        uk_population (float | None): Population to assign. When None, the
            first non-missing, non-zero population among rows whose
            'LocationNormalized' equals 'uk' (case-insensitive) is used. If
            no such value exists, NaN is assigned.

    Returns:
        pd.DataFrame: The same frame after imputation.
    """
    population = df['LocationPopulation']

    if uk_population is None:
        is_uk = df['LocationNormalized'].str.lower().eq('uk')
        known = population[is_uk].dropna()
        known = known[known != 0]
        # UK rows may exist without any usable population; fall back to NaN
        # instead of indexing into an empty Series.
        uk_population = known.iloc[0] if not known.empty else np.nan

    mask = population.isna() | (population == 0)
    df.loc[mask, 'LocationNormalized'] = 'UK'
    df.loc[mask, 'LocationPopulation'] = uk_population
    return df

In [None]:
df = impute_uk_population(df)
df_test = impute_uk_population(df_test)

In [None]:
print_missing_info(df_test)

In [None]:
df['LocationPopulation'].value_counts().head()

In [None]:
df.drop(columns=['LocationNormalized'], inplace=True)
df_test.drop(columns=['LocationNormalized'], inplace=True)

# Text - Word2Vec

In [None]:
%pip install nltk -q
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
%pip install gensim -q
from gensim.models import Word2Vec
import multiprocessing

In [None]:
# tokenization
def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

In [None]:
# average vectors
def document_vector(word_list, model, vector_size):
    """Average the embeddings of the words in ``word_list`` that ``model.wv`` knows.

    Args:
        word_list: Iterable of tokens.
        model: Object exposing ``wv`` with membership test and item lookup.
        vector_size (int): Length of the returned vector.

    Returns:
        np.ndarray: Mean of the found word vectors, or a zero vector of
        length ``vector_size`` when none of the words is in the vocabulary.
    """
    known_vectors = [model.wv[token] for token in word_list if token in model.wv]

    total = np.zeros(vector_size)
    for vec in known_vectors:
        total += vec

    if not known_vectors:
        # nothing matched the vocabulary: hand back the untouched zero vector
        return total
    return total / len(known_vectors)

In [None]:
def train_word2vec(df, title_col='Title', desc_col='FullDescription', 
                   vector_size=50, window=5, min_count=5, sg=0):
    """
    Fit a Word2Vec model on the tokenized titles and descriptions of ``df``.

    Every title and every description is tokenized with ``tokenize_text`` and
    treated as one training sentence. Progress counts are printed along the way.

    Args:
        df (pd.DataFrame): Source frame.
        title_col (str): Column holding the titles.
        desc_col (str): Column holding the descriptions.
        vector_size (int): Embedding dimensionality.
        window (int): Context window size.
        min_count (int): Minimum frequency for a word to be kept.
        sg (int): 0 = CBOW, 1 = Skip-gram.

    Returns:
        gensim.models.Word2Vec: The trained model.
    """
    titles = [tokenize_text(raw) for raw in df[title_col]]
    print(f"Tokenized {len(titles)} titles.")

    descriptions = [tokenize_text(raw) for raw in df[desc_col]]
    print(f"Tokenized {len(descriptions)} descriptions.")

    all_sentences = [*titles, *descriptions]
    print(f"Total sentences for training: {len(all_sentences)}")

    # leave one core free, but always use at least one worker
    hyperparameters = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=max(1, multiprocessing.cpu_count() - 1),
        sg=sg,
    )
    return Word2Vec(sentences=all_sentences, **hyperparameters)

In [None]:
# w2v_model = train_word2vec(df)

In [None]:
def vectorize_text_columns(df, w2v_model, vector_size, title_col='Title', desc_col='FullDescription'):
    """Turn the title and description columns into averaged word-vector features.

    Each text is tokenized and averaged with ``document_vector``. The result
    has one column per vector component, prefixed '<column>_vec_', with the
    title columns first and the description columns second. It shares the
    index of ``df``.
    """
    def embed(text):
        return document_vector(tokenize_text(text), w2v_model, vector_size)

    feature_frames = []
    for column in (title_col, desc_col):
        vectors = df[column].apply(embed)
        frame = pd.DataFrame(vectors.tolist(), index=df.index)
        feature_frames.append(frame.add_prefix(f'{column}_vec_'))

    return pd.concat(feature_frames, axis=1)

In [None]:
# texts_w2v = vectorize_text_columns(df, w2v_model, vector_size=50)

# Text - Transformer

In [None]:
%pip install transformers -q
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

In [None]:
class RobertaFeatureDataset(Dataset):
    """Dataset that tokenizes one text per item into fixed-length tensors.

    Args:
        texts: Indexable sequence of strings.
        tokenizer: Callable tokenizer. When None, the notebook-level
            ``tokenizer`` variable is used.
        max_len (int | None): Padded/truncated sequence length. When None,
            the notebook-level ``MAX_LEN`` constant is used.
    """

    def __init__(self, texts, tokenizer=None, max_len=None):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 1-D 'input_ids' and 'attention_mask' tensors for item ``idx``."""
        # Resolved lazily so the notebook-level defaults only have to exist
        # once items are actually read, not when the dataset is created.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        max_length = self.max_len if self.max_len is not None else MAX_LEN

        text = self.texts[idx]
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = active_tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch dimension; flatten drops it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

In [None]:
def extract_cls_vectors(model, data_loader, device):
    """
    Run ``model`` over ``data_loader`` and collect the first-token hidden state of each sample.

    The model is put in eval mode and run without gradient tracking. For each
    batch, the 'input_ids' and 'attention_mask' tensors are moved to
    ``device``, and position 0 of ``last_hidden_state`` (labelled [CLS] in the
    progress bar) is kept.

    Returns:
        np.ndarray: The per-batch vectors concatenated along axis 0.
    """
    model.eval()
    collected = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Extraction [CLS] RoBERTa"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            hidden_states = model(ids, attention_mask=mask).last_hidden_state
            first_token = hidden_states[:, 0, :]

            # move to host memory so results from all batches can be stacked with numpy
            collected.append(first_token.cpu().numpy())

    return np.concatenate(collected, axis=0)

In [61]:
MAX_LEN = 128
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


In [62]:
# Prepare data

In [None]:
def extract_roberta_features(df, model, device, title_col='Title', desc_col='FullDescription', batch_size=16, sep_token='</s>'):
    """
    Extract first-token (CLS-position) vectors for the combined title and description text.

    Args:
        df (pd.DataFrame): Frame holding the text columns.
        model: Encoder passed on to ``extract_cls_vectors``.
        device: Torch device the batches are moved to.
        title_col (str): Column with the titles.
        desc_col (str): Column with the descriptions.
        batch_size (int): Batch size of the data loader.
        sep_token (str): Separator inserted between title and description.
            Defaults to '</s>', RoBERTa's separator token. '[SEP]' is a BERT
            token that a RoBERTa tokenizer does not treat as special.

    Returns:
        pd.DataFrame: One row per row of ``df`` (same index), with columns
        'cls_0' ... 'cls_{d-1}'.
    """
    # Combine title and description around the separator token
    texts_concat = df[title_col] + f' {sep_token} ' + df[desc_col]
    texts_list = texts_concat.tolist()

    # Create dataset and dataloader; shuffle=False keeps rows aligned with df.index
    dataset = RobertaFeatureDataset(texts_list)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Extract CLS vectors
    final_vector_array = extract_cls_vectors(model, data_loader, device)

    # Convert to DataFrame
    feature_df = pd.DataFrame(final_vector_array, index=df.index)
    feature_df.columns = [f'cls_{i}' for i in range(final_vector_array.shape[1])]

    return feature_df


In [None]:
# texts_w2v_roberta = extract_roberta_features(df, model, device)

# 5. Split data

In [None]:
# tabular
from sklearn.model_selection import train_test_split
train, val = train_test_split(df, test_size=0.3, random_state=42)
test = df_test.copy()

In [None]:
# word2vec
texts_train = texts_w2v.loc[train.index]
texts_val = texts_w2v.loc[val.index]
texts_train.to_pickle('data/texts_w2v_train.pkl')
texts_val.to_pickle('data/texts_w2v_val.pkl')

In [None]:
# roberta
# Slice the RoBERTa feature frame by the train/validation split and persist both parts.
# NOTE(review): texts_w2v_roberta is only assigned in a commented-out cell above,
# so this cell fails on a fresh kernel unless that cell is re-enabled.
texts_roberta_train = texts_w2v_roberta.loc[train.index]
# NOTE(review): these rows are selected with val.index, yet the variable and
# the output file below are named "test" — confirm which name downstream code expects.
texts_roberta_test  = texts_w2v_roberta.loc[val.index]
texts_roberta_train.to_parquet('data/texts_roberta_train.parquet', index=True)
texts_roberta_test.to_parquet('data/texts_roberta_test.parquet',  index=True)

# Text - Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
import joblib

In [None]:
def prepare_tfidf(df, n_grams):
    """Fit TF-IDF vectorizers on the description and title columns of ``df``.

    Args:
        df (pd.DataFrame): Frame with 'FullDescription' and 'Title' columns.
        n_grams (tuple): ``ngram_range`` passed to both vectorizers.

    Returns:
        tuple: (description and title matrices stacked horizontally,
        fitted description vectorizer, fitted title vectorizer).
    """
    shared_options = dict(stop_words='english', ngram_range=n_grams)
    tfidf_description = TfidfVectorizer(max_features=5000, **shared_options)
    tfidf_title = TfidfVectorizer(max_features=1000, **shared_options)

    matrices = [
        tfidf_description.fit_transform(df["FullDescription"]),
        tfidf_title.fit_transform(df["Title"]),
    ]
    return hstack(matrices), tfidf_description, tfidf_title

In [None]:
def transform_tfidf(df, tfidf_description, tfidf_title):
    """Apply already-fitted vectorizers to ``df``.

    Returns the transformed 'FullDescription' and 'Title' matrices stacked
    horizontally, description columns first.
    """
    matrices = [
        tfidf_description.transform(df["FullDescription"]),
        tfidf_title.transform(df["Title"]),
    ]
    return hstack(matrices)

In [None]:
X_train_text, tfidf_description, tfidf_title = prepare_tfidf(train, n_grams=(1, 1))
print('train completed')
X_val_text = transform_tfidf(val, tfidf_description, tfidf_title)
print('val completed')
X_test_text = transform_tfidf(test, tfidf_description, tfidf_title)
print('tdidf test completed')

# dimensionality reduction
svd = TruncatedSVD(n_components=50, random_state=42)

X_train_text = svd.fit_transform(X_train_text)
print('svd train completed')
X_val_text = svd.transform(X_val_text)
print('svd val completed')
X_test_text = svd.transform(X_test_text)

In [None]:
# saving
ngram = 'uni'  # 'uni', 'bi', 'tri'...
np.save(f"data/X_train_text_{ngram}.npy", X_train_text)
np.save(f"data/X_val_text_{ngram}.npy", X_val_text)
np.save(f"data/X_test_text_{ngram}.npy", X_test_text)

# Text - Combine Tf-idf & Roberta

The 'Title' column is transformed with TF-IDF (unigrams) and the 'FullDescription' column with RoBERTa.

Roberta

In [70]:
# train
train_ds = RobertaFeatureDataset(train['FullDescription'].tolist())
train_dl = DataLoader(train_ds, batch_size=16, shuffle=False)
train_vectors = extract_cls_vectors(model, train_dl, device)
desc_roberta_train = pd.DataFrame(train_vectors, index=train.index)
desc_roberta_train.columns = [f'cls_{i}' for i in range(train_vectors.shape[1])]

Extraction [CLS] RoBERTa: 100%|██████████| 10709/10709 [21:40<00:00,  8.24it/s]


In [71]:
desc_roberta_train.to_parquet("data/desc_roberta_train.parquet", index=False)

In [72]:
# val
val_ds = RobertaFeatureDataset(val['FullDescription'].tolist())
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)
val_vectors = extract_cls_vectors(model, val_dl, device)
desc_roberta_val = pd.DataFrame(val_vectors, index=val.index)
desc_roberta_val.columns = [f'cls_{i}' for i in range(val_vectors.shape[1])]

Extraction [CLS] RoBERTa: 100%|██████████| 4590/4590 [09:14<00:00,  8.28it/s]


In [73]:
desc_roberta_val.to_parquet("data/desc_roberta_val.parquet", index=False)

In [None]:
# test
test_ds = RobertaFeatureDataset(test['FullDescription'].tolist())
test_dl = DataLoader(test_ds, batch_size=16, shuffle=False)
test_vectors = extract_cls_vectors(model, test_dl, device)
desc_roberta_test = pd.DataFrame(test_vectors, index=test.index)
desc_roberta_test.columns = [f'cls_{i}' for i in range(test_vectors.shape[1])]

In [76]:
desc_roberta_test.to_parquet("data/desc_roberta_test.parquet", index=False)

In [5]:
# read description
X_train_desc = pd.read_parquet("data/desc_roberta_train.parquet")
X_val_desc = pd.read_parquet("data/desc_roberta_val.parquet")
X_test_desc = pd.read_parquet("data/desc_roberta_test.parquet")

Tf-idf

In [None]:
tfidf_title = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 1))
X_train_title = tfidf_title.fit_transform(train["Title"])
X_val_title  = tfidf_title.transform(val["Title"])
X_test_title  = tfidf_title.transform(test["Title"])

In [None]:
# Reduce the sparse title TF-IDF matrices to 50 dense components.
# The SVD is fitted on train only and then applied to val and test.
svd = TruncatedSVD(n_components=50, random_state=42)

X_train_title = svd.fit_transform(X_train_title)
print('train svd completed')
X_val_title = svd.transform(X_val_title)
print('val svd completed')
X_test_title = svd.transform(X_test_title)

# Persist the reduced matrices: the next cell reloads exactly these files,
# and nothing else in the notebook writes them.
np.save("data/X_train_title.npy", X_train_title)
np.save("data/X_val_title.npy", X_val_title)
np.save("data/X_test_title.npy", X_test_title)

In [2]:
X_train_title = np.load("data/X_train_title.npy")
X_val_title = np.load("data/X_val_title.npy")
X_test_title = np.load("data/X_test_title.npy")

Combine & Save

In [8]:
train_text_combined = np.concatenate([X_train_desc, X_train_title], axis=1)
val_text_combined = np.concatenate([X_val_desc, X_val_title], axis=1)
test_text_combined = np.concatenate([X_test_desc, X_test_title], axis=1)

train_text_combined_df = pd.DataFrame(train_text_combined)
val_text_combined_df = pd.DataFrame(val_text_combined)
test_text_combined_df = pd.DataFrame(test_text_combined)

train_text_combined_df.to_parquet('data/texts_uni_roberta_train.parquet', index=True)
val_text_combined_df.to_parquet('data/texts_uni_roberta_val.parquet',  index=True)
test_text_combined_df.to_parquet('data/texts_uni_roberta_test.parquet',  index=True)

# 6. One hot encoding

In [None]:
# select most common source in category group
category_to_source = train.groupby('Category')['SourceName'].agg(lambda x: x.mode()[0]).to_dict()
train['SourceName'] = train['Category'].map(category_to_source)
val['SourceName'] = val['Category'].map(category_to_source)
df_test['SourceName'] = df_test['Category'].map(category_to_source)

In [None]:
categorical_cols = ['ContractType', 'ContractTime', 'Category', 'SourceName']

# Encode train first: its dummy columns (after drop_first) define the feature
# layout that val and test must follow.
train_encoded = pd.get_dummies(train, columns=categorical_cols, drop_first=True, dtype=int)
dummy_cols = [col for col in train_encoded.columns if col not in train.columns]


def _encode_like_train(frame):
    """One-hot encode ``frame`` and align its dummy columns with the train layout.

    Encoding each split independently can yield different columns (a category
    absent from a split, or a different dropped baseline). Here all levels are
    encoded and the result is reindexed to train's dummy columns: levels
    missing from the split become all-zero columns, and levels unseen in train
    are dropped.
    """
    encoded = pd.get_dummies(frame, columns=categorical_cols, dtype=int)
    passthrough_cols = [col for col in frame.columns if col not in categorical_cols]
    return encoded.reindex(columns=passthrough_cols + dummy_cols, fill_value=0)


val = _encode_like_train(val)
test = _encode_like_train(df_test)
train = train_encoded

# 7. Target Encoding - mean salary of company instead of company name

In [None]:
# combining companies by two first words
train['CompanyPrefix'] = train['Company'].apply(lambda x: ' '.join(str(x).split()[:2]))
val['CompanyPrefix'] = val['Company'].apply(lambda x: ' '.join(str(x).split()[:2]))
test['CompanyPrefix'] = test['Company'].apply(lambda x: ' '.join(str(x).split()[:2]))

In [None]:
# mean salary by company
mean_company = train.groupby('CompanyPrefix')['SalaryNormalized'].mean()
train['CompanyEncoded'] = train['CompanyPrefix'].map(mean_company)
val['CompanyEncoded'] = val['CompanyPrefix'].map(mean_company)
test['CompanyEncoded'] = test['CompanyPrefix'].map(mean_company)

# filling not existing companies in test with global mean
global_mean = train['SalaryNormalized'].mean()
val['CompanyEncoded'] = val['CompanyEncoded'].fillna(global_mean)
test['CompanyEncoded'] = test['CompanyEncoded'].fillna(global_mean)

train.drop(columns=['Company', 'CompanyPrefix'], inplace=True)
val.drop(columns=['Company', 'CompanyPrefix'], inplace=True)
test.drop(columns=['Company', 'CompanyPrefix'], inplace=True)

In [None]:
mean_company.head()

# 8. Tabular data saving

In [None]:
train_tab = train.drop(columns=['Title', 'FullDescription'])
val_tab = val.drop(columns=['Title', 'FullDescription'])
test_tab = test.drop(columns=['Title', 'FullDescription'])

In [None]:
train_tab.to_csv('data/train_preprocessed.csv', index=False)
val_tab.to_csv('data/val_preprocessed.csv', index=False)
test_tab.to_csv('data/test_preprocessed.csv', index=True)

In [None]:
train_tab.head(3)