In [1]:
import os
from pathlib import Path
import re

import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [2]:
# The data directory is a sibling of the notebook's working directory.
data_path = Path.cwd().parent / "data"

# Training set and the small test set, both read from CSV files in data_path.
train_csv = data_path / "salary-train.csv"
test_csv = data_path / "salary-test-mini.csv"
df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
def save_ans(*ans, delimetr=" "):
    """Write the given answers to ``res.txt`` as a single delimited line.

    Each positional argument is converted with ``str`` and the results are
    joined with ``delimetr``. The file lives in the parent directory of the
    module-level ``data_path`` and is overwritten on every call.

    Parameters
    ----------
    *ans
        Values to store; anything accepted by ``str``.
    delimetr : str, keyword-only
        Separator placed between the values (default: a single space).
        The spelling is kept as-is because it is part of the call interface.
    """
    target = data_path.parent / "res.txt"
    with target.open("w") as out:
        text = delimetr.join(str(item) for item in ans)
        out.write(text)

In [4]:
# Preview the first rows of the training frame as loaded from the CSV.
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [5]:
# Preview the test frame; in the rows displayed, SalaryNormalized is empty.
df_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [6]:
# Replace missing values in the categorical columns with the literal string
# 'nan'. These columns are later turned into records for DictVectorizer, so
# every row should carry a string category.
#
# The filled columns are assigned back to the frame instead of calling
# ``frame[col].fillna(..., inplace=True)``: an in-place method on a column
# selection is chained assignment, which pandas warns about and which leaves
# the frame unchanged when Copy-on-Write is active.
categorical_cols = ["LocationNormalized", "ContractTime"]
df[categorical_cols] = df[categorical_cols].fillna('nan')
df_test[categorical_cols] = df_test[categorical_cols].fillna('nan')

In [7]:
def clean_text(text_column):
    """Normalize a text Series for vectorization.

    Lowercases every string, then replaces each character that is not an
    ASCII letter or digit with a single space.

    Parameters
    ----------
    text_column : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned text; the input is not modified.
    """
    return (
        text_column
        .str.lower()
        .replace(to_replace=r"[^A-Za-z0-9]", value=" ", regex=True)
    )


# Write the result back by column name. The text is read from
# "FullDescription", so writing to position 0 would overwrite the wrong column
# if the column order of the CSV ever changed.
df["FullDescription"] = clean_text(df["FullDescription"])
df_test["FullDescription"] = clean_text(df_test["FullDescription"])

In [8]:
# TF-IDF features of the job descriptions. The vocabulary and IDF weights are
# learned on the training texts only and then reused for the test texts.
# min_df=5 is scikit-learn's minimum document frequency threshold for a term.
text_col = "FullDescription"
tfidf = TfidfVectorizer(min_df=5)
X_text = tfidf.fit_transform(df[text_col])
X_text_test = tfidf.transform(df_test[text_col])

In [9]:
# Encode the categorical columns: each row becomes a {column: value} record,
# and DictVectorizer is fitted on the training records and reused for test.
cat_features = ["LocationNormalized", "ContractTime"]
train_records = df[cat_features].to_dict("records")
test_records = df_test[cat_features].to_dict("records")

enc = DictVectorizer()
X_cat = enc.fit_transform(train_records)
X_cat_test = enc.transform(test_records)

In [10]:
# Final design matrices: text features first, categorical features second.
# Train and test use the same column order so the fitted weights line up.
X = hstack((X_text, X_cat))
X_test = hstack((X_text_test, X_cat_test))

In [11]:
# Regression target: the normalized salary. It is selected by column name
# rather than by position, so it stays correct if the column order changes.
y = df["SalaryNormalized"].to_numpy()

In [12]:
# Ridge regression on the stacked sparse features. The hyperparameters are
# collected in one place: regularization strength 1, no intercept term, and
# the 'lsqr' solver.
ridge_params = {"alpha": 1, "fit_intercept": False, "solver": 'lsqr'}
reg = Ridge(**ridge_params)
reg.fit(X, y)

Ridge(alpha=1, fit_intercept=False, solver='lsqr')

In [13]:
# Predict salaries for the test rows with the fitted model.
preds = reg.predict(X_test)
# Bare expression so the notebook displays the prediction array.
preds

array([56098.46761723, 37153.23926334])

In [14]:
# Store the predictions, rounded to two decimals, as the answer file.
rounded_preds = np.round(preds, decimals=2)
save_ans(*rounded_preds)

In [15]:
# Show the training frame again: FullDescription now holds the lowercased
# text with non-alphanumeric characters replaced by spaces.
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355
