# Linear regression

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
import re
import scipy

In [2]:
%%capture 
# Run the helper notebook, with any output it produces suppressed by %%capture.
# Presumably this is where to_path (used below to load the CSVs) is defined - confirm.
%run nihil_ml_utils.ipynb

In [4]:
# Column guide (from the original author's notes):
#   FullDescription    - large free-text field
#   LocationNormalized - categorical: town or some other place
#   ContractTime       - categorical: type of vacancy
#   SalaryNormalized   - noted as NaNs (presumably in the test file - confirm)
# Both CSVs are loaded the same way: resolve the path with to_path, then
# parse with pandas. Train is read first, then test.
data_train, data_test = (
    pd.read_csv(to_path(file_name))
    for file_name in ('salary-train.csv', 'salary-test-mini.csv')
)

In [6]:
# FullDescription conversion:
#   replace every non-alphanumeric character with a space and convert the
#   text to lowercase, then use TfidfVectorizer to turn it into a sparse
#   matrix with one row per vacancy and a large number of feature columns

In [7]:
def process_text(text):
    """Lowercase ``text`` and blank out everything that is not an ASCII letter or digit.

    Each character outside ``a-z``, ``A-Z`` and ``0-9`` (punctuation,
    whitespace, underscores, non-ASCII letters, ...) is replaced by a single
    space, one for one, so the length of the string is preserved.

    Args:
        text: the string to normalise.

    Returns:
        The normalised, lowercased string.
    """
    lowered = text.lower()
    non_alphanumeric = re.compile('[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', lowered)

In [9]:
# Normalise the free-text column of both frames (train first, then test)
# before it is vectorised.
for frame in (data_train, data_test):
    frame['FullDescription'] = frame['FullDescription'].map(process_text)

# min_df=5: ignore terms that occur in fewer than 5 documents.
tfidf_enc = TfidfVectorizer(min_df=5)

train_descriptions = data_train['FullDescription']
test_descriptions = data_test['FullDescription']

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are projected with the already-fitted encoder.
X_train_tfidf = tfidf_enc.fit_transform(train_descriptions)
X_test_tfidf = tfidf_enc.transform(test_descriptions)

In [10]:
# LocationNormalized, ContractTime conversion:
#   fill NaNs with the string 'nan' so a missing value counts as its own category
#   use DictVectorizer to one-hot encode these two columns together,
#   i.e. convert the categorical features (each row passed as one dict) into
#   a set of binary features

In [11]:
def fill_na(df):
    """Replace missing values in the categorical columns with the string 'nan'.

    The columns 'LocationNormalized' and 'ContractTime' of ``df`` are
    overwritten in place, so a missing value becomes an ordinary category
    label instead of a float NaN.

    Args:
        df: DataFrame containing both columns; it is modified in place.

    Returns:
        None.
    """
    # Assign the filled column back to the frame rather than calling
    # df[col].fillna(..., inplace=True): that chained form acts on a
    # temporary Series, is deprecated in pandas 2.x, and leaves df unchanged
    # when Copy-on-Write is active.
    for column in ('LocationNormalized', 'ContractTime'):
        df[column] = df[column].fillna('nan')

In [12]:
# Apply the same missing-value handling to BOTH frames. If the test frame is
# skipped, a missing location / contract time there stays a float NaN rather
# than the string 'nan', so it is not encoded the way training rows are.
fill_na(data_train)
fill_na(data_test)

dict_enc = DictVectorizer()
# Each row is passed as a {column: value} dict. The one-hot vocabulary is
# learned from the training rows only and reused unchanged for the test rows.
X_train_categ = dict_enc.fit_transform(
    data_train[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ = dict_enc.transform(
    data_test[['LocationNormalized', 'ContractTime']].to_dict('records'))

In [13]:
# Stack the sparse feature blocks side by side: TF-IDF columns first, then
# the one-hot categorical columns. Train and test use the same block order
# so their columns line up.
X_train = scipy.sparse.hstack((X_train_tfidf, X_train_categ))
X_test = scipy.sparse.hstack((X_test_tfidf, X_test_categ))

In [14]:
# Get y: the regression target is the SalaryNormalized column of the
# training frame, the same frame the X_train feature rows were built from.
y_train = data_train['SalaryNormalized']

In [15]:
# Fit a Ridge (L2-regularised) linear regression on the stacked sparse
# features. alpha=1 is the regularisation strength; random_state is pinned
# so repeated runs use the same seed.
reg = Ridge(random_state=241, alpha=1)
# Kept as the cell's last expression so the fitted estimator is displayed.
reg.fit(X=X_train, y=y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [17]:
# Predict the test set with the fitted Ridge model.
# NOTE: despite its name, y_test holds the model's predictions, not ground truth.
y_test = reg.predict(X_test)
# Show the first two predictions with two decimals
# (this indexing requires at least two rows in the test set).
print(f'{y_test[0]:.2f} {y_test[1]:.2f}')

56555.62 37188.32
