In [3]:
import pandas as pd
import re
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

In [4]:
# Load the training set and the small test set, then give both frames the
# same four column labels.
column_names = ['FullDescription', 'LocationNormalized', 'ContractTime', 'SalaryNormalized']
train_data = pd.read_csv('salary-train.csv')
train_data.columns = column_names
test_data = pd.read_csv('salary-test-mini.csv')
test_data.columns = column_names

In [5]:
# Separate the three input columns from the salary target.
feature_columns = ['FullDescription', 'LocationNormalized', 'ContractTime']
X_train = train_data[feature_columns]
X_test = test_data[feature_columns]
Y_train = train_data['SalaryNormalized']

In [6]:
# Preview the two categorical columns of the raw training frame.
train_data[['LocationNormalized', 'ContractTime']]

Unnamed: 0,LocationNormalized,ContractTime
0,London,permanent
1,London,permanent
2,South East London,permanent
3,Dereham,permanent
4,Sutton Coldfield,
...,...,...
59995,Whitley Bay,contract
59996,Macclesfield,permanent
59997,Baldock,
59998,The City,permanent


In [7]:
# Normalise the training descriptions: lower-case the text, then turn every
# character that is not a letter or a digit into a single space.
train_text = (
    X_train['FullDescription']
    .str.lower()
    .replace(to_replace='[^a-zA-Z0-9]', value=' ', regex=True)
)

In [8]:
# Normalise the test descriptions: lower-case the text, then turn every
# character that is not a letter or a digit into a single space.
test_text = (
    X_test['FullDescription']
    .str.lower()
    .replace(to_replace='[^a-zA-Z0-9]', value=' ', regex=True)
)

In [9]:
# Pull the two categorical columns out of the test features.
test_categ_columns = ['LocationNormalized', 'ContractTime']
test_categ = X_test.loc[:, test_categ_columns]

In [10]:
# Pull the two categorical columns out of the training features.
train_categ_columns = ['LocationNormalized', 'ContractTime']
train_categ = X_train.loc[:, train_categ_columns]

In [11]:
# Replace missing categorical values with the literal string 'nan' so every
# record carries a string value for each column before it is passed to
# DictVectorizer.
# The filled frame is rebound instead of using inplace=True: the inputs are
# selections taken from X_train / X_test, and rebinding gives the same result
# without depending on whether pandas treats them as views or copies. It also
# keeps the cell safe to re-run.
train_categ = train_categ.fillna('nan')
test_categ = test_categ.fillna('nan')

In [12]:
# Display the categorical training frame after the fill step above.
train_categ

Unnamed: 0,LocationNormalized,ContractTime
0,London,permanent
1,London,permanent
2,South East London,permanent
3,Dereham,permanent
4,Sutton Coldfield,
...,...,...
59995,Whitley Bay,contract
59996,Macclesfield,permanent
59997,Baldock,
59998,The City,permanent


In [13]:
# TF-IDF features for the job descriptions. The vocabulary and IDF weights
# are learned from the training text only; the test text is projected onto
# that same vocabulary. min_df=5 is passed to TfidfVectorizer as its minimum
# document-frequency setting.
vectorizer = TfidfVectorizer(min_df=5)
X_train_vec = vectorizer.fit_transform(raw_documents=train_text)
X_test_vec = vectorizer.transform(raw_documents=test_text)

In [18]:
# Display the summary repr of the sparse TF-IDF matrix built from the training text.
X_train_vec

<60000x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [16]:
# Encode the categorical columns with DictVectorizer. Each row becomes a
# {column: value} dict; the encoder is fitted on the training records and
# then reused unchanged for the test records.
enc = DictVectorizer()
train_records = train_categ.to_dict(orient='records')
test_records = test_categ.to_dict(orient='records')
X_train_categ = enc.fit_transform(train_records)
X_test_categ = enc.transform(test_records)

In [17]:
# Display the summary repr of the sparse matrix of encoded categorical training features.
X_train_categ

<60000x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 120000 stored elements in Compressed Sparse Row format>

In [19]:
# Build the final design matrices by placing the TF-IDF columns and the
# encoded categorical columns side by side (text features first).
X_for_train = hstack((X_train_vec, X_train_categ))
X_for_test = hstack((X_test_vec, X_test_categ))

In [21]:
# Check the (rows, columns) shape of the combined training matrix.
X_for_train.shape

(60000, 24627)

In [22]:
# Ridge regression model; alpha and random_state are set explicitly.
regressor = Ridge(
    alpha=1,
    random_state=241,
)

In [23]:
# Fit the model on the combined training matrix against the salary target.
# fit() returns the estimator, so the cell displays its repr.
regressor.fit(X=X_for_train, y=Y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=241, solver='auto', tol=0.001)

In [24]:
import numpy

# Predict salaries for the test rows and print each one on its own line,
# rounded to two decimal places.
predictions = regressor.predict(X_for_test)
for salary in predictions:
    print(numpy.round(salary, 2))

56565.33
37140.63
