In [1]:
import pandas as pd

# Read the training split into a DataFrame and preview its first rows.
train_csv_path = 'salary-train.csv'
data_train = pd.read_csv(train_csv_path)
data_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [2]:
# Normalise the text columns: lower-case all three, and in the free-text
# description replace every character that is not a letter or digit with a space.
description_lower = data_train['FullDescription'].str.lower()
data_train['FullDescription'] = description_lower.replace('[^a-zA-Z0-9]', ' ', regex=True)
for column in ('LocationNormalized', 'ContractTime'):
    data_train[column] = data_train[column].str.lower()

data_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [3]:
# TfidfVectorizer - Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser for the description text; min_df=5 ignores terms that
# appear in fewer than five documents.
vectorizer = TfidfVectorizer(
    min_df=5,
    encoding='latin-1',
)

In [4]:
# Replace missing values (NaN) in LocationNormalized and ContractTime with the
# string 'nan' so that every row carries a categorical value for both columns.
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column pulled out of it: pandas treats that
# as chained assignment and, with Copy-on-Write enabled, it would leave
# `data_train` unchanged.
data_train['LocationNormalized'] = data_train['LocationNormalized'].fillna('nan')
data_train['ContractTime'] = data_train['ContractTime'].fillna('nan')

In [5]:
# one-hot encoding transforms categorical (text) values into binary indicator columns
from sklearn.feature_extraction import DictVectorizer

# Fit both feature extractors on the training data: TF-IDF on the description
# text, and a DictVectorizer (one-hot encoding) on the two categorical columns.
encoder = DictVectorizer()
categorical_records_train = data_train[['LocationNormalized', 'ContractTime']].to_dict('records')

X_train_description = vectorizer.fit_transform(data_train['FullDescription'])
X_train_categ = encoder.fit_transform(categorical_records_train)

In [6]:
from scipy.sparse import hstack

# Put the TF-IDF columns and the one-hot columns side by side in a single sparse matrix.
train_feature_blocks = [X_train_description, X_train_categ]
train_set = hstack(train_feature_blocks)

In [7]:
from sklearn.linear_model import Ridge

# Ridge regression on the stacked features, with SalaryNormalized as the target.
# The fit call stays as the last expression so the cell still shows the fitted estimator.
y_train = data_train['SalaryNormalized']
model = Ridge(random_state=241, alpha=1)
model.fit(train_set, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [10]:
# Load the small test set and preprocess it the same way as the training data.
data_test = pd.read_csv('salary-test-mini.csv')

# Description: lower-case, then replace every non-alphanumeric character with a space.
data_test['FullDescription'] = (
    data_test['FullDescription'].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)
)

# Categorical columns: lower-case, then label missing values with the string 'nan'.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on a column pulled out of the frame, which pandas
# treats as chained assignment and which leaves the frame unchanged under
# Copy-on-Write.
for column in ('LocationNormalized', 'ContractTime'):
    data_test[column] = data_test[column].str.lower().fillna('nan')

In [15]:
# Apply the already-fitted extractors (transform only, no refit) to the test rows,
# then stack the two feature blocks in the same order as for training.
categorical_records_test = data_test[['LocationNormalized', 'ContractTime']].to_dict('records')
X_test_description = vectorizer.transform(data_test['FullDescription'])
X_test_categ = encoder.transform(categorical_records_test)

test_feature_blocks = [X_test_description, X_test_categ]
test_set = hstack(test_feature_blocks)

In [19]:
# Predict with the fitted model on the stacked test features; the bare name on
# the last line makes the notebook display the resulting array.
res = model.predict(test_set)
res

array([ 56555.61500155,  37188.32442618])

In [20]:
# Write the first two predictions, separated by a single space, to q1.txt.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('q1.txt', 'w') as f:
    f.write(str(res[0]) + ' ' + str(res[1]))