In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [2]:
# Load the cleaned product listings from CSV.
# An explicit '\n' line terminator is passed to the parser.
df = pd.read_csv(
    filepath_or_buffer='cleaned_products.csv',
    lineterminator='\n',
)

In [3]:
# Split the frame into model inputs and target:
#   X - the three text columns used as features
#   y - the listing price to be predicted
X = df.loc[:, ['product_name', 'product_description', 'location']]
y = df.loc[:, 'price']

In [4]:
# Build the feature extractor.
# Each text column is routed to its own TfidfVectorizer, which tokenises the
# raw strings itself, so no manual text preprocessing is needed beforehand.
transformer = ColumnTransformer(
    transformers=[
        ('vect1', TfidfVectorizer(), 'product_name'),
        ('vect2', TfidfVectorizer(), 'product_description'),
        ('vect3', TfidfVectorizer(), 'location'),
    ],
    # Any column not listed above is forwarded to the next step unchanged.
    remainder='passthrough',
)

In [5]:
# Chain the column-wise TF-IDF step with a linear regression on top of it.
# The step names ("colt", "lr") are the prefixes used to address nested
# hyperparameters, e.g. 'colt__vect1__min_df'.
pipeline = Pipeline(
    steps=[("colt", transformer), ("lr", LinearRegression())],
)

In [6]:
# Hyperparameter grid for the search.
# Keys follow <pipeline step>__<transformer name>__<vectorizer argument>.
# Only the product-name (vect1) and product-description (vect2) vectorizers
# are tuned; 2 * 3 * 3 * 3 = 54 combinations in total.
parameters = dict(
    colt__vect1__ngram_range=((1, 1), (1, 2)),
    colt__vect1__min_df=(0.005, 0.008, 0.01),
    colt__vect2__ngram_range=((1, 1), (1, 2), (1, 3)),
    colt__vect2__min_df=(0.005, 0.008, 0.01),
)

In [7]:
# Configure the exhaustive grid search over `parameters`.
# Nothing is fitted here: the cross-validation itself runs when .fit() is called.
# n_jobs=-1 uses all available cores; verbose=1 prints a progress summary.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

In [8]:
# Hold out 30% of the rows; the remaining 70% is what the grid search trains on.
# random_state is fixed so the split, and every score derived from it, is
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [9]:
# Split the 30% hold-out in half: one half stays the test set, the other
# becomes the validation set (15% of the data each).
# NOTE: this cell overwrites X_test / y_test with a subset of themselves, so
# re-running it on its own halves the test set again - re-run the previous
# split cell first. random_state is fixed so the split is reproducible.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

In [12]:
# Run the cross-validated grid search on the training split.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [13]:
# Predict a price for every test-set item from its three text features.
# A linear regression's output is unbounded, so negative "prices" can appear.
grid_search.predict(X=X_test)

array([-7.65857624e+03,  4.20500291e+00,  1.13613804e+04, ...,
       -2.07980737e+04,  1.10462126e+04, -1.21392095e+04])

In [16]:
# Root-mean-squared error on the test split (same units as `price`).
np.sqrt(
    metrics.mean_squared_error(
        y_true=y_test,
        y_pred=grid_search.predict(X=X_test),
    )
)

305805.6785127249