# Naive Model:

imports:

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, confusion_matrix, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

from keras.layers import Flatten, Dense, LSTM, Input, Dropout, BatchNormalization, Embedding, Lambda, dot
from keras.models import Model, Sequential
from math import sqrt
from gensim.models import Word2Vec, KeyedVectors

from sklearn.decomposition import PCA

from time import time

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

import itertools
import datetime

from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint

import re

# Mount Google Drive (Colab only) at /content/drive/ so the CSV files stored
# there can be read by the loading cell.
from google.colab import drive
drive.mount('/content/drive/')

# Fixed seed intended for reproducibility. It only has an effect where it is
# passed explicitly (e.g. as `random_state`) to a stochastic call.
seed = 0

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


loading the data:


In [0]:
# Load the three CSV files into pandas DataFrames.
# The paths are anchored at the absolute Drive mount point ('/content/drive/')
# instead of './drive/...', so the reads do not depend on the kernel's current
# working directory.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/project_3/'
CSV_ENCODING = "ISO-8859-1"

train_df = pd.read_csv(DATA_DIR + 'train.csv', encoding=CSV_ENCODING)
p_description = pd.read_csv(DATA_DIR + 'product_descriptions.csv', encoding=CSV_ENCODING)
test_df = pd.read_csv(DATA_DIR + 'test2.csv', encoding=CSV_ENCODING)

In [0]:
# Attach each product's description to the train and test rows. A left join on
# 'product_uid' keeps every original row even when no description matches.
train_df = pd.merge(train_df, p_description, on='product_uid', how='left')
test_df = pd.merge(test_df, p_description, on='product_uid', how='left')

Preprocessing the data:


In [0]:
def naive_clean(mess, stop_words=None):
    """
    Strip punctuation and stop words from a string.

    Steps:
    1. Drop every character that appears in ``string.punctuation``.
    2. Split on whitespace and drop words whose lower-cased form is a stop word.
    3. Join the surviving words (original casing kept) with single spaces.

    Parameters
    ----------
    mess : str
        Text to clean.
    stop_words : iterable of str, optional
        Lower-case words to remove. When omitted, ``stopwords.words('english')``
        is used.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once: a set gives O(1) membership tests and avoids
    # re-reading the stop-word list for every word.
    stop_lookup = set(stop_words)

    # Remove punctuation character by character, then rebuild the string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    kept = [word for word in nopunc.split() if word.lower() not in stop_lookup]
    # Join on a space so word boundaries survive (an empty separator would fuse
    # all remaining words into one token).
    return ' '.join(kept)
  
  
# Shared stemmer instance used by stem_string.
ps = PorterStemmer()

def stem_string(words):
  """Lower-case `words`, split it on whitespace and stem each token with `ps`.

  Returns the stems joined by single spaces, so word boundaries are preserved
  (joining with an empty separator would fuse all stems into one token).
  An empty or whitespace-only input yields an empty string.
  """
  return " ".join(ps.stem(word) for word in words.lower().split())

def count_words(str_words):
  """Return how many distinct items two token collections have in common.

  `str_words` is unpacked into exactly two iterables (here: the query tokens
  and the description tokens of one row). Duplicates count once.
  """
  query_tokens, description_tokens = str_words
  shared = set(query_tokens) & set(description_tokens)
  return len(shared)

In [0]:
# nltk is imported here rather than in the imports cell at the top.
# 'punkt' is downloaded before nltk.word_tokenize is used in the cells below
# (presumably it holds the tokenizer data that call needs — confirm).
# NOTE(review): naive_clean() calls stopwords.words('english'), but no stopwords
# corpus is downloaded in the cells shown here — confirm it is available.
import nltk
nltk.download('punkt')

In [0]:
# Tokenise the lower-cased search term and product description of every
# training row, then count the distinct tokens the two share.
for raw_col, token_col in (('search_term', 'queries'),
                           ('product_description', 'descriptions')):
    train_df[token_col] = train_df[raw_col].apply(
        lambda text: nltk.word_tokenize(str(text).lower()))

train_df['intersec'] = train_df[['queries', 'descriptions']].apply(count_words, axis=1)

In [88]:
# Preview the first ten tokenised descriptions.
train_df['descriptions'].head(10)

0    [not, only, do, angles, make, joints, stronger...
1    [not, only, do, angles, make, joints, stronger...
2    [behr, premium, textured, deckover, is, an, in...
3    [update, your, bathroom, with, the, delta, ver...
4    [update, your, bathroom, with, the, delta, ver...
5    [achieving, delicious, results, is, almost, ef...
6    [achieving, delicious, results, is, almost, ef...
7    [achieving, delicious, results, is, almost, ef...
8    [the, quantum, adjustable, 2-light, led, black...
9    [get, the, house, of, fara, 3/4, in, ., x, 3, ...
Name: descriptions, dtype: object

In [0]:
# Recomputes the query/description overlap count for the training frame.
# The same assignment already appears at the end of the tokenisation cell above,
# so this cell is redundant on a top-to-bottom run.
train_df['intersec'] = train_df[['queries', 'descriptions']].apply(count_words, axis = 1)

In [0]:
# Same feature pipeline as for the training frame, applied to test_df:
# tokenise both text columns, then count the distinct shared tokens.
for raw_col, token_col in (('search_term', 'queries'),
                           ('product_description', 'descriptions')):
    test_df[token_col] = test_df[raw_col].apply(
        lambda text: nltk.word_tokenize(str(text).lower()))

test_df['intersec'] = test_df[['queries', 'descriptions']].apply(count_words, axis=1)

In [0]:
# Hold out part of the labelled data for validation. The split is seeded with
# the notebook-level `seed` so the reported metrics can be reproduced.
x_train, x_val, y_train, y_val = train_test_split(
    train_df[['intersec']], train_df['relevance'], random_state=seed)

# NOTE(review): this "test" set is drawn from train_df, not test_df. With
# test_size=0.0 it is presumably the whole training frame (x_val2 / y_val2 left
# empty), so it overlaps x_train and x_val. Confirm whether test_df carries a
# 'relevance' column and should be scored here instead.
x_test, x_val2, y_test, y_val2 = train_test_split(
    train_df[['intersec']], train_df['relevance'], test_size=0.0, random_state=seed)

Using a linear regression model:

In [97]:
from sklearn import linear_model

# Fit an ordinary least-squares regressor on the single overlap-count feature.
clf = linear_model.LinearRegression().fit(x_train, y_train)

# Score the held-out validation split.
preds = clf.predict(x_val)
rms = sqrt(mean_squared_error(y_val, preds))
print('validation:')
print('RMSE:', rms, sep='')
print('MAE:', mean_absolute_error(preds, y_val), sep='')


validation:
RMSE:0.5264495372956901
MAE:0.4314584737856713


In [98]:
# Score the currently fitted `clf` on the rows it was trained on.
preds = clf.predict(x_train)
rms = sqrt(mean_squared_error(y_train, preds))
print('train:',
      'RMSE:' + str(rms),
      'MAE:' + str(mean_absolute_error(preds, y_train)),
      sep='\n')


train:
RMSE:0.5270511609319343
MAE:0.43357293977557415


In [99]:
# Score the currently fitted `clf` on x_test / y_test.
preds = clf.predict(x_test)
rms = sqrt(mean_squared_error(y_test, preds))
print('test:',
      'RMSE:' + str(rms),
      'MAE:' + str(mean_absolute_error(preds, y_test)),
      sep='\n')


test:
RMSE:0.5269008173937199
MAE:0.4330443161410942


Using a Lasso model:

In [100]:
from sklearn import linear_model

# Fit an L1-regularised linear regressor (alpha=0.1) on the overlap-count feature.
clf = linear_model.Lasso(alpha=0.1).fit(x_train, y_train)

# Score the held-out validation split.
preds = clf.predict(x_val)
rms = sqrt(mean_squared_error(y_val, preds))
print('validation:')
print('RMSE:', rms, sep='')
print('MAE:', mean_absolute_error(preds, y_val), sep='')


validation:
RMSE:0.5331304457415034
MAE:0.4360701320755827


In [101]:
# Score the currently fitted `clf` on the rows it was trained on.
preds = clf.predict(x_train)
rms = sqrt(mean_squared_error(y_train, preds))
print('train:',
      'RMSE:' + str(rms),
      'MAE:' + str(mean_absolute_error(preds, y_train)),
      sep='\n')


train:
RMSE:0.534263551123226
MAE:0.4385246942278386


In [102]:
# Score the currently fitted `clf` on x_test / y_test.
preds = clf.predict(x_test)
rms = sqrt(mean_squared_error(y_test, preds))
print('test:',
      'RMSE:' + str(rms),
      'MAE:' + str(mean_absolute_error(preds, y_test)),
      sep='\n')

test:
RMSE:0.5339804963721828
MAE:0.43791104540483616
