# Introduction

### Imports

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from collections import Counter


import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV, RidgeCV, Lasso, Ridge, ElasticNet

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

%matplotlib inline

# Options for pandas: raise the display limits so wide frames and long text cells are not truncated
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_colwidth', 200)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Modeling" data-toc-modified-id="Modeling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Modeling</a></span></li></ul></div>

In [9]:
def load_listings(path):
    """Read one scraped listings JSON file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the JSON file.

    Returns
    -------
    pd.DataFrame
        Frame built from the decoded JSON content.
    """
    # JSON text is UTF-8 by specification; being explicit keeps the platform's
    # default encoding from garbling characters such as the curly quote in 5’8.
    with open(path, encoding='utf-8') as datafile:
        return pd.DataFrame(json.load(datafile))


hi_df = load_listings('data/honolulu_craiglist_surfboards.json')
la_df = load_listings('data/los_angeles_craiglist_surfboards.json')
sd_df = load_listings('data/san_diego_craiglist_surfboards.json')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is its replacement.
# Each frame keeps its own row labels, so index values repeat across the three cities in `df`.
df = pd.concat([hi_df, la_df, sd_df])

In [10]:
df.head()

Unnamed: 0,condition,description,location,manufacturer,model_name,price,size_dimensions,title,url
0,,"\n \nVery good condition wade tokoro shortboard surfboard, fcsii fin boxes, no dings, very little use\n\n5’8 x 19 x 2.25\n\nThank you\n\n\n\nSurfboards, surf board, boards, board, fins, fcs...",Honolulu,,,250.0,,Tokoro surfboard,https://honolulu.craigslist.org/oah/spo/d/honolulu-tokoro-surfboard/7005760652.html
1,excellent,"\n \n6'2""x20.5x2 prototype board from Eric Arakawa. made for an industry pro. Biaxial fiberglass. FCS boxes, thruster. Waist of the board is farther towards the tail making a fun board that...",Honolulu,Eric Arakawa,Prototype,180.0,,Arakawa Prototype Surfboard,https://honolulu.craigslist.org/oah/spo/d/wheeler-army-airfield-arakawa-prototype/7011893010.html
2,good,"\n \nAndreini surfboard. 6'6"". Large volume. Board had delam on top, it was repaired and sealed, but not filled/leveled with q-cell. Futures fin boxes in a tri/thruster. Cheap plastic fins ...",Honolulu,Andreini,,80.0,"6'6""x20""x2""",Andreini Surfboard,https://honolulu.craigslist.org/oah/spo/d/wheeler-army-airfield-andreini-surfboard/7011895192.html
3,good,\n \n5’11 x 18.75 x 2.37 - 27.46L\n\nKrank model by J. Kashiwai. Squash tail w/ future box thruster setup.\n\nBoard is in decent to good condition. No open dings at all and still has a lot ...,Honolulu,Jason Kashiwai,Krank,250.0,5’11 x 18.75 x 2.37 = 27.46L,5’11 J. Kashiwai “Krank” Surfboard w/ Fins,https://honolulu.craigslist.org/oah/spo/d/honolulu-511-kashiwai-krank-surfboard/7011909978.html
4,,\n \n6' Egg Shortboard Surfboard with fins\n\nThe listed dimensions are 6’ x 19 3/4 x 2 3/4\n\n,Honolulu,,,220.0,6',6' Egg Shortboard Surfboard,https://honolulu.craigslist.org/oah/spo/d/honolulu-6-egg-shortboard-surfboard/7011911532.html


In [11]:
len(df)

2781

In [12]:
# Percentage of missing values per column (the mean of a boolean mask is the share of True values)
df.isnull().mean() * 100

condition          44.516361
description         0.000000
location            0.000000
manufacturer       51.564186
model_name         65.336210
price               0.000000
size_dimensions    61.165049
title               0.000000
url                 0.000000
dtype: float64

In [25]:
def extract_bigram_condition(text):
    """Collect the words surrounding each condition keyword in a tokenised description.

    For every token that equals one of the condition keywords, the token directly
    before it and the token directly after it are joined with a single space.
    A keyword at the start of the text only has a following word, a keyword at the
    end only has a preceding word, and a lone keyword yields an empty string.

    Parameters
    ----------
    text : list of str
        Tokens of one description, in order.

    Returns
    -------
    list of str
        One context string per keyword occurrence, in order of appearance.
    """
    # NOTE(review): 'like new' contains a space, so it can only match if the
    # tokeniser ever emits it as a single token — verify against the caller.
    condition_words = {'new', 'excellent', 'good', 'like new', 'fair'}

    contexts = []
    for i, word in enumerate(text):
        if word in condition_words:
            # Slice the neighbours separately: a plain text[i-1:...] would use -1 as
            # the start when i == 0, which Python reads as "last element".
            neighbours = list(text[max(i - 1, 0):i]) + list(text[i + 1:i + 2])
            contexts.append(' '.join(neighbours))
    return contexts

# Flags listings whose title names a surfboard, to filter out unrelated results
# that mention "surfboard" only in the description.
def is_surfboard(title):
    """Return 1 if any word of the title is a surfboard keyword, otherwise 0.

    The title has the characters matched by the regex replaced with spaces, is
    lowercased and split on single spaces; each resulting word is compared
    against a fixed keyword list (exact match, not substring).
    """
    surfboard_keywords = {
        'surfboard', 'surf', 'board', 'sup', 'fish', 'stub',
        'shortboard', 'short', 'longboard', 'long', 'foamie', 'foam',
        'softtop', 'paddleboard', 'surfboards',
    }
    # NOTE: inside the character class `*-:` is a range (from '*' to ':'), so
    # besides the listed punctuation it also blanks '+', '-' and the digits 0-9.
    normalised_title = re.sub(r'[,?.!@#$&*-:\'\"\/]', ' ', title).lower()
    has_keyword = any(token in surfboard_keywords for token in normalised_title.split(' '))
    return 1 if has_keyword else 0


def clean_description(text):
    """Turn a raw listing description into a list of lowercase words.

    Newlines (real ones and the literal two-character sequence backslash-n) and
    the punctuation , ? . ! @ # $ & * : / are replaced by spaces; the result is
    stripped, lowercased and split on single spaces, dropping empty pieces.
    Other characters (hyphens, quotes, parentheses, tabs inside the text) are kept.
    """
    separators = re.compile(r'\\n|\n|[,?.!@#$&*:\/]')
    normalised = separators.sub(' ', text).strip().lower()
    # Consecutive spaces produce empty strings when splitting on ' '; filter them out.
    return list(filter(None, normalised.split(' ')))

def ngram(text, n_gram):
    """Return the distinct n-grams of a token list, in order of first appearance.

    Parameters
    ----------
    text : list of str
        Tokens of one description, in order.
    n_gram : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of str
        Each n-gram as a space-joined string, without duplicates. Empty when the
        text is shorter than ``n_gram`` or ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        return []

    seen = set()  # O(1) duplicate check instead of scanning the result list
    unique_ngrams = []
    # "+ 1" so the window that ends on the last token is included as well
    for start in range(len(text) - n_gram + 1):
        text_seq = ' '.join(text[start:start + n_gram])
        if text_seq not in seen:
            seen.add(text_seq)
            unique_ngrams.append(text_seq)
    return unique_ngrams

def bigram_condition_filter(text):
    """Return the first condition keyword that appears in a known condition context.

    A keyword counts when the word before it and the word after it, joined by a
    space, form one of the accepted context strings. A keyword at the start of
    the text is judged by its following word alone, one at the end by its
    preceding word alone.

    Parameters
    ----------
    text : list of str
        Tokens of one description, in order.

    Returns
    -------
    str or None
        The matching keyword, or None when no keyword sits in an accepted context.
    """
    # NOTE(review): 'like new' contains a space, so it can only match if the
    # tokeniser ever emits it as a single token — verify against the caller.
    condition_words = {'new', 'excellent', 'good', 'like new', 'fair'}

    accepted_contexts = {
        'in condition', 'very condition', 'brand never', 'like condition', 'in shape', 'surfboard condition',
        'a hybrid', 'brand surfboard', 'brand condition', 'a board', 'near condition', 'board condition',
        'brand board', 'in working', 'brand carbon', 'brand sup', 'really condition', 'like no', 'like used',
        'like only', 'almost condition', 'brand for', 'brand beautiful', 'like i', 'brand js', 'like', '- condition',
        'very quality', 'brand -', 'like in', 'brand only', 'brand quality', 'brand kazuma', 'sale condition',
        'condition', 'used condition', 'brand save', 'like and', 'brand check', 'brand performance', 'really shape',
        '(like condition)', 'brand it',
    }

    for i, word in enumerate(text):
        if word in condition_words:
            # Slice the neighbours separately: a plain text[i-1:...] would use -1 as
            # the start when i == 0, which Python reads as "last element".
            neighbours = list(text[max(i - 1, 0):i]) + list(text[i + 1:i + 2])
            if ' '.join(neighbours) in accepted_contexts:
                return word
    return None
                



In [35]:
# Exploratory check: find words that are NOT known condition keywords but appear
# between the same (previous word, next word) pairs as the known ones — i.e.
# candidates for additional condition terms.
# NOTE: reads df['description_word_list'], which is created by a cell that
# appears further down in this notebook.
bigram_list = ['in condition', 'very condition', 'brand never', 'like condition', 'in shape', 'surfboard condition', 
              'a hybrid', 'brand surfboard', 'brand condition', 'a board', 'near condition', 'board condition', 
               'brand board', 'in working', 'brand carbon', 'brand sup', 'really condition', 'like no', 'like used',
              'like only', 'almost condition', 'brand for', 'brand beautiful', 'like i', 'brand js', 'like', '- condition',
              'very quality', 'brand -', 'like in', 'brand only', 'brand quality', 'brand kazuma', 'sale condition', 
              'condition', 'used condition', 'brand save', 'like and', 'brand check', 'brand performance', 'really shape',
              '(like condition)', 'brand it']
condition_list = ['new', 'excellent', 'good', 'like new', 'fair']
not_condition_list = []
for text in df['description_word_list']:
    for i,word in enumerate(text):
        # text[i-1:i+2:2] picks the neighbours of `word`; at i == 0 the start index
        # is -1, which Python interprets as the last element of the list.
        if ' '.join(text[i-1:i+2:2]) in bigram_list and word not in condition_list:
            not_condition_list.append(word)
            
# Displayed as the cell output: candidate words ordered by frequency
Counter(not_condition_list).most_common()

similar_conditions = ['great', 'perfect', 'decent', 'pristine']

combine like new and excellent into one condition

look for other conditions that are similar to condition list

price outliers are possibly listings for multiple boards; look in the description for plural board words

create filters for description to weed out guaranteed noise (len of description)

scrape other sites



In [None]:
# Store, per listing, the word contexts found around each condition keyword
df['bigram_condition'] = df['description_word_list'].apply(extract_bigram_condition)

In [None]:
# Tally the (previous word, next word) context around every condition keyword
# across all descriptions; the frequency table is the cell's output.
condition_list = ['new', 'excellent', 'good', 'like new', 'fair']
all_bigrams = []

for text in df['description_word_list']:
    all_bigrams.extend(
        ' '.join(text[i-1:i+2:2])
        for i, word in enumerate(text)
        if word in condition_list
    )

dict(Counter(all_bigrams).most_common())

https://www.surfstationstore.com/collections/used-surfboards

https://www.usedsurfboardshawaii.com/?post_type=product 

combine all clean df into one and predict price together

To do:

get more data (~2500 rows)

clean data




In [19]:
# Tokenise each description: newline markers and punctuation become spaces, the text
# is stripped and lowercased, then split on spaces into a word list.
df['description_word_list'] = df['description'].apply(clean_description)

# TODO: extract the condition of the board from the description
# df['condition'] = df['description_word_list'].apply(lambda x: extract_condition_filter(x))

# Flag listings whose title contains a surfboard keyword, so unrelated products that
# mention surfboards only in the description can be filtered out.
df['surfboard_in_title'] = df['title'].apply(is_surfboard)

# TODO: extract board dimensions from the size_dimensions column into separate features
# df['length'] =  df['size_dimensions'].apply(lambda x: extract_board_dimension(x, length_pattern))
# df['width'] =  df['size_dimensions'].apply(lambda x: extract_board_dimension(x, width_pattern))
# df['thickness'] =  df['size_dimensions'].apply(lambda x: extract_board_dimension(x, thickness_pattern))




In [None]:
# investigate frequent description words for low/high cost boards

In [None]:
# Description words likely to have a strong negative correlation with price
one_hot_word_list = ['ding', 'dings', 'yellow', 'patched', 'patch', 'repaired', 'fixed', 'fix', 'carbon',
                     'fiber', 'pressure', 'dent', 'old', 'salvage', 'crack', 'cracks', 'epoxy', 'foamie', 
                     'foam', 'softtop', 'longboard', 'shortboard', 'sup']
# Name of the indicator column created for each word, in the same order
description_dummy_cols = ['DV_' + word for word in one_hot_word_list]

# Replace the index with 0..n-1 in place; the previous index is kept as a column,
# so running this cell more than once adds another such column each time.
df.reset_index(inplace=True)

# One 0/1 column per word: 1 when the word occurs in the listing's description word list
for dummy_col, word in zip(description_dummy_cols, one_hot_word_list):
    df[dummy_col] = df['description_word_list'].apply(
        lambda word_list, target=word: int(target in word_list))


In [None]:
def extract_board_dimension(text, pattern):
    """Extract one board dimension from free text and convert it to a number.

    The first match of ``pattern`` in ``text`` is taken and its digit groups are
    interpreted according to how many there are:

    * three groups  -> whole number plus fraction, e.g. ``19 3/4`` -> 19.75
    * two groups    -> decimal when the match contains ``.`` (``22.625`` -> 22.625),
                       plain fraction when it contains ``/`` (``3/4`` -> 0.75),
                       otherwise feet and inches (``5'11`` -> 5 + 11/12)
    * anything else -> the first group as is

    Parameters
    ----------
    text : str
        Text to search; non-string input (e.g. NaN/None) yields NaN.
    pattern : compiled regular expression
        Pattern without capturing groups, so that ``findall`` returns strings.

    Returns
    -------
    float
        The parsed dimension, or ``np.nan`` when nothing matches or a fraction
        has a zero denominator.
    """
    try:
        raw_dimension = pattern.findall(text)[0]
    except (IndexError, TypeError):
        # IndexError: no match; TypeError: text is not a string (missing value)
        return np.nan

    numbers = re.findall(r'\d+', raw_dimension)
    if not numbers:
        return np.nan
    whole = int(numbers[0])

    if len(numbers) == 3:
        # whole number followed by a fraction
        denominator = int(numbers[2])
        if denominator == 0:
            return np.nan
        return whole + int(numbers[1]) / denominator

    if len(numbers) == 2:
        if '.' in raw_dimension:
            # Rebuild the decimal from its text so that "625" means .625, not 62.5
            return float(numbers[0] + '.' + numbers[1])
        if '/' in raw_dimension:
            denominator = int(numbers[1])
            if denominator == 0:
                return np.nan
            return whole / denominator
        # Remaining separators are foot marks (' or ’): twelve inches to the foot
        return whole + int(numbers[1]) / 12

    return float(whole)
    

# Patterns for pulling length, width and thickness out of a dimensions string such
# as 5’11 x 18.75 x 2.37. Written as raw strings so that regex escapes like \d and \s
# reach the regex engine untouched instead of being flagged by Python as invalid
# string escape sequences.
#
# length: digits, a separator (foot mark, any non-ASCII character such as ’, or a dot),
# digits; not preceded by x/X, a digit, '+' or '"', and followed by '"', whitespace, '*' or x/X.
length_pattern = re.compile(r'(?<![Xx\d+\"])\d+[\u0080-\uFFFF\'\.]\d+(?=[\"\s*Xx])')
# width: the number that directly follows an x/X.
# NOTE(review): `/s*` inside the character class matches the literal characters
# '/', 's' and '*'; `\s` may have been intended — confirm before changing.
width_pattern = re.compile(r'(?<=[Xx])\s?\d+[\"/s*\.]?\s?\d?[\"\/]?\d+?(?=[\s?\"Xx?])')
# thickness: a number after x/X (or '?') that is not followed by a digit or x/X.
# NOTE(review): the look-behind also holds after the first x, so findall(...)[0]
# can be the width segment rather than the thickness — verify on real data.
thickness_pattern = re.compile(r'(?<=[Xx?])\s?\d+[\"/s*\.]?\s?\d+?[\"\/]?\d+?(?=[\",]?)(?![\dXx])')
                    
# Ad-hoc check: run the width extraction on one sample dimensions string;
# the returned value is shown as the cell's output.
text = """9'-0" x 22.625" x 2.625" """
extract_board_dimension(text, width_pattern)

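Extract the individual dimensions (length, width, thickness) as individual columns from the dimensions column.
If the result is NaN after that extraction, extract the dimension from the description instead.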

df = df.dropna(subset=['condition', 'dimensions'], how='any')

# Modeling

In [None]:
df.describe()

In [9]:
# Modelling subset: listings whose title mentions a surfboard keyword, minus every
# row that has a missing value in ANY column (how='any' with no subset).
test_data = (
    df.loc[df['surfboard_in_title'] == 1]
    .copy()
    .dropna(how='any')
)

In [10]:
len(test_data)

584

In [None]:
# One-hot encode the categorical listing attributes
one_hot_df = pd.get_dummies(test_data[['condition','manufacturer', 'model_name']])

# Append the description-word indicator columns by joining on the shared row index.
# Merging on the two index arrays instead would add a synthetic 'key_0' column and
# replace the index with 0..n-1, leaving data_x and data_y with different row labels.
data_x = one_hot_df.join(test_data[description_dummy_cols])
data_y = test_data['price']


In [None]:
# Remove the merge helper column 'key_0' if present; all other columns are kept
data_x = data_x.drop(columns=['key_0'], errors='ignore')

In [None]:
len(data_x)

In [None]:

# Baseline: ordinary least squares of price on all feature columns.
# add_constant appends the intercept column that statsmodels' OLS does not add by itself.
X = sm.add_constant(data_x)
# list(data_y) passes the prices without their pandas index — presumably so that rows
# are paired with X by position rather than by label; TODO confirm.
results = sm.OLS(list(data_y),X).fit()
# Regression summary table is the cell's output
results.summary()

In [None]:
def model_results(model_list, train_x, train_y, test_x, test_y):
    """Fit every model on the training split and score it on the held-out split.

    Parameters
    ----------
    model_list : dict
        Maps a display name to an unfitted estimator. Estimators named
        'elastic' or 'lasso' are tuned over a fixed alpha grid with 3-fold
        GridSearchCV; every other estimator is fitted as given.
    train_x, train_y : array-like
        Training features and targets.
    test_x, test_y : array-like
        Held-out features and targets used for all reported metrics.

    Returns
    -------
    pd.DataFrame
        One row per model, indexed by ``model_name``, with the columns
        ``coef``, ``r2``, ``root_MSE``, ``MAE``, ``MAPE`` and, for models that
        expose a selected alpha, ``best_alpha``. Coefficients of models fitted
        without the grid search are rounded to whole numbers. Empty when
        ``model_list`` is empty.

    Notes
    -----
    MAPE is averaged over the rows whose true target is non-zero, because the
    percentage error is undefined for a zero target; it is NaN when every
    target is zero.
    """
    # Built once: the grid is identical for every tuned model
    alpha_grid = {'alpha': [.01,.1,1,10,100,1000]}

    # Plain float arrays so metrics pair rows by position, whatever index test_y carries
    actual = np.asarray(test_y, dtype=float)
    nonzero_target = actual != 0

    model_rows = []
    for name, model in model_list.items():
        print(name)
        model_stats = {}

        if name in ['elastic', 'lasso']:
            model = GridSearchCV(model, alpha_grid, cv=3)
            model.fit(train_x, train_y)
            model_stats['best_alpha'] = model.best_params_['alpha']
            model_stats['coef'] = model.best_estimator_.coef_
        else:
            model.fit(train_x, train_y)
            model_stats['coef'] = np.around(model.coef_)
            # Only estimators with built-in alpha selection have alpha_ after fitting;
            # checking the attribute avoids relying on the model's display name.
            if hasattr(model, 'alpha_'):
                model_stats['best_alpha'] = model.alpha_

        pred_y = np.asarray(model.predict(test_x), dtype=float)
        abs_error = np.abs(actual - pred_y)

        model_stats['r2'] = model.score(test_x, test_y)
        model_stats['root_MSE'] = np.sqrt(np.mean((actual - pred_y) ** 2))
        model_stats['MAE'] = abs_error.mean()
        if nonzero_target.any():
            model_stats['MAPE'] = (abs_error[nonzero_target] / actual[nonzero_target]).mean() * 100
        else:
            model_stats['MAPE'] = np.nan
        model_stats['model_name'] = name

        model_rows.append(model_stats)

    if not model_rows:
        # Nothing to report; keep the index name callers expect
        return pd.DataFrame(index=pd.Index([], name='model_name'))

    model_df = pd.DataFrame(model_rows).set_index('model_name')

    return model_df

In [None]:
# Fixed seed so the 80/20 split — and every metric derived from it — is the same
# after a kernel restart.
RANDOM_STATE = 42

train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=.2, random_state=RANDOM_STATE)

# Candidate regressors; 'elastic' and 'lasso' get their alpha tuned inside model_results,
# RidgeCV selects its own alpha from the values given here.
elastic = ElasticNet()
ridge = RidgeCV(alphas=(.01,1,10,100,1000))
lasso = Lasso()
linear = LinearRegression()
model_list = {'elastic': elastic, 'ridge': ridge, 'lasso': lasso, 'linear': linear}

model_df = model_results(model_list, train_x, train_y, test_x, test_y)

In [None]:
model_df