# Introduction

### Imports

In [153]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from collections import Counter

%matplotlib inline

# Options for pandas
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150
pd.options.display.max_colwidth = 200

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Data-Cleaning:-Part-1-(Filtering)" data-toc-modified-id="Data-Cleaning:-Part-1-(Filtering)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Cleaning: Part 1 (Filtering)</a></span></li><li><span><a href="#Data-Cleaning:-Part-2-(Null-Values)" data-toc-modified-id="Data-Cleaning:-Part-2-(Null-Values)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data Cleaning: Part 2 (Null Values)</a></span></li></ul></div>

bay area
https://sfbay.craigslist.org/search/sga?query=surfboard&sort=rel

In [154]:
with open('data/honolulu_craiglist_surfboards.json') as datafile:
    data = json.load(datafile)
hi_df = pd.DataFrame(data)

with open('data/raw_data/los_angeles_craiglist_surfboards.json') as datafile:
    data = json.load(datafile)
la_df = pd.DataFrame(data)

with open('data/raw_data/san_diego_craiglist_surfboards.json') as datafile:
    data = json.load(datafile)
sd_df = pd.DataFrame(data)

df = hi_df.append([la_df,sd_df])

In [155]:
df.head()

Unnamed: 0,condition,description,location,manufacturer,model_name,price,size_dimensions,title,url
0,,"\n \nVery good condition wade tokoro shortboard surfboard, fcsii fin boxes, no dings, very little use\n\n5’8 x 19 x 2.25\n\nThank you\n\n\n\nSurfboards, surf board, boards, board, fins, fcs...",Honolulu,,,250.0,,Tokoro surfboard,https://honolulu.craigslist.org/oah/spo/d/honolulu-tokoro-surfboard/7005760652.html
1,excellent,"\n \n6'2""x20.5x2 prototype board from Eric Arakawa. made for an industry pro. Biaxial fiberglass. FCS boxes, thruster. Waist of the board is farther towards the tail making a fun board that...",Honolulu,Eric Arakawa,Prototype,180.0,,Arakawa Prototype Surfboard,https://honolulu.craigslist.org/oah/spo/d/wheeler-army-airfield-arakawa-prototype/7011893010.html
2,good,"\n \nAndreini surfboard. 6'6"". Large volume. Board had delam on top, it was repaired and sealed, but not filled/leveled with q-cell. Futures fin boxes in a tri/thruster. Cheap plastic fins ...",Honolulu,Andreini,,80.0,"6'6""x20""x2""",Andreini Surfboard,https://honolulu.craigslist.org/oah/spo/d/wheeler-army-airfield-andreini-surfboard/7011895192.html
3,good,\n \n5’11 x 18.75 x 2.37 - 27.46L\n\nKrank model by J. Kashiwai. Squash tail w/ future box thruster setup.\n\nBoard is in decent to good condition. No open dings at all and still has a lot ...,Honolulu,Jason Kashiwai,Krank,250.0,5’11 x 18.75 x 2.37 = 27.46L,5’11 J. Kashiwai “Krank” Surfboard w/ Fins,https://honolulu.craigslist.org/oah/spo/d/honolulu-511-kashiwai-krank-surfboard/7011909978.html
4,,\n \n6' Egg Shortboard Surfboard with fins\n\nThe listed dimensions are 6’ x 19 3/4 x 2 3/4\n\n,Honolulu,,,220.0,6',6' Egg Shortboard Surfboard,https://honolulu.craigslist.org/oah/spo/d/honolulu-6-egg-shortboard-surfboard/7011911532.html


In [156]:
len(df)

2781

# Data Cleaning: Part 1 (Filtering)
Filter out:
    - Duplicates ads
    - Unrelated ads
    - Single ad multiple products (e.g. multiple boards and/or includes fins, leashes, board bags etc.)
   

In [157]:
#drop duplicate ads
df = df.drop_duplicates(subset=['description']).copy()

In [158]:
def clean_text(text):
    stripped_text = re.sub(r'\\n|\n|[(),?.!@#$&*:\/]', ' ', text).strip().lower()
    clean_text = [word for word in stripped_text.split(' ') if word != '']
    return clean_text

def ngram(text, n_gram):
    n_gram_list = []
    for i in range(len(text)-n_gram):
        text_seq = ' '.join(text[i:i+n_gram])
        if text_seq not in n_gram_list:
            n_gram_list.append(text_seq)
    return n_gram_list

def one_board_filter(text, title=False):
    multiple_products_words = ['boards', 'surfboards']
    multiple_boards = 0
    surfboard_in_title = 0
    
    if title == True:
        surfboard_word_list = ['surfboard', 'surf', 'board', 'sup', 'fish', 'stub', 'shortboard', 'short', 
                                   'longboard', 'long', 'foamie', 'foam', 'softtop', 'paddleboard']
        for word in text:
            if word in multiple_products_words:
                multiple_boards = 1
            
            elif word in surfboard_word_list:
                surfboard_in_title = 1
        
        if multiple_boards == 1 or surfboard_in_title == 0:
            return 0
        else:
            return 1
    
    else:
        multiple_products = 0
        for word in text:
            if word in multiple_products_words:
                multiple_products = 1
        if multiple_products == 1:
            return 0
        else:
            return 1
        
def bigram_multiple_products_filter(bigram_list):
    bigram_count = 0
    bigram_check_list = ['does include', 'does includes', 'will include', 'are included', 'are including', 'is included', 
                    'is including', 'is with','come with', 'comes with']
    
    for bigram in bigram_list:
        if bigram in bigram_check_list:
            bigram_count += 1
    if bigram_count >= 1:
        return 0
    else:
        return 1

In [159]:
#create clean description and title word list columns to be used for filtering rows
df['description_word_list'] = df['description'].apply(lambda x: clean_text(x))
df['title_word_list'] = df['title'].apply(lambda x: clean_text(x))

# filter out rows that include multiple boards/products
df['one_board_description'] = df['description_word_list'].apply(lambda x: one_board_filter(x))
df['one_board_title'] = df['title_word_list'].apply(lambda x: one_board_filter(x, title=True))

df['bigram_description_list'] = df['description_word_list'].apply(lambda x: ngram(x, 2))
df['bigram_one_board_description'] = df['bigram_description_list'].apply(lambda x: bigram_multiple_products_filter(x))




In [160]:
df = df.loc[(df['one_board_description'] == 1) & (df['one_board_title'] == 1) 
            & (df['bigram_one_board_description'] == 1)].copy()

df.reset_index(inplace=True)

# Data Cleaning: Part 2 (Null Values)

In [161]:
df.isnull().sum()/df.isnull().count()*100

index                            0.000000
condition                       51.280120
description                      0.000000
location                         0.000000
manufacturer                    52.635542
model_name                      67.771084
price                            0.000000
size_dimensions                 61.897590
title                            0.000000
url                              0.000000
description_word_list            0.000000
title_word_list                  0.000000
one_board_description            0.000000
one_board_title                  0.000000
bigram_description_list          0.000000
bigram_one_board_description     0.000000
dtype: float64

In [162]:
def ngram_condition_filter(text, ngram=2):
    condition_list = ['new', 'excellent', 'good', 'like new', 'fair', 'great', 'perfect', 'decent', 'pristine', 'mint',
                  'poor']
    bigram_list = ['in condition', 'very condition', 'like condition', 'in shape', 'surfboard condition', 'a hybrid', 
               'a board', 'near condition', 'board condition', 'in working', 'brand carbon', 'really condition', 
               'almost condition', '- condition', 'very quality', 'sale condition', 'condition', 'used condition', 
               'really shape', 'nearly condition', 'absolute condition', 'out condition']
    
    unigram_list = ['brand', 'like']

    for i,word in enumerate(text):
        if word in condition_list:
            if ' '.join(text[i-(ngram-1):i+ngram:ngram]) in bigram_list or text[i-1] in unigram_list:
                return word


def remove_secondary_words(text_list, secondary_list):
    primary_words = [word for word in text_list if word not in secondary_list]
    return primary_words

def extract_model_manufacturer(text_list, match_list):
    extracted_text = [word for word in text_list if word in match_list]
    return extracted_text

def motorola_filter(text_list):
    for word in text_list:
        if word == 'motorola':
            return 1
        else:
            continue
    return 0
    

In [163]:
df['condition_from_description'] = df['description_word_list'].apply(lambda x: ngram_condition_filter(x))

#replace nulls in condition with condition extracted from description
df.loc[df['condition'].isnull(),'condition'] = df['condition_from_description']

#create separate columns for each board measurement
# df['length'] =  df['size_dimensions'].apply(lambda x: extract_board_dimension(x, length_pattern))
# df['width'] =  df['size_dimensions'].apply(lambda x: extract_board_dimension(x, width_pattern))
# df['thickness'] =  df['size_dimensions'].apply(lambda x: extract_board_dimension(x, thickness_pattern))

In [164]:
# # words likely to have a strong negative correlatation with price
# one_hot_word_list = ['ding', 'dings', 'yellow', 'patched', 'patch', 'repaired', 'fixed', 'fix', 'carbon',
#                      'fiber', 'pressure', 'dent', 'old', 'salvage', 'crack', 'cracks', 'epoxy', 'foamie', 
#                      'foam', 'softtop', 'longboard', 'shortboard', 'sup']
# description_dummy_cols = []

# df.reset_index(inplace=True)
# for word in one_hot_word_list:
#     #init one-hot columns
#     df['DV_'+word] = 0
#     description_dummy_cols.append('DV_'+word)
    
# for i,word_list in enumerate(df['description_word_list']):
#     for word in one_hot_word_list:
#         if word in word_list:
#             df.at[i, 'DV_'+word] = 1
#         else: 
#             df.at[i, 'DV_'+word] = 0


In [192]:
secondary_list = ['surfboard', 'surfboards', 'surf', 'board', 'boards', 'longboard', 'longboards', 'shortboard', 
                  'shortboards' 'softop', 'design', 'designs', 'shape', 'shaper', 'shapes', 'model', 'serie', 'series',
                  'version', 'type', 'vintage', 'by', 'the', 'and', 'or', 'shop', 'on', 'up', 'company']

df['manufacturer_word_list'] = df['manufacturer'].astype(str).apply(lambda x: remove_secondary_words(clean_text(x),
                                                                  secondary_list))
df['model_name_word_list'] = df['model_name'].astype(str).apply(lambda x: remove_secondary_words(clean_text(x),
                                                                  secondary_list))

df['is_motorola'] = df['manufacturer_word_list'].apply(lambda x: motorola_filter(x))

df = df[df['is_motorola'] == 0].copy()

manufacturer_list = np.unique([word for word_list in df['manufacturer_word_list'].tolist() for word in word_list])
model_name_list = np.unique([word for word_list in df['model_name_word_list'].tolist() for word in word_list])



In [168]:
df['manufacturer_from_description'] = df['description_word_list'].apply(lambda x: extract_model_manufacturer(x, manufacturer_list))

df['model_name_from_description'] = df['description_word_list'].apply(lambda x: extract_model_manufacturer(x, model_name_list))


In [64]:
def extract_board_dimension(text, pattern):
    try:
        raw_dimension = pattern.findall(text)[0]
    except (IndexError, TypeError):
        return np.NaN
    int_dimension = list(map(int, re.findall('\d+', raw_dimension)))
    print(int_dimension)
    
    #str dimension includes fraction
    if len(int_dimension) == 3:
        return int_dimension[0] + (int_dimension[1]/int_dimension[2])
    elif len(int_dimension) == 2:
        return int_dimension[0] + (int_dimension[1] / 10)
    else:
        return int_dimension[0]
    

length_pattern = re.compile('(?<![Xx\d+\"])\d+[\u0080-\uFFFF\'\.]\d+(?=[\"\s*Xx])')
width_pattern = re.compile('(?<=[Xx])\s?\d+[\"/s*\.]?\s?\d?[\"\/]?\d+?(?=[\s?\"Xx?])')
thickness_pattern = re.compile('(?<=[Xx?])\s?\d+[\"/s*\.]?\s?\d+?[\"\/]?\d+?(?=[\",]?)(?![\dXx])')
                    
text = """9'-0" x 22.625" x 2.625" """
extract_board_dimension(text, width_pattern)

[22, 625]


84.5

In [65]:
bigram_list = ['in condition', 'very condition', 'brand never', 'like condition', 'in shape', 'surfboard condition', 
              'a hybrid', 'brand surfboard', 'brand condition', 'a board', 'near condition', 'board condition', 
               'brand board', 'in working', 'brand carbon', 'brand sup', 'really condition', 'like no', 'like used',
              'like only', 'almost condition', 'brand for', 'brand beautiful', 'like i', 'brand js', 'like', '- condition',
              'very quality', 'brand -', 'like in', 'brand only', 'brand quality', 'brand kazuma', 'sale condition', 
              'condition', 'used condition', 'brand save', 'like and', 'brand check', 'brand performance', 'really shape',
              '(like condition)', 'brand it']
condition_list = ['new', 'excellent', 'good', 'like new', 'fair']
not_condition_list = []
for text in df['description_word_list']:
    for i,word in enumerate(text):
        if ' '.join(text[i-1:i+2:2]) in bigram_list and word not in condition_list:
            not_condition_list.append(word)
            
Counter(not_condition_list).most_common()



[('great', 49),
 ('perfect', 11),
 ('decent', 5),
 ('fun', 4),
 ('sick', 2),
 ('solid', 2),
 ('mint', 2),
 ('short', 2),
 ('used', 2),
 ('begginers', 2),
 ('large', 2),
 ('one', 2),
 ('real', 1),
 ('best', 1),
 ('rough', 1),
 ('step-up', 1),
 ('obo', 1),
 ('fast', 1),
 ('awesome', 1),
 ('larger', 1),
 ('new)', 1),
 ('shorter', 1),
 ('vintage', 1),
 ('repaired', 1),
 ('as-is', 1),
 ('padded', 1),
 ('seaworthy', 1),
 ("7'", 1),
 ('surfable', 1),
 ('foil', 1),
 ('v-cluster', 1),
 ('poor', 1),
 ('and', 1),
 ('excelelnt', 1),
 ('small', 1),
 ('unique', 1),
 ('nice', 1),
 ('surfboard', 1),
 ('midsize', 1),
 ('excelent', 1),
 ('loose', 1),
 ('infinity', 1),
 ('every', 1),
 ('cool', 1),
 ('them)', 1),
 ('hp', 1),
 ('conventional', 1),
 ('grom', 1),
 ('usable', 1),
 ('surf', 1)]

combine like new and excellent into one condition

look for other conditions that are similar to condition list

outliers for price possibly multiple boards or look in description for plural board words

create filters for description to weed out guranteed noise (len of description)

scrape other sites



In [83]:
df['condition'].value_counts()

good         261
new          218
like new     186
fair          58
great         24
excellent      2
perfect        2
mint           1
Name: condition, dtype: int64

In [89]:
condition_map_dict = {'new': 'new',
                      'perfect': 'new',
                      'pristine': 'like new',
                      'excellent': 'like new',
                      'mint': 'like new',
                      'great': 'great',
                      'good': 'good',
                      'fair': 'fair',
                      'decent': 'fair',
                      'poor': 'poor'}
df['condition'] = df['condition'].map(condition_map_dict)

In [85]:
df['condition'].value_counts()

good        261
new         220
fair         58
great        24
like new      3
Name: condition, dtype: int64

In [104]:
# condition_list = ['new', 'excellent', 'good', 'like new', 'fair', 'great', 'perfect', 'decent', 'pristine', 'mint',
#                   'poor']
# all_bigrams = []

# for i,text in enumerate(df['description_word_list']):
#     for i,word in enumerate(text):
#         if word in condition_list:
#             all_bigrams.append(' '.join(text[i-1:i+2:2]))

# dict(Counter(all_bigrams).most_common())

https://www.surfstationstore.com/collections/used-surfboards

https://www.usedsurfboardshawaii.com/?post_type=product 

combine all clean df into one and predict price together

To do:

get more data (~2500 rows)

clean data




extract indiv dimensions as indiv col from dimen col
if nan after extract from description

df = df.dropna(subset=['condition', 'dimensions'], how='any')

In [None]:
test_data = df[df['surfboard_in_title']==1].copy()
test_data = test_data.dropna(how='any')

len(test_data)

In [None]:
one_hot_df = pd.get_dummies(test_data[['condition','manufacturer', 'model_name']])
data_x = pd.merge(one_hot_df, test_data[description_dummy_cols], how='inner', left_on=one_hot_df.index, right_on=test_data.index)
data_y = test_data['price']

data_x = data_x.loc[:, data_x.columns != 'key_0']

len(data_x)