# Introduction

### Imports

In [47]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
%matplotlib inline

# Display options for pandas: show up to 150 columns/rows and do not truncate
# long text cells (e.g. listing descriptions) when rendering a DataFrame.
# Full option names are used throughout; abbreviated names passed to
# pd.set_option are matched by pattern and can become ambiguous.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150
pd.options.display.max_colwidth = 50000


<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Single-Fix-Cases" data-toc-modified-id="Single-Fix-Cases-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Single Fix Cases</a></span></li><li><span><a href="#Variable-Cleaning" data-toc-modified-id="Variable-Cleaning-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Variable Cleaning</a></span></li><li><span><a href="#Export-Clean-Json" data-toc-modified-id="Export-Clean-Json-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Export Clean Json</a></span></li></ul></div>

In [48]:
cd /Users/marshallmamiya/courses/thinkful_ds/capstone_final/data/raw_data/

/Users/marshallmamiya/courses/thinkful_ds/capstone_final/data/raw_data


In [49]:
# Load the scraped listings (path is relative to the current working directory).
# The encoding is pinned to UTF-8 because the dimension strings contain
# non-ASCII characters (e.g. the double-prime inch mark), which would be
# mis-decoded under a non-UTF-8 platform default.
with open('used_surfboard_hawaii.json', encoding='utf-8') as datafile:
    data = json.load(datafile)
df = pd.DataFrame(data)


In [50]:
# Discard every row that has at least one missing field. The index is not
# reset, so the label-based single-row fixes further down keep addressing
# the original row labels.
df = df.dropna(axis=0, how='any')

In [51]:
# Number of rows remaining after the incomplete ones were dropped.
df.shape[0]

734

# Single Fix Cases

In [52]:
# Manual one-off corrections of individual dimension strings.
# Keys are DataFrame index labels, values are the replacement text.
# NOTE(review): presumably these rows had typos that the regex patterns
# could not parse - confirm against the raw data.
width_fixes = {
    675: 'W 21″',
    343: 'W 20 1/3″',
    647: 'W 19.75"',
}

thickness_fixes = {
    596: 'T 2.43',
    614: 'T 2.32',
    626: 'T 2 5/16',
    719: 'T 2 1/4',
    262: 'T 2 1/2',
    647: 'T 2 3/8',
}

for column, fixes in (('width', width_fixes), ('thickness', thickness_fixes)):
    for label, value in fixes.items():
        df.at[label, column] = value


# Variable Cleaning

In [53]:
def dimension_to_float(board_dimensions, pattern, dimension_type):
    """Convert a raw board-dimension string into a number.

    The digit groups that ``pattern`` finds in ``board_dimensions`` are
    combined according to ``dimension_type``:

    * ``'height'``
        - one group: returned as a float;
        - two groups: ``first + second / 100`` (so groups ``6`` and ``2``
          give ``6.02``). This is a compact encoding of the two groups,
          not a conversion to decimal units.
    * any other value (used for width and thickness)
        - one group: returned as a float;
        - two groups: whole part plus decimal part, where a one-digit
          second group counts as tenths and a longer one is cut to its
          first two digits and counted as hundredths;
        - three or more groups: mixed fraction ``first + second / third``.

    Parameters
    ----------
    board_dimensions : str
        Raw dimension text, e.g. ``'W 21 1/2'``.
    pattern : str
        Regular expression handed to ``re.findall`` to pull out digit groups.
    dimension_type : str
        ``'height'`` selects the height rules; anything else selects the
        width/thickness rules.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when the input is not a string, no
        digit group is found, the groups do not fit any rule above, or the
        fraction denominator is zero.
    """
    # Missing / non-text cells cannot be parsed; treat them as "no value".
    if not isinstance(board_dimensions, str):
        return np.nan

    parts = re.findall(pattern, board_dimensions.strip())
    if not parts:
        return np.nan

    if len(parts) == 1:
        return float(parts[0])

    whole = int(parts[0])

    if dimension_type == 'height':
        if len(parts) == 2:
            return whole + int(parts[1]) / 100
        # More than two groups is not a recognised height layout; return NaN
        # explicitly instead of falling off the end and returning None.
        return np.nan

    if len(parts) == 2:
        decimals = parts[1]
        if len(decimals) == 1:
            return whole + int(decimals) / 10
        # Only the first two decimal digits are kept.
        return whole + int(decimals[:2]) / 100

    # Three or more groups: whole number followed by numerator / denominator.
    denominator = int(parts[2])
    if denominator == 0:
        return np.nan
    return whole + int(parts[1]) / denominator
                                    

In [54]:
def single_condition(text):
    """Reduce a comma-separated condition string to one canonical label.

    The text is lower-cased and split on commas; each piece is stripped of
    surrounding whitespace. The known conditions are then checked from best
    to worst and the label of the first one present is returned.

    Parameters
    ----------
    text : str
        Raw condition text, e.g. ``'Good, Fair'``.

    Returns
    -------
    str or None
        One of ``'new'``, ``'like_new'``, ``'great'``, ``'good'``,
        ``'fair'``, ``'poor'``; ``None`` when no known condition is listed.
    """
    # Insertion order is the search priority (best condition first).
    label_by_condition = {
        'new': 'new',
        'like new': 'like_new',
        'excellent': 'like_new',
        'very good': 'great',
        'good': 'good',
        'fair': 'fair',
        'poor': 'poor',
    }

    listed = [piece.strip() for piece in text.lower().split(',')]

    return next(
        (label for name, label in label_by_condition.items() if name in listed),
        None,
    )
                                    

def extract_model_name(text):
    """Extract a model name from a listing title.

    Two strategies are tried in order:

    1. the text inside the first pair of parentheses;
    2. failing that, a tail-shape word (``Round Pin``, ``Squash``, ...)
       that is directly followed by ``' Tail'`` and then ``' Ref#'``.

    Parameters
    ----------
    text : str
        Listing title.

    Returns
    -------
    str or float
        The first match, or ``np.nan`` when neither strategy matches.
    """
    matches = re.findall(r'(?<=\().*?(?=\))', text)
    if not matches:
        # Raw string: '\s' in a normal string literal is an invalid escape.
        matches = re.findall(
            r'(Round\sPin|Squash|Round|Pin|Diamond|Swallow|Chop|Bat|Fish|Square) Tail(?=\sRef#)',
            text,
        )
    # np.nan (lower case) - the np.NaN alias no longer exists in NumPy 2.0.
    return matches[0] if matches else np.nan

def clean_text(text, word_list=False):
    """Normalise free text, optionally splitting it into words.

    The input is converted with ``str()``, then literal ``\\n`` sequences,
    newlines and the punctuation characters ``( ) , ? . ! @ # $ & * : /``
    are each replaced by a single space. The result is stripped and
    lower-cased. Runs of spaces are not collapsed in the string form.

    Because the input is stringified first, a missing value such as NaN
    comes back as the text ``'nan'``.

    Parameters
    ----------
    text : object
        Value to clean; anything accepted by ``str()``.
    word_list : bool, default False
        When equal to ``True``, return the non-empty space-separated words
        as a list instead of a single string.

    Returns
    -------
    str or list of str
    """
    normalised = re.sub(r'\\n|\n|[(),?.!@#$&*:\/]', ' ', str(text)).strip().lower()

    # Equality test (not truthiness) is kept on purpose: only values equal to
    # True switch to list output.
    if word_list == True:
        return [token for token in normalised.split(' ') if token != '']

    return normalised

In [55]:
# Regular expressions used to pull digit groups out of the dimension strings.
# Raw strings are used so that \d, \s and \/ reach the regex engine untouched
# (in a normal string literal they are invalid escape sequences).

# Height: any run of digits directly followed by one of the letters f, t, i, n
# or by a non-ASCII character (U+0080 - U+FFFF).
height_pattern = r'\d+(?=[ftin\u0080-\uFFFF])'

# Width: digits preceded by a capital letter and one or two whitespace
# characters, digits directly before '/', or digits directly after '.' or '/'.
width_pattern = r'(?<=[A-Z]\s)\d+|(?<=[A-Z]\s\s)\d+|\d+(?=\/)|(?<=[.\/])\d+'

# Thickness strings follow the same layout as width strings.
thickness_pattern = width_pattern


In [56]:
# Model name taken from the listing title.
df['model_name'] = df['title'].apply(extract_model_name)

# Numeric versions of the three dimension columns; extra keyword arguments
# given to Series.apply are forwarded to dimension_to_float.
for column, pattern in (
    ('height', height_pattern),
    ('width', width_pattern),
    ('thickness', thickness_pattern),
):
    df[column + '_float'] = df[column].apply(
        dimension_to_float, pattern=pattern, dimension_type=column
    )

# Single canonical condition label per listing.
df['uni_condition'] = df['condition'].apply(single_condition)

# Cleaned text columns. clean_text stringifies its input, so a missing
# model name (NaN) ends up as the text 'nan' in clean_model_name.
df['description_word_list'] = df['description'].apply(clean_text, word_list=True)
df['clean_manufacturer'] = df['manufacturer'].apply(clean_text)
df['clean_model_name'] = df['model_name'].apply(clean_text)

In [57]:
# Keep only the cleaned / derived columns, in export order.
clean_df = df.loc[:, [
    'clean_manufacturer',
    'clean_model_name',
    'price',
    'description_word_list',
    'uni_condition',
    'height_float',
    'width_float',
    'thickness_float',
]]

In [58]:
# Drop the working prefixes/suffixes so the exported columns carry plain names.
clean_df = clean_df.rename(
    columns=dict(
        clean_manufacturer='manufacturer',
        clean_model_name='model_name',
        uni_condition='condition',
        height_float='height',
        width_float='width',
        thickness_float='thickness',
    )
)

In [59]:
# Preview the first five cleaned rows.
clean_df.head(5)

Unnamed: 0,manufacturer,model_name,price,description_word_list,condition,height,width,thickness
0,tokoro,4vc,585.0,"[brand, new, shortboard, with, no, signs, of, previous, use]",new,6.02,18.875,2.375
1,donald takayama guy takayama,scorpion,749.0,"[excellent, condition, hybrid, fun, board, with, minimal, signs, of, previous, use]",like_new,7.04,22.0,2.5
2,peralta,round pin,565.0,"[excellent, condition, longboard, with, minimal, signs, of, previous, use]",like_new,9.0,21.75,2.75
3,donald takayama takayama,squash,1250.0,"[excellent, condition, nose, rider, longboard, with, minimal, signs, of, previous, use]",like_new,9.0,21.5,2.5
4,arakawa eric arakawa,mr 200,250.0,"[good, condition, step, up, with, various, pressure, dents, but, otherwise, minor, signs, of, previous, use]",good,6.08,18.5,2.31


# Export Clean Json

In [60]:
cd ~/courses/thinkful_ds/capstone_final/data/clean_data/

/Users/marshallmamiya/courses/thinkful_ds/capstone_final/data/clean_data


In [61]:
# Write the cleaned table to the current working directory, using the
# column-oriented layout (the DataFrame default) spelled out explicitly.
clean_df.to_json('clean_usb_data.json', orient='columns')