<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Cleaning-Objectives:" data-toc-modified-id="Cleaning-Objectives:-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cleaning Objectives:</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Clean-Description-Column" data-toc-modified-id="Clean-Description-Column-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Clean Description Column</a></span></li><li><span><a href="#Create-Condition-Column" data-toc-modified-id="Create-Condition-Column-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Create Condition Column</a></span></li><li><span><a href="#Create-Separate-Board-Dimension-Columns" data-toc-modified-id="Create-Separate-Board-Dimension-Columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create Separate Board Dimension Columns</a></span></li><li><span><a href="#Export-Clean-Json" data-toc-modified-id="Export-Clean-Json-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export Clean Json</a></span></li></ul></div>

# Cleaning Objectives:
    - Remove HTML tags from description
    - Extract condition from description
    - Separate board dimensions into separate columns
    

### Imports

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from collections import Counter
%matplotlib inline

# Options for pandas
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150
pd.set_option('max_colwidth', 50000)


examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
with open('board_source_surfboards.json') as datafile:
    data = json.load(datafile)
df = pd.DataFrame(data)


In [3]:
len(df)

3023

# Clean Description Column

In [4]:
df['description'] = df['description_list']

In [5]:
def clean_text(text):
    stripped_text = re.sub(r'\\n|\n|[(),?.!@#$&*:\/]', ' ', text).strip().lower()
    clean_text = [word for word in stripped_text.split(' ') if word != '']
    return clean_text

# Create Condition Column

In [6]:
def extract_condition(description_word_list):
    condition_list = ['new', 'like new', 'excellent', 'great', 'good', 'fair', 'poor']
    condition_bigram = ['in to', 'in condition']
    for i,word in enumerate(description_word_list):
        if word in condition_list:
            return word
    

In [7]:
df['description_word_list'] = df['description'].apply(clean_text)
df['condition'] = df['description_word_list'].apply(extract_condition)

In [8]:
null_condition_df = df[df['condition'].isnull()]
null_condition_df['condition'] = null_condition_df['title'].apply(lambda x: extract_condition(clean_text(x)))

df.loc[df['condition'].isnull(), 'condition'] = null_condition_df['condition']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
df.dropna(subset=['condition'], inplace=True)

# Create Separate Board Dimension Columns

In [36]:
def dimension_to_float(board_dimensions, pattern, dimension_type):
    uni_dimension_list = re.findall(pattern, board_dimensions.strip())
    if uni_dimension_list == []:
        print(board_dimensions)
        return np.NaN
    
    if dimension_type == 'height':
        
        if len(uni_dimension_list) == 1:
            return float(uni_dimension_list[0])
        elif len(uni_dimension_list) == 2:
            return int(uni_dimension_list[0]) + (int(uni_dimension_list[1])/100)
    
    else:
        
        if len(uni_dimension_list) == 1:
            return float(uni_dimension_list[0])
        
        elif len(uni_dimension_list) == 2 and len(uni_dimension_list[1]) == 1:
            return int(uni_dimension_list[0]) + (int(uni_dimension_list[1])/10)
        
        elif len(uni_dimension_list) == 2 and len(uni_dimension_list[1]) >= 2:
            return int(uni_dimension_list[0]) + (int(uni_dimension_list[1][:2])/100)
        
        else:
            return int(uni_dimension_list[0]) + (int(uni_dimension_list[1])/int(uni_dimension_list[2]))
    
    
def single_condition(text):
    text_list = [condition.strip() for condition in text.lower().split(',')]
    condition_list = ['new', 'like new', 'excellent', 'very good', 'good', 'fair', 'poor']
    conditon_map = {'new':'new',
                    'like new': 'like_new',
                    'excellent': 'like_new',
                    'very good': 'great',
                    'good': 'good',
                    'fair': 'fair',
                    'poor': 'poor'}
    
    for condition in condition_list:
        if condition in text_list:
            return conditon_map[condition]
                                    

height_pattern = re.compile(r"""
(?<=^)\d{1,2}(?=\'?\'\'?\s[Xx])           # (?<= Start of Str) Feet (?= X or x)
|(?<=^)\d{1,2}(?=\'?\d{1,2}[\"\'\'?]\s[Xx]) # (?<= Start of Str) Feet (?= Inches, X or x)
|(?<=\d[\d?]\')\d{1,2}(?=\"?\'\'?\s[Xx])       # (?<= Feet) Inches (?= ' or '' or , X or X)
""", re.X)

width_pattern = re.compile(r"""
(?<=[Xx]\s)\d{2}(?=\"?\s[Xx])             # Inches (?= X or x)
|(?<=[Xx]\s)\d{2}\.\d+(?=\"?\s[Xx])       # Float Inches (?= X or x)
|(?<=[Xx]\s)\d{2}(?=\s\d+\/\d+\"?\s[Xx])  # Inches (?= Fraction, X or x)
|(?<=\d{2}\s)\d+(?=\/\d+\"?\s[Xx])        # Numerator Inches (?= /Denominator, X or x)
|(?<=\d\/)\d+(?=\"?\s[Xx])                # Denominator Inches (?= X or x)
""", re.X)


thickness_pattern = re.compile(r"""
(?<=[Xx]\s)\d(?=\s\d+\/\d+$)              # (?<= X or x) Inches (?= Fraction, End of Str)
|(?<=[Xx]\s)\d(?=\s\d+\/\d+\s\()          # (?<= X or x) Inches (?= Fraction, Board Liters)
|(?<=[Xx]\s)\d(?=\s\(|$)                  # (?<= X or x) Inches (?= End of Str)
|(?<=[Xx]\s)\d\.\d+(?=\s\(|$)             # (?<= X or x) Float Inches (?= Board Liters OR End of Str)
|(?<=\d\s)\d+(?=\/\d+$)                   # (?<= Inches) Numerator Inches (?= /Denominator, End of Str)
|(?<=\d\s)\d+(?=\/\d+\s\()                # (?<= Inches) Numerator Inches (?= /Denominator, Board Liters)
|(?<=\d\/)\d+(?=\s\()                     # (?<= Numerator/) Denominator Inches (?= Board Liters)
|(?<=\d\/)\d+(?=$)                        # (?<= Numerator/) Denominator Inches (End of Str)
""", re.X)

##FIX!
x = """6'6" x 20.75 x 3 (40.8L)"""
dimension_to_float(x, height_pattern, dimension_type='height')

6.0

In [11]:
df['height'] = df['board_dimensions'].apply(lambda x: dimension_to_float(x, height_pattern, dimension_type='height'))
df['width'] = df['board_dimensions'].apply(lambda x: dimension_to_float(x, width_pattern, dimension_type='width'))
df['thickness'] = df['board_dimensions'].apply(lambda x: dimension_to_float(x, thickness_pattern, dimension_type='thickness'))

6'6" x 20 3/4 x 2 3/4
6'2" x 21 1/2 x 3 1/8
7'6" x 22 x 2 7/8 (55L)
8'8" x 23 x 3 1/8
6'10" x 20 3/4 x 2 7/8
7'2" x 21 x 3
5'10" x 20.75 x 2.75
8'6" x 20.5 x 3.3
8'2" x 22.3 x 3.2 (60L)
8'9" x 22 3/8 x 2 3/4
5'10" x 20 3/8 x 2 5/8 (33L)
7'11" x 21.88 x 3 (65.65L)
8'10" x 23 x 3 1/8
9'4" x 23 x 3
9′8" x 23 1/2 x 3 1/8
6'1" x 19 7/8 x 2 7/8 (32.8L)
5'11" x 20 3/4 x 2 3/4 (36L)
9'2 x 22 3/4 x 3
5'10" x 20 x 2 1/2
5'10" x 22 x 2 11/16
6'5" x 21.87 x 2.9 (44.7L)
6'6" x 19 3/4 x 2 1/2
8'8" x 22 3/4 x 2 7/8
5'10" x 20.4 x 2.5 (33.7L)
9'9" x 23 1/8 x 3 1/8
10'6" x 23 1/2 x 3 5/8
7'6" x 21 1/2 x 3
9'8" x 23 5/8 x 3
9'6" x 23 x 3 1/8
9'1" x 23 x 2 7/8
5'7" x 20.25 x 2.42
5'6" x 19 3/8 x 2 1/2 (29.7L)
9'3" x 22 1/2 x 2 7/8
8'6" x 22 1/4 x 2 3/4 (59.52L)
5'7" x 20 1/2 x 2 5/8
8'7" x 22 1/2 x 2 3/4 (61.5L)
6'8" x 22 x 3 (48.8L)
9'9" x 23 x 3 
9'4" x 23.75 x 3.15
5'4" x 20 7/8 x 2 1/2 (31.5L)
6'6" x 20.50 x 2.65 (39.0L)
9'3" x 23 1/8 x 3 1/4
6'2" x 20 5/8 x 2 3/4 (37.5L)
9'1" x 23 x 3
7'5" x 21 x 3


In [12]:
df.isnull().sum()/df.isnull().count()*100

manufacturer              0.000000
model_name                0.000000
board_dimensions          0.000000
price                     0.000000
url                       0.000000
title                     0.000000
description_list          0.000000
description               0.000000
description_word_list     0.000000
condition                 0.000000
height                   35.016722
width                     0.234114
thickness                 0.033445
dtype: float64

In [13]:
# df = df.dropna(subset=['height', 'width', 'thickness']).reset_index()

In [14]:
# grammatical user mistakes
# df.drop(df.index[1413], inplace=True)

# Export Clean Json

In [15]:
# df.to_csv('data/clean_data/')