<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Cleaning-Objectives:" data-toc-modified-id="Cleaning-Objectives:-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cleaning Objectives:</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Imports</a></span></li></ul></li></ul></li><li><span><a href="#Clean-Description-Column" data-toc-modified-id="Clean-Description-Column-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Clean Description Column</a></span></li><li><span><a href="#Create-Condition-Column" data-toc-modified-id="Create-Condition-Column-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Create Condition Column</a></span></li><li><span><a href="#Create-Separate-Board-Dimension-Columns" data-toc-modified-id="Create-Separate-Board-Dimension-Columns-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create Separate Board Dimension Columns</a></span></li><li><span><a href="#Export-Clean-Json" data-toc-modified-id="Export-Clean-Json-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Export Clean Json</a></span></li></ul></div>

# Cleaning Objectives:
- Remove HTML tags from the description
- Extract the board's condition from the description
- Split the board dimensions into separate height, width, and thickness columns
    

### Imports

In [88]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
from collections import Counter
%matplotlib inline

# Raise pandas' display limits (column count, row count, characters per cell)
for option_name, option_value in (
    ('display.max_columns', 150),
    ('display.max_rows', 150),
    ('display.max_colwidth', 50000),
):
    pd.set_option(option_name, option_value)


In [89]:
cd ~/courses/thinkful_ds/capstone_final/data/

/Users/marshallmamiya/courses/thinkful_ds/capstone_final/data


In [90]:
# Load the raw listings into a DataFrame. The file is opened explicitly as
# UTF-8 so non-ASCII characters in the text (e.g. typographic apostrophes)
# decode the same way regardless of the platform's default encoding.
with open('raw_data/board_source_surfboards.json', encoding='utf-8') as datafile:
    data = json.load(datafile)
df = pd.DataFrame(data)

# Keep the starting row count for reference.
original_count = len(df)
print('Original Count: ', original_count)

Original Count:  3023


# Clean Description Column

In [91]:
# Work on a copy of the raw description column so 'description_list' stays untouched.
# NOTE(review): the text is copied verbatim here -- no HTML stripping happens in
# this cell; confirm whether tags are removed upstream.
df = df.assign(description=df['description_list'])

In [92]:
def clean_text(text, word_list=False):
    """Normalise a piece of free text.

    Literal ``\\n`` sequences, real newlines and the punctuation characters
    ``( ) , ? . ! @ # $ & * : /`` are each replaced by a single space; the
    result is stripped of surrounding whitespace and lower-cased.

    Parameters
    ----------
    text : str
        Text to normalise.
    word_list : bool, default False
        When true, return the non-empty space-separated tokens instead of the
        normalised string.

    Returns
    -------
    str or list of str
        The normalised string, or its tokens when ``word_list`` is true.
    """
    # The substitution is the same for both return shapes, so do it once.
    normalized = re.sub(r'\\n|\n|[(),?.!@#$&*:\/]', ' ', text).strip().lower()

    if word_list:
        # Splitting on a single space leaves '' between consecutive spaces; drop those.
        return [word for word in normalized.split(' ') if word]

    return normalized

# Create Condition Column

In [93]:
def extract_condition(description_word_list):
    """Return the first condition keyword found in a tokenised description.

    The tokens are scanned in order. The first token that is one of
    ``new``, ``excellent``, ``great``, ``good``, ``fair`` or ``poor`` decides
    the result: ``'like_new'`` when that token is ``new`` and is immediately
    preceded by ``like``, otherwise the token itself.

    Parameters
    ----------
    description_word_list : sequence of str
        Lower-cased word tokens (e.g. from ``clean_text(..., word_list=True)``).

    Returns
    -------
    str or None
        The condition label, or ``None`` when no keyword is present.
    """
    condition_words = ('new', 'excellent', 'great', 'good', 'fair', 'poor')

    for i, word in enumerate(description_word_list):
        if word not in condition_words:
            continue
        # Only "like new" is a compound condition. The i > 0 guard stops index
        # -1 from wrapping around to the last token when the keyword comes first.
        if word == 'new' and i > 0 and description_word_list[i - 1] == 'like':
            return 'like_new'
        return word

    return None
    

In [94]:
# Tokenise every description, then label each row with the first condition
# keyword found among its tokens (None when there is none).
description_tokens = df['description'].apply(clean_text, word_list=True)
df['description_word_list'] = description_tokens
df['condition'] = df['description_word_list'].apply(extract_condition)

In [95]:
# Fall back to the listing title for rows whose description produced no condition.
# Writing through a boolean mask with .loc replaces the earlier assignment into
# a filtered copy of df, which is what triggered SettingWithCopyWarning.
#
# NOTE(review): clean_text is called without word_list=True here, so
# extract_condition iterates over single characters and can never match a
# keyword -- this fallback currently fills nothing. Passing word_list=True would
# recover rows, but it changes which rows survive the dropna below and therefore
# shifts the row labels hard-coded in the manual dimension corrections near the
# end of the notebook. Fix both together.
missing_condition = df['condition'].isnull()
df.loc[missing_condition, 'condition'] = df.loc[missing_condition, 'title'].apply(
    lambda title: extract_condition(clean_text(title))
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [96]:
# Discard listings that still have no condition label, reporting the row count
# before and after.
rows_before = len(df)
print('Previous Count: ', rows_before)

df = df.dropna(subset=['condition'])

rows_after = len(df)
print('Current Count: ', rows_after)

Previous Count:  3023
Current Count:  2723


# Create Separate Board Dimension Columns

In [97]:
def dimension_to_float(board_dimensions, pattern, dimension_type):
    """Convert one component of a board-dimension string into a float.

    ``pattern`` is applied to the stripped ``board_dimensions`` with
    ``re.findall`` and the matched numeric tokens are combined according to
    how many were found.

    Parameters
    ----------
    board_dimensions : str
        Raw dimension text of a listing.
    pattern : str or compiled pattern
        Regular expression whose matches are the numeric tokens of the wanted
        dimension.
    dimension_type : str
        ``'height'`` selects the two-token height encoding; any other value
        (``'width'`` and ``'thickness'`` in this notebook) selects the
        whole/decimal/fraction encoding.

    Returns
    -------
    float
        * no token: NaN
        * one token: that token as a float
        * height, two tokens: ``first + second / 100`` (so tokens 6 and 6
          give 6.06 -- an encoding, not decimal feet)
        * height, more than two tokens: NaN
        * other, two tokens: the second token is read as decimal digits
          (one digit -> tenths; two or more -> its first two digits as
          hundredths)
        * other, three or more tokens: ``first + second / third``; NaN when
          the third token is zero
    """
    tokens = re.findall(pattern, board_dimensions.strip())
    token_count = len(tokens)

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    if token_count == 0:
        return np.nan

    if token_count == 1:
        return float(tokens[0])

    if dimension_type == 'height':
        if token_count == 2:
            return int(tokens[0]) + (int(tokens[1]) / 100)
        # Unexpected extra tokens: report as missing rather than returning None.
        return np.nan

    whole = int(tokens[0])

    if token_count == 2:
        decimals = tokens[1]
        if len(decimals) == 1:
            return whole + (int(decimals) / 10)
        # Only the first two digits are kept, e.g. '875' contributes 0.87.
        return whole + (int(decimals[:2]) / 100)

    # Three or more tokens: whole number followed by numerator and denominator.
    numerator = int(tokens[1])
    denominator = int(tokens[2])
    if denominator == 0:
        # A malformed fraction is treated as missing, not a ZeroDivisionError.
        return np.nan
    return whole + (numerator / denominator)
                                    

In [98]:
# Verbose-mode (re.X) patterns whose matches are the numeric tokens of one
# dimension. Every alternative matches digits only; the surrounding separators
# are checked with lookbehind/lookahead so they stay out of the match.

# height_pattern: the 1-2 digit number at the very start of the string (feet)
# and the 1-2 digit number directly after a feet mark (inches); each must be
# followed, after optional quote marks, by whitespace and an "x"/"X".
height_pattern = re.compile(r"""
                                                             # RE Pattern Description 
(?<=^)\d{1,2}(?=\'?\s[Xx])                                   # (?<= Start of Str) Feet (?= X or x)
|(?<=^)\d{1,2}(?=\'?\d{1,2}\"?\s[Xx]|\'?\d{1,2}\'\'?\s[Xx])  # (?<= Start of Str) Feet (?= Inches, '' or ", X or x)
|(?<=\d\')\d{1,2}(?=\"?\s[Xx]|\'\'?\s[Xx])                   # (?<= Feet) Inches (?= ' or '' or , X or X)
""", re.X)

# width_pattern: tokens that sit after an "x " separator and before the next
# " x": a two-digit whole number, a two-digit decimal, or the whole number,
# numerator and denominator of a "NN a/b" fraction (three separate matches).
width_pattern = re.compile(r"""
(?<=[Xx]\s)\d{2}(?=\"?\s[Xx])                                # Inches (?= X or x)
|(?<=[Xx]\s)\d{2}\.\d+(?=\"?\s[Xx])                          # Float Inches (?= X or x)
|(?<=[Xx]\s)\d{2}(?=\s\d+\/\d+\"?\s[Xx])                     # Inches (?= Fraction, X or x)
|(?<=\d{2}\s)\d+(?=\/\d+\"?\s[Xx])                           # Numerator Inches (?= /Denominator, X or x)
|(?<=\d\/)\d+(?=\"?\s[Xx])                                   # Denominator Inches (?= X or x)
""", re.X)


# thickness_pattern: tokens that sit after an "x " separator and are terminated
# by the end of the string or by " (": a single-digit whole number, a
# single-digit decimal, or the whole number, numerator and denominator of a
# "N a/b" fraction (three separate matches).
thickness_pattern = re.compile(r"""
(?<=[Xx]\s)\d(?=\s\d+\/\d+$)                                 # (?<= X or x) Inches (?= Fraction, End of Str)
|(?<=[Xx]\s)\d(?=\s\d+\/\d+\s\()                             # (?<= X or x) Inches (?= Fraction, Board Liters)
|(?<=[Xx]\s)\d(?=\s\(|$)                                     # (?<= X or x) Inches (?= End of Str)
|(?<=[Xx]\s)\d\.\d+(?=\s\(|$)                                # (?<= X or x) Float Inches (?= Board Liters OR End of Str)
|(?<=\d\s)\d+(?=\/\d+$)                                      # (?<= Inches) Numerator Inches (?= /Denominator, End of Str)
|(?<=\d\s)\d+(?=\/\d+\s\()                                   # (?<= Inches) Numerator Inches (?= /Denominator, Board Liters)
|(?<=\d\/)\d+(?=\s\()                                        # (?<= Numerator/) Denominator Inches (?= Board Liters)
|(?<=\d\/)\d+(?=$)                                           # (?<= Numerator/) Denominator Inches (End of Str)
""", re.X)

In [99]:
# Parse each dimension with its own pattern. The label doubles as the target
# column name and as the dimension_type handed to dimension_to_float.
dimension_patterns = (
    ('height', height_pattern),
    ('width', width_pattern),
    ('thickness', thickness_pattern),
)
for dimension_type, pattern in dimension_patterns:
    df[dimension_type] = df['board_dimensions'].apply(
        dimension_to_float, pattern=pattern, dimension_type=dimension_type
    )

# Remove the currency symbol, thousands separators and stray whitespace before
# converting, so a price written like "$1,120.00" parses where it used to raise
# ValueError.
df['clean_price'] = df['price'].str.replace(r'[$,\s]', '', regex=True).astype(float)

# Normalise the free-text identifiers with the same cleaner used for descriptions.
df['clean_manufacturer'] = df['manufacturer'].apply(clean_text)
df['clean_model_name'] = df['model_name'].apply(clean_text)




In [100]:
# Percentage of missing values per column (the mean of the boolean null mask, times 100)
df.isnull().mean() * 100

manufacturer             0.000000
model_name               0.000000
board_dimensions         0.000000
price                    0.000000
url                      0.000000
title                    0.000000
description_list         0.000000
description              0.000000
description_word_list    0.000000
condition                0.000000
height                   1.175174
width                    0.220345
thickness                0.036724
clean_price              0.000000
clean_manufacturer       0.000000
clean_model_name         0.000000
dtype: float64

In [101]:
# Discard listings with any unparsed dimension and renumber the rows from 0,
# reporting the row count before and after.
rows_before = len(df)
print('Previous Count: ', rows_before)

dimension_columns = ['height', 'width', 'thickness']
df = df.dropna(subset=dimension_columns)
df = df.reset_index(drop=True)

print('Current Count: ', len(df))


Previous Count:  2723
Current Count:  2685


# Export Clean Json

In [102]:
# Keep only the cleaned fields that go into the export
export_columns = [
    'clean_manufacturer',
    'clean_model_name',
    'clean_price',
    'description_word_list',
    'condition',
    'height',
    'width',
    'thickness',
]
clean_df = df[export_columns]

In [103]:
# Drop the 'clean_' prefix so the exported columns carry the plain field names
prefixed_fields = ('manufacturer', 'model_name', 'price')
clean_df = clean_df.rename(
    columns={'clean_' + field: field for field in prefixed_fields}
)

In [104]:
# Preview the first five cleaned rows
clean_df.head(5)

Unnamed: 0,manufacturer,model_name,price,description_word_list,condition,height,width,thickness
0,7s,superfish,325.0,"[this, board, is, in, excellent, condition, with, light, pressures, and, no, dings, fins, and, traction, are, available, at, a, discounted, price, with, all, surfboard, purchases, the, superfish, is, a, great, balance, of, stability, and, turning, in, a, fish, hybrid]",excellent,6.06,20.75,2.75
1,wave tools,classic twin fin,525.0,"[this, board, is, in, excellent, near, new, condition, with, no, very, light, pressures, and, no, dings, fins, are, available, at, a, discounted, price, with, all, surfboard, purchases, wave, tools, twin, fins, became, the, 1, seller, over, all, surf, brands, in, the, 1980’s, when, lance, collins, developed, the, lift, and, rails, that, made, twins, so, successful]",excellent,6.02,21.5,3.125
2,easy button,mini longboard 4+1,575.0,"[this, board, is, brand, new, full, resin, tint, with, fin, and, deck, patch, this, is, easy, button’s, no, logo, version, fins, are, available, at, a, discounted, price, with, all, surfboard, purchases, american, made, classic, longboard, shape, in, a, mini, version, super, fun, and, maneuverable, to, surf, gives, you, the, paddle, power, of, a, longboard, without, having, to, lug, around, a, 9′, plus, board, concave, scooped, nose, 4+1, fcs, fin, system, can, be, ridden, as, a, quad, 2+1, or, single]",new,7.06,22.0,2.875
3,mandala,parabola mini glider,1120.0,"[this, board, is, brand, new, fins, are, available, at, a, discounted, price, with, all, surfboard, purchases, the, parabola, has, an, almost, parabolic, template, meaning, the, nose, and, tail, are, almost, the, same, width, it, has, a, hulled, out, bottom, and, has, minimal, rocker, all, making, for, a, fast, and, very, fun, board]",new,8.08,23.0,3.125
4,minard,mid-length,375.0,"[this, board, is, in, excellent, condition, with, no, dings, and, very, mild, pressures]",excellent,6.1,20.75,2.875


In [105]:
# Manual corrections for listings whose dimensions were mistyped at the source.
# NOTE(review): the row labels below refer to the index created by the earlier
# reset_index(drop=True); any change to which rows are dropped upstream shifts
# them, so re-check these labels whenever the filtering changes.
manual_corrections = (
    (1760, 'height', 10.0),      # entered as 10" instead of 10'
    (386, 'thickness', 2.438),   # entered as 7/16" instead of 2 7/16"
    (1291, 'thickness', 2.438),  # entered as 7/16" instead of 2 7/16"
    (1437, 'thickness', 2.625),  # entered as 5/8" instead of 2 5/8"
)
for row_label, column, corrected_value in manual_corrections:
    clean_df.at[row_label, column] = corrected_value


In [106]:
# Write the cleaned listings to disk as JSON (pandas' default orientation)
clean_output_path = 'clean_data/clean_board_source_data.json'
clean_df.to_json(clean_output_path)