In [193]:
import pandas as pd
import numpy as np
import re
import datetime

In [155]:
# Read the scraped listings, then discard three columns by name
# (presumably index artefacts left by earlier saves - TODO confirm).
columns_to_discard = ['Unnamed: 0', 'level_0', 'index']
df = pd.read_csv('data_craigslist.csv')
df = df.drop(columns=columns_to_discard)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Price                   2938 non-null   float64
 1   Neighborhood            2938 non-null   object 
 2   URL                     2938 non-null   object 
 3   Date                    2938 non-null   object 
 4   bicycle frame material  1864 non-null   object 
 5   bicycle type            1864 non-null   object 
 6   frame size              1749 non-null   object 
 7   suspension              829 non-null    object 
 8   wheel size              1749 non-null   object 
 9   num image               2938 non-null   int64  
 10  text                    2600 non-null   object 
 11  condition               1380 non-null   object 
 12  electric assist         575 non-null    object 
 13  make / manufacturer     1173 non-null   object 
 14  model name / number     713 non-null    

In [156]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Price,Neighborhood,URL,Date,bicycle frame material,bicycle type,frame size,suspension,wheel size,num image,text,condition,electric assist,make / manufacturer,model name / number,brake type,handlebar type,part type,size / dimensions,serial number
0,1900.0,"(Hudson, FL)",https://newyork.craigslist.org/brk/bik/d/hudso...,2022-04-14 10:05,,,,,,0,,,,,,,,,,
1,300.0,(Fairfield CT),https://newyork.craigslist.org/fct/bik/d/fairf...,2022-04-14 10:05,,,,,,0,,,,,,,,,,
2,190.0,(BROOKLYN),https://newyork.craigslist.org/brk/bik/d/brook...,2022-04-14 10:01,,,,,,0,,,,,,,,,,
3,250.0,(West Village),https://newyork.craigslist.org/mnh/bik/d/new-y...,2022-04-14 08:25,,,,,,0,,,,,,,,,,
4,800.0,(Brooklyn),https://newyork.craigslist.org/brk/bik/d/brook...,2022-04-14 00:22,,,,,,0,,,,,,,,,,


In [157]:
# Remove surrounding whitespace and one enclosing pair of parentheses,
# e.g. " (Brooklyn)" -> "Brooklyn".
# This is pattern-based rather than a fixed character slice, so values that lack
# the leading space or the parentheses are not truncated, and missing values
# stay NaN instead of raising.
df['Neighborhood'] = (
    df['Neighborhood']
    .str.strip()
    .str.replace(r'^\((.*)\)$', r'\1', regex=True)
)

In [158]:
# Show the distinct raw values of the 'bicycle frame material' column.
df['bicycle frame material'].unique()

array([nan, ' steel', ' titanium', ' other/unknown', ' aluminum',
       ' alloy', ' carbon fiber', ' composite'], dtype=object)

In [159]:
def fix_bfm(x):
    """Normalise a raw 'bicycle frame material' value.

    Returns the whitespace-stripped material name, or NaN when the value is
    missing or is the 'other/unknown' placeholder.
    """
    if pd.isna(x):
        return np.nan
    material = x.strip()
    return np.nan if material == 'other/unknown' else material

# Replace each raw frame-material value with its cleaned form.
df['bicycle frame material'] = df['bicycle frame material'].map(fix_bfm)

In [160]:
# Show the distinct raw values of the 'bicycle type' column.
df['bicycle type'].unique()

array([nan, ' other', ' bmx', ' track', ' road', ' mountain',
       ' hybrid/comfort', ' kids', ' folding', ' cruiser', ' gravel',
       ' cargo/pedicab'], dtype=object)

In [161]:
def fix_bt(x):
    """Normalise a raw 'bicycle type' value.

    Returns the whitespace-stripped type, or NaN when the value is missing or
    is the catch-all 'other'.
    """
    cleaned = None if pd.isna(x) else x.strip()
    if cleaned is None or cleaned == 'other':
        return np.nan
    return cleaned

# Replace each raw bicycle-type value with its cleaned form.
df['bicycle type'] = df['bicycle type'].map(fix_bt)

In [162]:
# Show the distinct raw (free-text) values of the 'frame size' column.
df['frame size'].unique()

array([nan, ' Medium', " I don't know, standard", ' 19', ' 54',
       ' all sizes', ' 58cm', ' 54cm medium', ' 16.5"', ' 1', ' All',
       ' 20”', ' 24”', ' 20"', ' 19"', ' 36cm  - 14 inches', ' 17”',
       ' 20 inches', ' 56cm', ' 36 inches wide', ' 26”', ' 23.6',
       ' 20.75', ' 20 inch', ' 21 Inches', ' Varies', ' 50 cm?',
       ' 29 Inches', ' 26', ' Large and Medium', ' ALL',
       ' 16 in (40.64 cm)', ' all', ' unknown', ' 18"', ' various',
       ' Regular size', ' large', ' 16", 18", or 20"', ' XL'],
      dtype=object)

In [163]:
# A number (decimals allowed) optionally followed by a unit marker: "cm", an inch
# word, or a quote mark (straight or typographic) used as an inch sign.
_FS_SIZE_RE = re.compile(r"""(\d+(?:\.\d+)?)\s*(cm|inches|inch|in\b|["'”″])?""")
# With no unit next to the number, values at or above this are read as
# centimetres and smaller ones as inches (heuristic).
_FS_UNITLESS_CM_CUTOFF = 40


def fix_fs(x):
    """Bucket a free-text 'frame size' value into 'small', 'medium' or 'large'.

    Parameters
    ----------
    x : str or NaN
        Raw frame-size text, e.g. ' 54cm medium', ' 19"', ' Large and Medium'.

    Returns
    -------
    str or float
        'small', 'medium' or 'large'; NaN when the value is missing or carries
        neither a number nor one of those three words.

    Notes
    -----
    Only the first number in the text is used, together with the unit written
    directly after it. Thresholds: inches < 16 small, > 19 large; centimetres
    < 53 small, > 56 large; anything in between is medium.

    The previous unit test was `'"' or "'" or ... in x.lower()`, which is always
    true because a non-empty string literal is truthy. Every number was therefore
    treated as inches and the centimetre branch could never run.
    """
    if pd.isna(x):
        return np.nan

    text = x.lower()
    match = _FS_SIZE_RE.search(text)
    if match is None:
        # No number at all: fall back to a size word, checked in the original order.
        for label in ('medium', 'large', 'small'):
            if label in text:
                return label
        return np.nan

    size = float(match.group(1))
    unit = match.group(2)
    if unit is None:
        # Bare number: guess the unit from its magnitude.
        in_cm = size >= _FS_UNITLESS_CM_CUTOFF
    else:
        in_cm = unit == 'cm'

    small_below, large_above = (53, 56) if in_cm else (16, 19)
    if size < small_below:
        return 'small'
    if size > large_above:
        return 'large'
    return 'medium'

# Replace each raw frame-size text with the value returned by fix_fs.
df['frame size'] = df['frame size'].map(fix_fs)

In [164]:
# Show the distinct raw values of the 'suspension' column.
df['suspension'].unique()

array([nan, ' frame and fork (full suspension)', ' none (rigid)',
       ' other/unknown', ' suspension fork (hardtail)'], dtype=object)

In [165]:
def fix_s(x):
    """Encode a raw 'suspension' value as a 0/1 flag.

    Returns 1 when the text mentions 'fork' (case-insensitive); returns 0 for
    every other value, including missing ones.
    """
    has_fork = (not pd.isna(x)) and 'fork' in x.lower()
    return 1 if has_fork else 0

# Replace each raw suspension text with its 0/1 flag.
df['suspension'] = df['suspension'].map(fix_s)

In [166]:
# Show the distinct raw values of the 'wheel size' column.
df['wheel size'].unique()

array([nan, ' 700C', ' 16 in', ' 20 in', ' other/unknown', ' 24 in',
       ' 26 in', ' 27 in', ' 29 in', ' 18 in', ' 12 in', ' 25 in',
       ' 27.5 in'], dtype=object)

In [167]:
def fix_ws(x):
    """Convert a raw 'wheel size' value to a number.

    Parameters
    ----------
    x : str or NaN
        Raw wheel-size text such as ' 26 in', ' 27.5 in' or ' 700C'.

    Returns
    -------
    int, float or NaN
        The first number found in the text: an int for whole sizes, a float for
        fractional ones such as 27.5. '700C' is mapped to 29. NaN is returned
        for missing values and for text without a number (e.g. 'other/unknown').

    Notes
    -----
    The number is parsed with its decimal part. Matching digits only turned
    ' 27.5 in' into 27, merging it with the separate ' 27 in' category.
    """
    if pd.isna(x):
        return np.nan
    if x.strip().upper() == '700C':
        return 29
    match = re.search(r'\d+(?:\.\d+)?', x)
    if match is None:
        return np.nan
    size = float(match.group())
    # Keep whole sizes as int so values that were already correct are unchanged.
    return int(size) if size.is_integer() else size

# Replace each raw wheel-size text with its numeric value.
df['wheel size'] = df['wheel size'].map(fix_ws)

In [168]:
# Show the distinct raw values of the 'condition' column.
df['condition'].unique()

array([nan, ' good', ' like new', ' excellent', ' new', ' fair'],
      dtype=object)

In [169]:
def fix_c(x):
    """Collapse a raw 'condition' value into 'good', 'excellent' or 'fair'.

    Parameters
    ----------
    x : str or NaN
        Raw condition text such as ' good', ' like new', ' fair'.

    Returns
    -------
    str or float
        'good' when the text contains 'good'; 'excellent' when it contains
        'new' or 'excellent'; 'fair' when it contains 'fair'; NaN for missing
        or unrecognised values.

    Notes
    -----
    Each keyword is tested against the text separately. The earlier form
    `'new' or 'excellent' in x` was always true (a non-empty string literal is
    truthy), so every value that was not 'good' - including 'fair' - came back
    as 'excellent'.
    """
    if pd.isna(x):
        return np.nan
    text = x.lower()
    if 'good' in text:
        return 'good'
    if 'new' in text or 'excellent' in text:
        return 'excellent'
    if 'fair' in text:
        return 'fair'
    # Explicit missing marker instead of an implicit None for unknown labels.
    return np.nan

# Replace each raw condition text with its collapsed category.
df['condition'] = df['condition'].map(fix_c)

In [170]:
# Show the distinct raw values of the 'electric assist' column.
df['electric assist'].unique()

array([nan, ' pedal assist', ' none', ' throttle'], dtype=object)

In [171]:
def fix_ea(x):
    """Encode a raw 'electric assist' value as a 0/1 flag.

    Returns 0 when the value is missing or contains 'none', and 1 for any
    other value.
    """
    has_assist = not (pd.isna(x) or 'none' in x)
    return int(has_assist)

# Replace each raw electric-assist text with its 0/1 flag.
df['electric assist'] = df['electric assist'].map(fix_ea)

In [172]:
# Show the distinct raw (free-text) values of the 'make / manufacturer' column.
df['make / manufacturer'].unique()

array([nan, ' Velosolex', ' Mongoose', ' trek giant Bianchi vitus',
       ' raleigh trek schwinn giant', ' Saris', ' Trek,Cannondale,GT',
       ' Haro GT Dyno Mongoose SE', ' trek giant diamondback',
       ' trek schwinn raleigh giant', ' TREK', ' Zizzo', ' SE',
       ' trek giant fuji schwinn raleigh', ' schwin, Huffy', ' Coaster',
       ' Shimano', ' Metakoo', ' SE Bikes', ' GT Bikes', ' NYC Bikes',
       ' fuji trek giat specialized', ' Se Bikes', ' Dynacraft',
       ' Rad Power Bikes', ' trek fuji giant raleigh', ' RAD POWER BIKES',
       ' RAD', ' jamis trek giant schwinn', ' trek fuji panasonic',
       ' trek giant gary fisher', ' Cyberbike', ' James', ' SportRack',
       ' Kazam', ' Honda', ' various', " I don't know",
       ' Schwinn Spin Bikes', ' Greenstar Bikes'], dtype=object)

In [173]:
def fix_mm(x):
    """Normalise a raw 'make / manufacturer' value.

    Parameters
    ----------
    x : str or NaN
        Raw free-text make, e.g. ' TREK', ' SE Bikes', " I don't know".

    Returns
    -------
    str or float
        'trek' or 'mongoose' when the text contains that name, 'se' when the
        text contains the standalone word 'se', otherwise the stripped,
        lower-cased text. NaN for missing or empty values, for text containing
        'various', and for "I don't know".

    Notes
    -----
    The text is stripped before it is compared with "i don't know", because raw
    values carry a leading space and an exact comparison on the raw text never
    matched. 'se' is matched as a whole word so that unrelated names which
    merely contain those letters (e.g. 'Jamis Sequel') are not collapsed to 'se'.
    """
    if pd.isna(x):
        return np.nan
    name = x.strip().lower()
    if not name or 'various' in name or name == "i don't know":
        return np.nan
    if 'trek' in name:
        return 'trek'
    if 'mongoose' in name:
        return 'mongoose'
    if re.search(r'\bse\b', name):
        return 'se'
    return name

# Replace each raw make text with its normalised form.
df['make / manufacturer'] = df['make / manufacturer'].map(fix_mm)

In [174]:
# Show the distinct raw values of the 'model name / number' column.
df['model name / number'].unique()

array([nan, ' Hutch', ' Bones', " Matte Trek Black/Blazin' Orange",
       ' Via', ' Fat', ' Stretch Rack', ' 350 Venture', ' BR-R7000',
       ' Trak100', ' Freewheel', ' CG BIG RIPPER',
       ' 2021 Pro Performer 20"', ' The Ultimate New York Bike',
       ' City Grounds Big Flyer', ' Mountain bikes',
       ' Radmini Step Thru 2', ' RadMini4', ' Woody Bike Stand',
       ' Rack Roof Bike Carrier SR4882RP', ' WILDERNESS TRAIL BIKES',
       ' Goldwing 1500', ' various', ' Ninja', ' AC Power', ' EcoCross'],
      dtype=object)

In [175]:
# Show the distinct raw values of the 'brake type' column.
df['brake type'].unique()

array([nan, ' other/unknown', ' u-brakes', ' gyro/bmx',
       ' disc (mechanical)', ' v-brakes', ' caliper', ' disc (hydraulic)',
       ' none'], dtype=object)

In [176]:
def fix_bt(x):
    """Normalise a raw 'brake type' value.

    Returns the stripped, lower-cased brake type, or NaN when the value is
    missing or contains 'other/unknown' or 'none' (case-insensitive).

    NOTE: this definition reuses the name of the earlier bicycle-type helper
    and replaces it in the kernel from this cell onwards.
    """
    if pd.isna(x):
        return np.nan
    lowered = x.lower()
    if any(marker in lowered for marker in ('other/unknown', 'none')):
        return np.nan
    return x.strip().lower()

# Replace each raw brake-type text with its normalised form.
df['brake type'] = df['brake type'].map(fix_bt)

In [177]:
# Show the distinct raw values of the 'handlebar type' column.
df['handlebar type'].unique()

array([nan, ' drop', ' other/unknown', ' flat', ' bmx', ' riser'],
      dtype=object)

In [178]:
def fix_ht(x):
    """Normalise a raw 'handlebar type' value.

    Returns the stripped, lower-cased handlebar type, or NaN when the value is
    missing or contains 'other/unknown'.
    """
    if not pd.isna(x) and 'other/unknown' not in x:
        return x.strip().lower()
    return np.nan

# Replace each raw handlebar-type text with its normalised form.
df['handlebar type'] = df['handlebar type'].map(fix_ht)

In [179]:
# Show the distinct raw values of the 'part type' column.
df['part type'].unique()

array([nan, ' other', ' racks, fenders, bags', ' brakes', ' saddle'],
      dtype=object)

In [180]:
def fix_pt(x):
    """Encode a raw 'part type' value as a 0/1 flag.

    Returns 0 when the value is missing or contains 'other', and 1 for any
    other value.
    """
    names_a_part = (not pd.isna(x)) and 'other' not in x
    return 1 if names_a_part else 0

# Flag rows whose 'part type' names a specific part (1) versus missing/'other' (0),
# keep only the rows flagged 0, then drop the helper column.
# A boolean mask selects the rows directly. The previous
# DataFrame.where(...).dropna(how='all') form blanked every rejected row to NaN
# first, which upcasts integer columns to float in the rows that are kept.
df['part type'] = df['part type'].apply(fix_pt)
df = df[df['part type'] == 0].drop(columns=['part type'])

In [181]:
# Discard the 'size / dimensions' and 'serial number' columns.
discarded_columns = ['size / dimensions', 'serial number']
df = df.drop(discarded_columns, axis='columns')

In [191]:
# Remove every space from the column names (e.g. 'frame size' -> 'framesize').
df.columns = df.columns.str.replace(' ', '', regex=False)

In [199]:
# Display the 'Date' column (still stored as strings at this point).
df['Date']

0       2022-04-14 10:05
1       2022-04-14 10:05
2       2022-04-14 10:01
3       2022-04-14 08:25
4       2022-04-14 00:22
              ...       
2933    2022-03-31 20:33
2934    2022-03-31 15:54
2935    2022-03-31 11:50
2936    2022-03-31 11:35
2937    2022-03-31 08:58
Name: Date, Length: 2823, dtype: object

In [202]:
# Age of each listing in whole days, measured against one reference timestamp.
# The timestamp is taken once for the whole column; calling today() per row
# re-reads the clock for every listing while the column is being built.
# NOTE: the result still depends on when this cell is run.
reference_time = pd.Timestamp(datetime.datetime.today())
posted_at = pd.to_datetime(df['Date'], format="%Y-%m-%d %H:%M")
df['days'] = (reference_time - posted_at).dt.days

In [207]:
# Write the cleaned frame to CSV. The row index is written as well (to_csv
# default), so the file starts with an unnamed index column.
df.to_csv('preprocessed_craiglist_data.csv')