In [1]:
import pandas as pd
import numpy as np
import re
import datetime
from os.path import exists

In [2]:
places_raw = pd.read_excel('places.xlsx', header=None)
places = places_raw[0].values

In [3]:
index=0
for city in places:
    if exists('data_craigslist'+str(city)+'.csv'):
        if index == 0:
            df = pd.read_csv('data_craigslist'+str(city)+'.csv').drop(columns=['Unnamed: 0'])
            index += 1
        else:
            df = pd.concat([df, pd.read_csv('data_craigslist'+str(city)+'.csv').drop(columns=['Unnamed: 0'])], axis=0, ignore_index=True)

In [4]:
# Gets rid of the rows where we don't have any features
new_index = []
for ind in df.index:
    try:
        if df.loc[ind].count() >= 7:
            new_index.append(ind)
    except:
        pass

df = df.loc[new_index].reset_index(drop=True)

In [5]:
df.columns

Index(['Price', 'Location', 'URL', 'Date', 'Title', 'num image', 'text',
       'condition', 'make / manufacturer', 'model name / number',
       'bicycle frame material', 'bicycle type', 'brake type',
       'electric assist', 'frame size', 'handlebar type', 'suspension',
       'wheel size', 'size / dimensions', 'serial number', 'part type',
       'paint color', 'year manufactured', 'engine hours (total)',
       'length overall (LOA)', 'propulsion type'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3953 entries, 0 to 3952
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Price                   3953 non-null   float64
 1   Location                3953 non-null   object 
 2   URL                     3953 non-null   object 
 3   Date                    3953 non-null   object 
 4   Title                   3953 non-null   object 
 5   num image               3953 non-null   int64  
 6   text                    3953 non-null   object 
 7   condition               1789 non-null   object 
 8   make / manufacturer     1567 non-null   object 
 9   model name / number     1095 non-null   object 
 10  bicycle frame material  1739 non-null   object 
 11  bicycle type            1739 non-null   object 
 12  brake type              635 non-null    object 
 13  electric assist         426 non-null    object 
 14  frame size              1634 non-null   

In [7]:
print(df['bicycle frame material'].unique())

[nan ' aluminum' ' alloy' ' other/unknown' ' steel' ' carbon fiber'
 ' titanium' ' composite' ' scandium']


In [8]:
def fix_bfm(x):
    if pd.isna(x) or x.strip() == 'other/unknown':
        return 'unknown'
    else:
        return x.strip()

df['bicycle frame material'] = df['bicycle frame material'].apply(fix_bfm)

In [9]:
print(df['bicycle frame material'].unique())

['unknown' 'aluminum' 'alloy' 'steel' 'carbon fiber' 'titanium'
 'composite' 'scandium']


In [10]:
df['bicycle type'].unique()

array([nan, ' mountain', ' other', ' kids', ' hybrid/comfort', ' cruiser',
       ' bmx', ' road', ' unicycle', ' folding', ' gravel',
       ' recumbent/trike', ' cyclocross', ' tandem', ' cargo/pedicab',
       ' track'], dtype=object)

In [11]:
def fix_bt(x):
    if pd.isna(x) or x.strip() == 'other':
        return 'unknown'
    else:
        return x.strip()

df['bicycle type'] = df['bicycle type'].apply(fix_bt)

In [12]:
df['bicycle type'].unique()

array(['unknown', 'mountain', 'kids', 'hybrid/comfort', 'cruiser', 'bmx',
       'road', 'unicycle', 'folding', 'gravel', 'recumbent/trike',
       'cyclocross', 'tandem', 'cargo/pedicab', 'track'], dtype=object)

In [13]:
df['frame size'].unique()

array([nan, ' Medium', ' Large', ' 26 inch', ' Fits most bikes!',
       ' 12” wheels', ' 18', ' 0', ' 8"', ' 24”', ' All', ' 26" and 24"',
       ' medium', ' 19.5', ' any', ' 56cm', ' 54 cm, medium', ' 60cm',
       ' All sizes', ' 16 inch', ' unknown', ' N/A', ' Unknown', ' None',
       ' 15 inch', ' 60 cm', ' 18 "', ' 19in', ' 26 in',
       ' 42x40x10 inches', ' 52 cm', ' Any size', ' 15"', ' 18.5”',
       ' n/a', ' Kids size', ' 50cm', " Women's Medium", ' 20" wheels',
       ' 26in.', ' Medium/Large', ' small', ' Small', " don't reall know",
       ' 54cm medium17.5', ' Adult', ' good size', ' MEDIUM TO LARGE',
       ' 27"', ' 20"', ' XXL', ' 65cm', ' 18.25 in (46.35 cm)', ' 20',
       ' child size', ' 49 to 63 cm', ' 26”', ' 16', ' 26"', " 26'",
       ' 16”, 12”', ' About 16 inches.', ' ?', ' 24"', ' .', ' 26',
       ' 17 inch, 25 inch, etc.', ' 17.5 in  54 in',
       ' Standard Cruiser Size', ' kids', ' see pictures',
       ' 19inches 49cm', ' Heavy-duty 57 cm.', ' 20”

In [14]:
def fix_fs(x):
    if pd.isna(x):
        return 'unknown'
    numbers = re.findall(r'\d+', x)
    if len(numbers) == 0:
        if 'medium' in x.lower():
            return 2
        elif 'large' in x.lower():
            return 3
        elif 'small' in x.lower():
            return 1
        else:
            return 'unknown'
    else:
        if '"' or "'" or 'inches' or 'inch' or 'in' in x.lower(): #it's in inches
            size = float(numbers[0]) # assume it's the first number
            if size < 16:
                return 1
            if size > 19:
                return 3
            else:
                return 2
        else: #cm
            size = float(numbers[0])
            if size < 53:
                return 1
            if size > 56:
                return 3
            else:
                return 2

df['frame size'] = df['frame size'].apply(fix_fs)

In [15]:
df['frame size'].unique()

array(['unknown', 2, 3, 1], dtype=object)

In [16]:
df['suspension'].unique()

array([nan, ' suspension fork (hardtail)', ' none (rigid)',
       ' other/unknown', ' frame and fork (full suspension)'],
      dtype=object)

In [17]:
def fix_s(x):
    if pd.isna(x):
        return 0
    if 'fork' in x.lower():
        return 1
    else:
        return 0

df['suspension'] = df['suspension'].apply(fix_s)

In [18]:
df['suspension'].unique()

array([0, 1], dtype=int64)

In [19]:
df['wheel size'].unique()

array([nan, ' 26 in', ' other/unknown', ' 12 in', ' 24 in', ' 20 in',
       ' 27 in', ' 700C', ' 25 in', ' 29 in', ' 14 in', ' 16 in',
       ' 10 in', ' 650B', ' 18 in', ' 27.5 in', ' 28 in', ' 26.5 in',
       ' 650C'], dtype=object)

In [20]:
def fix_ws(x):
    if pd.isna(x):
        return 'unknown'
    if '700' in x.strip():
        return 29
    if '650' in x.strip():
        return 26
    numbers = re.findall(r'\d+', x)
    if len(numbers)==0:
        return 'unknown'
    else:
        if int(numbers[0]) > 30:
            return 'unknown'
        else:
            return int(numbers[0])

df['wheel size'] = df['wheel size'].apply(fix_ws)

In [21]:
df['wheel size'].unique()

array(['unknown', 26, 12, 24, 20, 27, 29, 25, 14, 16, 10, 18, 28],
      dtype=object)

In [22]:
df['condition'].unique()

array([nan, ' excellent', ' new', ' good', ' salvage', ' like new',
       ' fair'], dtype=object)

In [23]:
def fix_c(x):
    if pd.isna(x):
        return 'unknown'
    x = str(x)
    if 'good' in x:
        return 3
    if 'fair' in x:
        return 2
    if 'salvage' in x or 'poor' in x:
        return 1
    if 'new' or 'excellent' in x:
        return 4
    
    return 'unknown'


df['condition'] = df['condition'].apply(fix_c)

In [24]:
df['condition'].unique()

array(['unknown', 4, 3, 1, 2], dtype=object)

In [25]:
df['electric assist'].unique()

array([nan, ' none', ' other', ' throttle', ' pedal assist'], dtype=object)

In [26]:
def fix_ea(x):
    if pd.isna(x):
        return 0
    x = str(x)
    if 'other' in x or 'none' in x:
        return 0
    return 1

df['electric assist'] = df['electric assist'].apply(fix_ea)

In [27]:
df['electric assist'].unique()

array([0, 1], dtype=int64)

In [28]:
df['make / manufacturer'].unique()

array([nan, ' Thule', ' BodyKore', ' Trek', ' Huffy', ' Diamondback',
       ' joystar', ' critical', ' huffy', ' SE Bikes', ' Saris',
       ' Royce Union, Huffy', ' Kent', ' Real tree', ' Cyberbike',
       ' all makes', ' NS Bikes', ' Fuji', ' Allen', ' Schwinn', ' Sun',
       ' Marin', ' Sole', ' Specialized', ' Life Fitness, Cybex, Hammer',
       ' Yakima', ' HUSQVARNA', ' Gatormade Trailers', ' roadmaster',
       ' Orbea', ' Bikemate', ' Suzuki', ' Viathon', ' Harley Davidson',
       ' Tony Hawk', ' Scott', ' Stamina', ' central bike', ' Litespeed',
       ' Jamis', ' mongoose', ' Rollfast', ' Giant', ' Windsor',
       ' Coleman', ' Rad Power Bikes', ' Maxxhaul', ' Dynacraft',
       ' grabber', ' Sears, JC Higgins', ' Salsa', ' Rad Power Bike',
       ' BMX', ' Hollywood', ' Triton', ' CUB CADET', ' BASS CAT',
       ' hotwheels', ' Charge Bikes', ' Free Spirit', ' All', ' Gibson',
       ' Bjorn', ' Lamar Trailers', ' Yamaha', ' sixthreezero',
       ' Rock Solid Cargo', '

In [29]:
def fix_mm(x):
    if pd.isna(x) or 'unknown' in x.lower():
        return 0
    return 1

df['make / manufacturer'] = df['make / manufacturer'].apply(fix_mm)

In [30]:
df['make / manufacturer'].unique()

array([0, 1], dtype=int64)

In [31]:
df['model name / number'].unique()

array([nan, ' 961XT Speedway', ' G703', ' 4300', ' Comfort tek', ' cub',
       ' So Cal Flyer', ' Grand Slam/4', ' See description', ' Four Bike',
       ' Unknown', ' Holy', ' Ace 24', ' Panther', ' Bobcat Trail 4',
       ' SB900', ' Ruby Pro', ' Dr. Tray plus one', ' ZERO TURN MOWER',
       ' Hotshot/ Gooseneck Trailer', ' Hardrock', ' power', ' Alma Cup',
       ' Hotrock', ' Gxsr', ' M.1', ' Sportster SuperLow XL883',
       ' Duosonic 8053-38', ' CR1 Team', ' Elite', ' ben hur',
       ' Sabre Titanium', ' Ariel', ' Satellite', ' FastRoad SLR',
       ' Rascal', ' Wellington', ' CT200AB', ' RadRunner2', ' grabber',
       ' 8002114', ' Campeon', ' Rad Rover', ' Duo glide', ' MONGOOSE',
       ' 914XT', ' HR8000', ' 179 TRX', ' LT 42', ' PANTERA', ' na',
       ' Radwagon Caboose', ' Breeze', ' All', ' 83" x 14\'', ' WR 250F',
       " Around the Block Men's Cruiser", " 7' x 16' x 7'",
       ' Super Cycle Shuttle', ' gamefisher', ' Series 50', ' General',
       ' COMPACT', ' L

In [32]:
def fix_mn(x):
    if pd.isna(x) or 'unknown' in x.lower():
        return 0
    return 1

df['model name / number'] = df['model name / number'].apply(fix_mn)

In [33]:
df['model name / number'].unique()

array([0, 1], dtype=int64)

In [34]:
df['brake type'].unique()

array([nan, ' cantilever', ' disc (mechanical)', ' caliper', ' none',
       ' other/unknown', ' coaster', ' disc (hydraulic)', ' gyro/bmx',
       ' u-brakes', ' v-brakes', ' drum', ' hydraulic rim brakes'],
      dtype=object)

In [35]:
def fix_bt(x):
    if pd.isna(x) or 'other/unknown' in x.lower() or 'none' in x.lower():
        return 'unknown'
    else:
        x = str(x)
        if 'hydraulic' in x:
            return 'hydraulic'
        elif 'mechanical' in x:
            return 'mechanical'
        else:
            return 'special'

df['brake type'] = df['brake type'].apply(fix_bt)

In [36]:
df['brake type'].unique()

array(['unknown', 'special', 'mechanical', 'hydraulic'], dtype=object)

In [37]:
df['handlebar type'].unique()

array([nan, ' flat', ' bmx', ' other/unknown', ' drop', ' cruiser',
       ' triathlon', ' riser', ' downhill', ' aero', ' bullhorn'],
      dtype=object)

In [38]:
def fix_ht(x):
    if pd.isna(x) or 'other/unknown' in x:
        return 'unknown'
    else:
        return 'special'

df['handlebar type'] = df['handlebar type'].apply(fix_ht)

In [39]:
df['handlebar type'].unique()

array(['unknown', 'special'], dtype=object)

In [40]:
df['part type'].unique()

array([nan, ' other', ' racks, fenders, bags', ' saddle',
       ' grips, bar ends, tape', ' tires', ' wheels and wheel parts',
       ' crankset, bottom bracket, guards', ' brakes', ' pedals',
       ' headset, stem, spacers', ' frame only'], dtype=object)

In [41]:
def fix_pt(x):
    if pd.isna(x) or 'other' in x.lower():
        return 0
    else:
        return 1

df['part type'] = df['part type'].apply(fix_pt)
# Let's get rid of the listings about parts
df = df.where(df['part type']==0).dropna(how='all').drop(columns=['part type'])

In [42]:
df['size / dimensions'].unique()

array([nan, ' 1 1/4” hitch', ' 26.38 x 12.2 x 13.98',
       ' \u200e42 x 40 x 10 inches', ' 750', ' 56x9.5x4.5',
       ' 4-bike capacity', ' 168" X 86" X 0"', ' 192" X 84" X 0"',
       ' 168" X 83" X 0"', ' 26”', ' 2011---2016', ' 51x 48x30',
       ' 36 Volt 42 Volt 48 Volt', ' S, L, M, XL, XXL', ' See above',
       ' 5\'×17" UNFOLDED', ' 31.5 x 21 x 25.5 in (Interior)',
       ' 125cc 250cc 500cc', ' 8.5 x 28', " Women's size medium", ' 1111',
       ' Large', ' 22', ' na', ' 196cc', ' 9 x 22', ' 7.48 mm',
       ' 2500 sq feet', ' 6 feet', ' 12x5', ' 42', ' 59x32x42in',
       ' 7.5 feet tall approx', ' 5x1x3,12', " Men's Large",
       ' Fully Inspected', ' 41" x 20"', ' X-Large', ' 62"wide',
       ' 7X14X3 16K GN DUMP TRAILER', ' 25 LBS', ' 40’x 7’8”x 79”tall',
       ' 32" x 68" x 74"', ' 20”', ' 3x4', ' 600', ' Holds 4 Bikes',
       ' 52"', ' 3 Bike Rack', ' 2 bikes', ' 36',
       ' 10.32 x 7.52 x 2.4 inches', ' 110" x 36" wide x 30" high',
       ' 20 inch', ' 26x4.7', '

In [43]:
def fix_sd(x):
    if pd.isna(x):
        return 0
    else:
        return 1

df['size / dimensions'] = df['size / dimensions'].apply(fix_sd)

In [44]:
df['size / dimensions'].unique()

array([0, 1], dtype=int64)

In [45]:
df['paint color'].unique()

array([nan, ' black', ' grey', ' custom', ' white', ' silver', ' red'],
      dtype=object)

In [46]:
def fix_pc(x):
    if pd.isna(x):
        return 0
    else:
        return 1

df['paint color'] = df['paint color'].apply(fix_pc)

In [47]:
df['paint color'].unique()

array([0, 1], dtype=int64)

In [48]:
df['serial number'].unique()

array([nan, ' Unknown', ' F1068892', ' N/A', ' EN14872XINGING',
       ' TC19 222501', ' na', ' NSN not needed', ' Model GS32401', ' 000',
       ' 20A16575', ' M5060820', ' RU1F20V0502', ' yes', ' GS62660',
       ' un', ' EB720DO', ' S  / N', ' on request', ' none', ' M50',
       ' EZ-201304 468', ' 408 406 7864 cell', ' Var', ' Nome', ' 23-918',
       ' See Photos', ' unknown', ' EG010897', ' 00012345', ' .',
       ' various', ' Ninja 2021', ' Freestyle', ' ??', ' NA',
       ' WT0406C5218', ' WTU336CT0109R1207', ' N/a', ' 1234567890',
       ' CU5C21B3934', ' see photos'], dtype=object)

In [49]:
def fix_sn(x):
    if pd.isna(x) or 'unknown' in x.lower() or 'na' in x.lower() or 'none' in x.lower():
        return 0
    else:
        return 1

df['serial number'] = df['serial number'].apply(fix_sn)

In [50]:
df['serial number'].unique()

array([0, 1], dtype=int64)

In [51]:
df['year manufactured'].unique()

array([  nan, 2022., 2020., 2010., 1975., 2018., 2015., 1979., 2005.,
       2021., 2013., 1987., 2001., 2019., 2012., 2017., 2006.])

In [52]:
def fix_ym(x):
    if pd.isna(x):
        return 0
    else:
        return 1

df['year manufactured']=df['year manufactured'].apply(fix_ym)

In [53]:
df['year manufactured'].unique()

array([0, 1], dtype=int64)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3891 entries, 0 to 3952
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Price                   3891 non-null   float64
 1   Location                3891 non-null   object 
 2   URL                     3891 non-null   object 
 3   Date                    3891 non-null   object 
 4   Title                   3891 non-null   object 
 5   num image               3891 non-null   float64
 6   text                    3891 non-null   object 
 7   condition               3891 non-null   object 
 8   make / manufacturer     3891 non-null   float64
 9   model name / number     3891 non-null   float64
 10  bicycle frame material  3891 non-null   object 
 11  bicycle type            3891 non-null   object 
 12  brake type              3891 non-null   object 
 13  electric assist         3891 non-null   float64
 14  frame size              3891 non-null   

In [55]:
df.columns

Index(['Price', 'Location', 'URL', 'Date', 'Title', 'num image', 'text',
       'condition', 'make / manufacturer', 'model name / number',
       'bicycle frame material', 'bicycle type', 'brake type',
       'electric assist', 'frame size', 'handlebar type', 'suspension',
       'wheel size', 'size / dimensions', 'serial number', 'paint color',
       'year manufactured', 'engine hours (total)', 'length overall (LOA)',
       'propulsion type'],
      dtype='object')

In [56]:
df = df.drop(columns=['length overall (LOA)', 'propulsion type', 'engine hours (total)'])

In [57]:
df.columns = [column.replace('/', '').lower().strip().replace(' ', '') for column in df.columns]

In [58]:
df.columns

Index(['price', 'location', 'url', 'date', 'title', 'numimage', 'text',
       'condition', 'makemanufacturer', 'modelnamenumber',
       'bicycleframematerial', 'bicycletype', 'braketype', 'electricassist',
       'framesize', 'handlebartype', 'suspension', 'wheelsize',
       'sizedimensions', 'serialnumber', 'paintcolor', 'yearmanufactured'],
      dtype='object')

In [59]:
df['days'] = [(datetime.datetime.today()-datetime.datetime.strptime(date_, "%Y-%m-%d %H:%M")).days for date_ in df['date']]

In [60]:
df.to_csv('preprocessed_data_craigslist.csv')