This notebook downloads Craigslist rental listings for Toronto proper and stores the information in a Google Cloud database.

Notable libraries used: python-craigslist (imported as `craigslist`), pandas, and NumPy.

In [88]:
#load library and class
from craigslist import CraigslistHousing
import pandas as pd
import numpy as np


In [89]:
# Build the Craigslist query: apartments/housing for rent ('apa') on the
# Toronto site, restricted to the 'tor' area.
cl = CraigslistHousing(site='toronto', area='tor', category='apa')

# Request the 100 newest listings, asking for geotag data on each one.
results = cl.get_results(sort_by='newest', geotagged=True, limit=100)

# Materialise the results into a list so they can be indexed and reused.
listings = list(results)

# Peek at the first three records to see what the data looks like.
print(listings[:3])

[{'id': '6841677680', 'repost_of': None, 'name': '255 Richmond E Parking & Utilities Icluded', 'url': 'https://toronto.craigslist.org/tor/apa/d/toronto-255-richmond-parking-utilities/6841677680.html', 'datetime': '2019-03-14 19:56', 'price': '$2200', 'where': '255 Richmond St E Toronto Unit 1410', 'has_image': True, 'has_map': True, 'geotag': (43.653118, -79.370401), 'bedrooms': '1', 'area': '550ft2'}, {'id': '6841677166', 'repost_of': None, 'name': 'Gorgeous University Apartment Space!', 'url': 'https://toronto.craigslist.org/tor/apa/d/toronto-gorgeous-university-apartment/6841677166.html', 'datetime': '2019-03-14 19:55', 'price': '$2650', 'where': '376 College St Toronto Ontario', 'has_image': True, 'has_map': True, 'geotag': (43.656996, -79.404929), 'bedrooms': '2', 'area': '800ft2'}, {'id': '6839891823', 'repost_of': '6375845070', 'name': 'Apart for rent', 'url': 'https://toronto.craigslist.org/tor/apa/d/toronto-apart-for-rent/6839891823.html', 'datetime': '2019-03-14 19:53', 'pric

In [122]:
# Convert the list of listing dicts into a DataFrame, normalise missing values
# and discard the fields that are not needed for analysis.

# Columns are dropped by NAME rather than by position: positional indices only
# hit the intended columns when pandas happens to sort the dict keys
# alphabetically, and silently remove the wrong fields otherwise.
UNUSED_COLUMNS = ['has_image', 'has_map', 'url']

df = (
    pd.DataFrame.from_records(listings)
    # Replace None with NaN so missing values are represented uniformly.
    # (`np.nan` is used directly; the `pd.np` alias is deprecated/removed.)
    .fillna(value=np.nan)
    .drop(columns=UNUSED_COLUMNS)
)

# Show the remaining column names.
list(df)

['area',
 'bedrooms',
 'datetime',
 'geotag',
 'id',
 'name',
 'price',
 'repost_of',
 'where']

In [123]:
# Split the raw `area` strings (e.g. '550ft2') into the numeric part, kept in
# `area`, and the unit of measurement, stored in `area_measure`.
#
# A regex is used instead of a fixed-width slice so that units of any length
# are separated correctly (a two-character unit such as 'm2' would otherwise
# steal the last digit of the number). Rows with a missing or unparsable area
# stay NaN in both columns.
#
# NOTE: run this once against the raw scraped strings; values that no longer
# carry a unit suffix will not match the pattern and become NaN.
area_parts = df['area'].str.extract(
    r'^\s*(?P<value>\d+(?:\.\d+)?)\s*(?P<unit>[^\d\s]\S*)\s*$'
)
df['area_measure'] = area_parts['unit']
df['area'] = area_parts['value']

# List the distinct units present to check whether imperial or metric units
# are used (no unit conversion is performed in this cell).
df.area_measure.unique()

array(['ft2', nan], dtype=object)

In [124]:
# Split the geotag into separate latitude and longitude columns.

def _parse_geotag(tag):
    """Return ``(latitude, longitude)`` as floats for one geotag value.

    Accepts either a two-element sequence such as ``(43.65, -79.37)`` or its
    flattened string form (``'43.65, -79.37'``, with or without brackets).
    Anything else (NaN, None, malformed text) yields ``(nan, nan)``.
    """
    if isinstance(tag, str):
        # Already flattened to text, e.g. by a previous run of this cell.
        tag = tag.strip('()').split(',')
    try:
        lat_value, lon_value = tag
        return float(lat_value), float(lon_value)
    except (TypeError, ValueError):
        # TypeError: tag is not a sequence (NaN / None).
        # ValueError: wrong number of parts or non-numeric text.
        return np.nan, np.nan


# Parse every row exactly once so latitude and longitude always stay aligned
# with the DataFrame rows, even when a geotag is missing or malformed.
coords = [_parse_geotag(tag) for tag in df['geotag']]

# Create two new columns from the parsed coordinates.
df['latitude'] = [lat for lat, _ in coords]
df['longitude'] = [lon for _, lon in coords]

# Store the geotag without its leading and trailing brackets ('lat, lon');
# rows without usable coordinates are left as NaN.
df['geotag'] = [
    '{}, {}'.format(lat, lon) if not (np.isnan(lat) or np.isnan(lon)) else np.nan
    for lat, lon in coords
]

In [152]:
# Convert the numeric columns to float64.
# Going through `str` first turns missing values into 'nan', which float()
# parses back to NaN; any other non-numeric text raises a ValueError.
# NOTE(review): `id` and `repost_of` are identifiers but are kept as float64
# to preserve the existing column dtypes -- confirm whether downstream code
# needs them as numbers.
NUMERIC_COLUMNS = ['area', 'bedrooms', 'id', 'latitude', 'longitude', 'repost_of']
for column in NUMERIC_COLUMNS:
    df[column] = df[column].astype(str).astype(float)

# Strip currency formatting ('$', thousands separators, whitespace) from
# price, then convert to float. Removing the symbols explicitly, instead of
# slicing off the first character, keeps this cell safe to re-run once the
# column is already numeric.
df['price'] = (
    df['price']
    .astype(str)
    .str.replace(r'[$,\s]', '', regex=True)
    .astype(float)
)

# Convert the datetime strings to pandas datetimes.
df['datetime'] = pd.to_datetime(df['datetime'])

df.dtypes

area            float64
bedrooms        float64
datetime         object
geotag           object
id              float64
name             object
price           float64
repost_of       float64
where            object
area_measure     object
latitude        float64
longitude       float64
dtype: object

area                   float64
bedrooms               float64
datetime        datetime64[ns]
geotag                  object
id                     float64
name                    object
price                  float64
repost_of              float64
where                   object
area_measure            object
latitude               float64
longitude              float64
dtype: object