# Scrape data from craigslist

## Imports

In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import math
import re

# from IPython.display import display

pd.options.display.max_columns = None
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 1000

## Setup

In [2]:
url = 'https://berlin.craigslist.org/d/housing/search/hhh'

In [3]:
html = requests.get(url).content
bs = BeautifulSoup(html, "html.parser")
print(bs.prettify())

<!DOCTYPE html>
<html class="no-js">
 <head>
  <title>
   Berlin Immo  – craigslist
  </title>
  <meta content="Berlin Immo  – craigslist" name="description"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible">
   <link href="https://berlin.craigslist.org/search/hhh" rel="canonical"/>
   <link href="https://berlin.craigslist.org/search/hhh?format=rss" rel="alternate" title="RSS-Feed für craigslist | Berlin Immo  - craigslist" type="application/rss+xml"/>
   <link href="https://berlin.craigslist.org/search/hhh?s=120" rel="next"/>
   <meta content="width=device-width,initial-scale=1" name="viewport"/>
   <link href="//www.craigslist.org/styles/cl.css?v=784f71fcd82733e2b32e8d3284beaf58" media="all" rel="stylesheet" type="text/css"/>
   <link href="//www.craigslist.org/styles/search.css?v=bc035cbbc3978b0ec9df93944cdf349b" media="all" rel="stylesheet" type="text/css"/>
   <link href="//www.craigslist.org/styles/jquery-ui-clcustom.css?v=3b05ddffb7c7f5b62066deff2dda9339" media="all" rel=

In [4]:
rows = (bs.find('body')
 # .find('section', {'class':'page-container'})
 # .find('form', {'id':'searchform'})
 # .find('div', {'class':'content'})
        .find('ul', {'class':'rows'}))

## Functions

#### Function `get_data_one_page`
Crawls data from one page (usually 120 entries) and outputs a df

In [5]:
def get_data_one_page(rows):

    # define empty lists to be filled with crawled information
    lst_id = []
    lst_link = []
    lst_title = []
    lst_price = []
    lst_size = []
    lst_hood = []
    lst_date = []
    lst_img_link = []
    lst_desc = []
    lst_num_rooms = []

    for result in rows.find_all('li', {'class': 'result-row'}):
        
        # get information from overview page
        lst_id.append(result['data-pid']) if result['data-pid'] is not None else lst_id.append(np.NaN)
        lst_link.append(result.find('a', {'class': 'result-title hdrlnk'})['href']) if result.find('a', {'class': 'result-title hdrlnk'}) is not None else lst_link.append(np.NaN)
        lst_title.append(result.find('a', {'class': 'result-title hdrlnk'}).text) if result.find('a', {'class': 'result-title hdrlnk'}) is not None else lst_title.append(np.NaN)
        lst_price.append(result.find('span', {'class': 'result-price'}).text) if result.find('span', {'class': 'result-price'}) is not None else lst_price.append(np.NaN)
        lst_hood.append(result.find('span', {'class': 'result-hood'}).text) if result.find('span', {'class': 'result-hood'}) is not None else lst_hood.append(np.NaN)
        lst_date.append(result.find('time', {'class': 'result-date'})['datetime'])
        
        if result.find('span', {'class': 'housing'}) is not None:
            size = result.find('span', {'class': 'housing'}).text
            lst_size.append(int(re.findall(r"(\d+)m2", size)[0])) if len(re.findall(r"(\d+)m2", size)) >= 1 else lst_size.append(np.NaN)
            lst_num_rooms.append(int(re.findall(r"(\d+)br", size)[0])) if len(re.findall(r"(\d+)br", size)) >= 1 else lst_num_rooms.append(np.NaN)
        else:
            lst_size.append(np.NaN)
            lst_num_rooms.append(np.NaN)
            
        
        # get image link and description from detailed web page (not overview page)
        url = result.find('a', {'class': 'result-title hdrlnk'})['href']
        html = requests.get(url).content
        bs = BeautifulSoup(html, "html.parser")
        
        lst_img_link.append(bs.find('img', {'title': '1'})['src']) if bs.find('img', {'title': '1'}) is not None else lst_img_link.append(np.NaN)
        lst_desc.append(bs.find('section', {'id': 'postingbody'}).text.replace('\n', ' ').replace('QR Code Link to This Post', '').strip()) if bs.find('section', {'id': 'postingbody'}) is not None else lst_desc.append(np.NaN)    
        
        
    # define dict based on the crawled data to create the dataframe    
    di_results = {
        'id': lst_id,
        'title': lst_title,
        'price': lst_price,
        'size': lst_size,
        'rooms': lst_num_rooms,
        'hood': lst_hood,
        'date': lst_date,
        'description': lst_desc,
        'link': lst_link,
        'img_link': lst_img_link,
    }
    
    return pd.DataFrame(di_results)
    


#### Function `get_all_data`
Loops over all search pages and calls `get_data_one_page` and outputs a df

In [6]:
def get_all_data(url_main):
    html = requests.get(url_main).content
    bs = BeautifulSoup(html, "html.parser")
    
    # calculate number of pages to loop over
    num_entries = int(bs.find('span', {'class': 'totalcount'}).text)
    to_range = int(bs.find('span', {'class': 'rangeTo'}).text)
    num_pages = math.floor(num_entries/to_range)
    
    # for each search page, retrieve information and append to a master df
    for num_page in range(num_pages+1):
        url = f'https://berlin.craigslist.org/d/housing/search/hhh?s={num_page*120}'
        html = requests.get(url).content
        bs = BeautifulSoup(html, "html.parser")
        rows = (bs.find('body').find('ul', {'class':'rows'}))
        try:
            df = df.append(get_data_one_page(rows), ignore_index=True)
        except:
            df = get_data_one_page(rows)
    
    return df
        
    

## Crawl Data

In [7]:
%time df_all = get_all_data(url)

CPU times: user 32.1 s, sys: 880 ms, total: 33 s
Wall time: 5min 56s


## Data Cleaning

In [8]:
df_output = df_all.copy()

#### Make neighborhood names consistent

In [9]:
df_output['hood'] = df_output['hood'].str.replace('(', '').str.replace(')', '')

In [10]:
# define dictionary

di_hoods = {
    'kreuzberg': 'Kreuzberg',
    'prenzlau': 'Prenzlauer Berg',
    'neuk': 'Neukölln',
    'mitte': 'Mitte',
    'friedrichsh': 'Friedrichshain'
}

In [11]:
for k, v in di_hoods.items():
    df_output['hood'] = np.where(df_output['hood'].str.contains(k, case=False), v, df_output['hood'])

#### Clean `price` column

In [12]:
df_output['price'] = df_output['price'].str.strip('€').astype(float)

In [13]:
# remove high prices (=outliers?!)

df_output = df_output[df_output['price'] < 5000]

In [19]:
# remove low prices (=outliers?!)

df_output = df_output[df_output['price'] >= 10]

## Output JSON

In [20]:
df_output.head(3)

Unnamed: 0,id,title,price,size,rooms,hood,date,description,link,img_link
0,7075804163,ROOFTOP STUDIO IN NEUKÖLLN NEAR VOLKSPARK HASENHEIDE AND RATHAUS NEUKÖ,500.0,52.0,1.0,Neukölln,2020-02-15 17:09,"App. 45 sqm (484 sf) 2 persons max. 5th floor (lift) Fully Equipped Kitchen Digital TV Open Kitchen Balcony / Terrace Washing Machine Sofa Microwave Bed Linen & Towels Bathroom with Shower WIFI Connection Living Room Green View Queen Size Bed 1,60x2m The Apartment This bright and cosy rooftop studio apartment is perfect for one person or a couple. There is a queen size bed, a big cupboard and a Sony Tv for home entertainment. The open plan kitchen is fully equipped including a microwave. Enjoy breakfast and meals at the dining table which can seat up to three persons or on the balcony. The bright bathroom has a shower and washing machine. The apartment can be reached comfortably by a lift Location The apartment is located in Neukoelln near Boddinstrasse and Rathaus Neukoelln. In this multicultural area you will find lots of students, clubs, cafes and loads of shops and diverse restaurants. Also the annual art festival 48 Stunden Neukoelln takes place every summer where hundred...",https://berlin.craigslist.org/apa/d/rooftop-studio-in-neuklln-near/7075804163.html,https://images.craigslist.org/00808_gN4yLg6eDWh_600x450.jpg
1,7068725043,(^Studio Apartment Available w/ New Appliances! Patio! Pets Allowed^),500.0,53.0,1.0,Mitte,2020-02-15 17:08,"The apartment is perfectly located on the well known Torstrasse strip in Berlin ""Mitte"". It's surrounded by restaurants, bars and galleries. Tram/subway station ""Oranienburger Straße"" is only a few hundred meters away and ""Rosenthaler Platz"" is easily reachable by foot. Main sights like the Brandenburger Gate, Reichstag, Wall Museum, Alexanderplatz etc. are as well located in walking distance.",https://berlin.craigslist.org/apa/d/studio-apartment-available-new/7068725043.html,https://images.craigslist.org/00101_dvC2DdmGxbg_600x450.jpg
2,7062094492,Möblierte 4-Zimmer Etage nahe S-Bahnhof Hermsdorf,1100.0,113.0,4.0,Kreuzberg,2020-02-15 16:58,"Möblierte 4-Zimmer Etage in Villensouterrain nahe S-Bahnhof Hermsdorf Der Villenvorort Hermsdorf liegt im Berliner Norden zwischen Waidmannslust und Frohnau. Sehr ruhige bevorzugte Lage - 3 Gehminuten vom S-Bahnhof Hermsdorf und allen Einkaufmöglichkeiten entfernt, ca. 23 Minuten direkt zum Berliner Stadtzentrum / Brandenburger Tor ca. 10 Autominuten zum Flughafen Tegel. ca. 113 qm große Küche mit Herd, Kühlschrank und Waschmaschine, Duschbad mit WC, DSL Anschluss Die Räume sind komplett eingerichtet und ausgestattet. Monatsmiete 1100,00 € kalt Nebenkosten Eigenverbrauch/Energie Kaution Erstkontakt bitte AUSSCHLIESSLICH telefonisch von 10.00 bis 19.00 Uhr keine Mails / SMS Bitte keine Vermittleranfragen",https://berlin.craigslist.org/sub/d/mblierte-4-zimmer-etage-nahe-bahnhof/7062094492.html,https://images.craigslist.org/00P0P_dyGp10XnUJn_600x450.jpg


In [21]:
df_output.to_json('data_v3.json', orient='records')