In [103]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup

# Result table: one row per successfully parsed listing, in this column order.
columns = ['Title', 'Price', 'Sq_Footage', 'District']
df = pd.DataFrame(columns=columns)

# Listing source: the city slug is substituted into the OLX flats-for-sale URL.
# The URL ends with '?page=' so a page number can be appended to it directly.
city = 'gdansk'
url = 'https://www.olx.pl/nieruchomosci/mieszkania/sprzedaz/{}/?page='.format(city)

In [104]:
# Number of listing pages to scrape; the scraping loop requests pages 1..pages.
pages = 1

In [105]:
# Titles longer than this are cut down to exactly this many characters.
TITLE_MAX_LEN = 40
# Seconds to wait for a page before giving up (requests has no default timeout).
REQUEST_TIMEOUT = 10
# Districts whose names have two words; only the first word survives the
# "cut at the first space" step, so the full name is restored here.
TWO_WORD_DISTRICTS = {'Nowy': 'Nowy Port', 'Wyspa': 'Wyspa Sobieszewska'}


def _shorten_title(title, max_len=TITLE_MAX_LEN):
	"""Return `title` unchanged if it fits in `max_len` characters, otherwise
	cut it so that the result, including a trailing '...', is `max_len` long."""
	if len(title) > max_len:
		return title[:max_len - 3] + '...'
	return title


def _parse_price(text):
	"""Convert a price label (digits, optionally space-separated, followed by
	a 'z...' currency suffix) to a float.

	Raises ValueError if the label contains no 'z' or no parsable number.
	"""
	return float(text[:text.index('z')].replace(' ', ''))


def _parse_area(text):
	"""Convert an area label (a number followed by an 'm...' unit) to a float.

	A decimal comma is accepted, and the number may or may not be separated
	from the unit by whitespace.
	Raises ValueError if the label contains no 'm' or no parsable number.
	"""
	return float(text[:text.index('m')].strip().replace(',', '.'))


def _parse_district(text):
	"""Extract the district from a '<city>, <district> ...' location label.

	Returns 'Unknown' when the label has no comma or nothing after it.
	"""
	if ',' not in text:
		return 'Unknown'
	# Split on the comma instead of skipping a fixed number of characters, so
	# the result does not depend on the length of the city name.
	district = text.split(',', 1)[1].strip().partition(' ')[0]
	if not district:
		return 'Unknown'
	return TWO_WORD_DISTRICTS.get(district, district)


def _parse_card(card):
	"""Return [title, price, sq_footage, district] for one listing card, or
	None when an expected element is missing or its text cannot be parsed."""
	try:
		title = _shorten_title(card.find('h6').text)
		price = _parse_price(card.find('p').text)
		sq_footage = _parse_area(card.find('span', {'class': 'css-643j0o'}).text)
		district = _parse_district(card.find('p', {'class': 'css-veheph er34gjf0'}).text)
	except (AttributeError, ValueError):
		# AttributeError: find() returned None (element absent from the card).
		# ValueError: marker character not found or number not parsable.
		return None
	return [title, price, sq_footage, district]


# Collect rows in a plain list and build the frame once at the end; this also
# makes the cell safe to re-run, since `df` is rebuilt instead of appended to.
records = []
for page_index in range(pages):
	next_url = url + str(page_index + 1)
	try:
		response = requests.get(next_url, timeout=REQUEST_TIMEOUT)
	except requests.RequestException as error:
		print('Skipping {}: {}'.format(next_url, error))
		continue

	# Only parse pages that were actually fetched; otherwise cards from the
	# previous page would be processed a second time.
	if response.status_code != 200:
		print('Skipping {}: HTTP {}'.format(next_url, response.status_code))
		continue

	soup = BeautifulSoup(response.text, 'html.parser')
	for card in soup.find_all('div', {'data-cy': 'l-card'}):
		record = _parse_card(card)
		if record is not None:
			records.append(record)

df = pd.DataFrame(records, columns=columns)

In [106]:
# Preview the first rows of the scraped listings.
df.head()

Unnamed: 0,Title,Price,Sq_Footage,District
0,Nowe 4 pokoje_12 min od Centrum_Możli...,532790.0,60.6,Unknown
1,Mieszkanie bezczynszowe kup za 50% wa...,399000.0,75.0,Młyniska
2,Nadmorski Apartament MILA BALTICA Wys...,1499000.0,59.0,Unknown
3,4-pok. mieszkanie 90m² z ogrodem - 20...,399000.0,90.0,Chełm
4,Mieszkanie 75m2 - 4pokoje - 20 min. d...,399000.0,75.0,Chełm


In [107]:
# Distinct values of the District column, in order of first appearance.
unique_districts = df.loc[:, 'District'].unique()
unique_districts

array(['Unknown', 'Młyniska', 'Chełm', 'Wrzeszcz', 'Nowy Port',
       'Wyspa Sobieszewska', 'Letnica', 'Jasień', 'Osowa', 'Brzeźno',
       'Brętowo', 'Ujeścisko', 'Orunia', 'Śródmieście', 'Żabianka'],
      dtype=object)

In [108]:
# Assign each distinct district a consecutive integer code (0, 1, 2, ...)
# following its position in `unique_districts`.
district_mapping = dict(zip(unique_districts, range(len(unique_districts))))
district_mapping

{'Unknown': 0,
 'Młyniska': 1,
 'Chełm': 2,
 'Wrzeszcz': 3,
 'Nowy Port': 4,
 'Wyspa Sobieszewska': 5,
 'Letnica': 6,
 'Jasień': 7,
 'Osowa': 8,
 'Brzeźno': 9,
 'Brętowo': 10,
 'Ujeścisko': 11,
 'Orunia': 12,
 'Śródmieście': 13,
 'Żabianka': 14}

In [109]:
# Replace district names with their integer codes from `district_mapping`.
# The guard makes this cell safe to re-run: once the column holds codes, none
# of its values are keys of the mapping, and mapping it again would turn every
# entry into NaN.
if df['District'].isin(list(district_mapping)).any():
	df['District'] = df['District'].map(district_mapping)
df.head()

Unnamed: 0,Title,Price,Sq_Footage,District
0,Nowe 4 pokoje_12 min od Centrum_Możli...,532790.0,60.6,0
1,Mieszkanie bezczynszowe kup za 50% wa...,399000.0,75.0,1
2,Nadmorski Apartament MILA BALTICA Wys...,1499000.0,59.0,0
3,4-pok. mieszkanie 90m² z ogrodem - 20...,399000.0,90.0,2
4,Mieszkanie 75m2 - 4pokoje - 20 min. d...,399000.0,75.0,2
