# Developing data structure

In [101]:
import json
import pandas as pd
import numpy as np
import os

In [121]:
# Functions needed for cleansing data and extracting structured data from lists of dictionaries
# Function to keep just numeric values and format them as float
def clean_prices(price):
    """Extract the numeric part of a price string and return it as a float.

    Only decimal digits and '.' characters of ``price`` are kept; the
    remaining text is converted with ``float``.

    Args:
        price: Raw price text, e.g. ``'450 €'``.

    Returns:
        The price as a float, or ``None`` when ``price`` holds no digits or
        the kept characters do not form a valid number.
    """
    # str.isdecimal() is used instead of str.isnumeric(): the latter also
    # accepts characters such as '²' or '½', which float() cannot parse.
    kept = ''.join(char for char in price if char.isdecimal() or char == '.')
    # Checking if numbers are found in the string
    if not any(char.isdecimal() for char in kept):
        return None
    try:
        return float(kept)
    except ValueError:
        # Leftovers such as '1.2.3' are not a parseable number
        return None
# Function to drop newline, tab and carriage-return characters and trim both ends
def clean_string(string: str):
    """Return ``string`` without newline, tab or carriage-return characters and without leading/trailing whitespace."""
    # One translation table deletes the three control characters in one pass
    without_breaks = string.translate(str.maketrans('', '', '\n\t\r'))
    return without_breaks.strip()

# Functions for questioning data
# Question: Room size
def room_size(data):
    """Classify the room size from the listing's ``room_info`` tags.

    Args:
        data: Listing dict; ``data['room_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``'small'``, ``'medium'`` or ``'large'`` for the tags 'pequeña',
        'mediana' and 'grande' (checked in that order), or ``None`` when no
        size tag is present.
    """
    # A missing tag list (None) is treated as empty so the membership
    # tests below do not raise TypeError.
    item = data['room_info'] or []
    sizes = {'pequeña': 'small', 'mediana': 'medium', 'grande': 'large'}
    for tag, label in sizes.items():
        if tag in item:
            return label
    return None
    
# Question: Furnished
def furnished(data):
    """Tell whether the room is advertised as furnished.

    Args:
        data: Listing dict; ``data['room_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'amueblada' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['room_info'] or []
    return 'amueblada' in item

# Question: Private bathroom
def private_bathroom(data):
    """Tell whether the room has its own bathroom.

    Args:
        data: Listing dict; ``data['room_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when a private-bathroom tag is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['room_info'] or []
    # The scraped tag is 'baño propio'; 'baño privado' (the wording checked
    # before) is still accepted for backward compatibility.
    return any(tag in item for tag in ('baño propio', 'baño privado'))

# Question: Individual or double
def individual_or_double(data):
    """Tell whether the room is individual (``True``) or double (``False``).

    Args:
        data: Listing dict; ``data['room_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'individual' is present or when neither tag is
        given (individual is assumed by default); ``False`` when only
        'doble' is present.
    """
    # A missing tag list (None) counts as "not specified" instead of raising
    item = data['room_info'] or []
    if 'individual' in item:
        return True
    # If not specified, we assume it is individual
    return 'doble' not in item
    
# Question: Room position
def room_position(data):
    """Classify the room as exterior or interior from its ``room_info`` tags.

    Args:
        data: Listing dict; ``data['room_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``'to be defined'`` for the combined interior/exterior tag,
        ``'exterior'`` or ``'interior'`` for the plain or adapted tags, and
        ``None`` when no position tag is present.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['room_info'] or []
    # The combined tag is tested first so it can never be shadowed by the
    # plain 'interior'/'exterior' checks. The scraped spelling is the plural
    # 'adaptados'; the singular form is kept for backward compatibility.
    if ('interior y exterior adaptados' in item) or ('interior y exterior adaptado' in item):
        return 'to be defined'
    if ('exterior' in item) or ('exterior adaptado' in item):
        return 'exterior'
    if ('interior' in item) or ('interior adaptado' in item):
        return 'interior'
    return None

# Flat info questions
# Rooms available
def rooms_available(data):
    """Read the number of rooms from the listing's ``flat_info`` tags.

    The count is the element that immediately precedes the 'hab' marker.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        The room count as an int, or ``None`` when the marker is missing,
        has no preceding element, or that element is not an integer.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    if 'hab' not in item:
        return None
    # Get 'hab' position and read the previous item in the list
    position = item.index('hab')
    if position == 0:
        # Nothing precedes the marker; item[-1] would wrap to the list end
        return None
    try:
        return int(item[position - 1])
    except ValueError:
        return None
# Flat size
def flat_size(data):
    """Read the flat size from the listing's ``flat_info`` tags.

    The size is the element that immediately precedes the 'm' marker.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        The size as a float, or ``None`` when the marker is missing, has no
        preceding element, or that element is not a number.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    if 'm' not in item:
        return None
    # Get 'm' position and read the previous item in the list
    position = item.index('m')
    if position == 0:
        # Nothing precedes the marker; item[-1] would wrap to the list end
        return None
    try:
        return float(item[position - 1])
    except ValueError:
        return None

# Number of bathrooms
def number_of_bathrooms(data):
    """Read the number of bathrooms from the listing's ``flat_info`` tags.

    The count is the element that immediately precedes the 'baño' marker,
    or the 'baños' marker when 'baño' is absent.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        The bathroom count as an int, or ``None`` when no marker is present,
        the marker has no preceding element, or that element is not an
        integer.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    for marker in ('baño', 'baños'):
        if marker in item:
            # Get the marker position; the count is the previous item
            position = item.index(marker)
            break
    else:
        return None
    if position == 0:
        # Nothing precedes the marker; item[-1] would wrap to the list end
        return None
    try:
        return int(item[position - 1])
    except ValueError:
        return None
    
# Elevator available
def elevator(data):
    """Tell whether the flat lists an elevator.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'ascensor' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'ascensor' in item

# Pool
def pool(data):
    """Tell whether the flat lists a private or shared pool.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when 'piscina propia' or 'piscina comunitaria' is present,
        ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return ('piscina propia' in item) or ('piscina comunitaria' in item)

# Terrace
def terrace(data):
    """Tell whether the flat lists a garden/terrace.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the garden/terrace tag is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    # The scraped tag carries an accent ('jardín/terraza'); the unaccented
    # spelling checked before is still accepted for backward compatibility.
    return ('jardín/terraza' in item) or ('jardin/terraza' in item)

# Washer machine
def washer_machine(data):
    """Tell whether the flat lists a washing machine.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'lavadora' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'lavadora' in item

# Dryer machine
def dryer_machine(data):
    """Tell whether the flat lists a dryer.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'secadora' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'secadora' in item
    
# Air conditioning
def air_conditioning(data):
    """Tell whether the flat lists air conditioning.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'aire acond.' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'aire acond.' in item

# Heating
def heating(data):
    """Tell whether the flat lists heating.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'calefacción' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'calefacción' in item

# Internet
def internet(data):
    """Tell whether the flat lists an internet connection.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when 'internet' or 'wifi' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return ('internet' in item) or ('wifi' in item)

# TV
def tv(data):
    """Tell whether the flat lists a TV.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'tv' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'tv' in item

# storage
def storage(data):
    """Tell whether the flat lists a storage room.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``True`` when the tag 'trastero' is present, ``False`` otherwise.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    return 'trastero' in item

# Flat condition
def flat_condition(data):
    """Classify the flat's condition from its ``flat_info`` tags.

    Args:
        data: Listing dict; ``data['flat_info']`` is a list of tags and may
            be ``None`` when the listing has none.

    Returns:
        ``'good'``, ``'new'``, ``'refurbished'`` or ``'to be refurbished'``
        for the first matching tag (checked in that order), or ``None`` when
        no condition tag is present.
    """
    # A missing tag list (None) counts as "no tags" instead of raising
    item = data['flat_info'] or []
    conditions = {
        'en buen estado': 'good',
        'a estrenar': 'new',
        'reformado': 'refurbished',
        'a reformar': 'to be refurbished',
    }
    for tag, label in conditions.items():
        if tag in item:
            return label
    return None
    
# Flat rules

# Pets allowed
def pets_allowed(data):
    """Tell whether the listing's rules allow pets.

    Args:
        data: Listing dict; ``data['rules']`` is a list of rule texts and
            may be ``None`` when the listing has none.

    Returns:
        ``True`` when pets are accepted, ``False`` when they are not, and
        ``None`` when the rules do not mention pets.
    """
    # A missing rule list (None) counts as "not specified" instead of raising
    item = data['rules'] or []
    # The scraped rules use the verb 'aceptan'; the 'admiten' wording
    # checked before is still accepted for backward compatibility.
    # The negative rule is tested first because it contains the positive
    # wording as a suffix.
    if ('no se aceptan mascotas' in item) or ('no se admiten mascotas' in item):
        return False
    if ('se aceptan mascotas' in item) or ('se admiten mascotas' in item):
        return True
    return None

# Couples allowed
def couples_allowed(data):
    """Tell whether the listing's rules allow couples.

    Args:
        data: Listing dict; ``data['rules']`` is a list of rule texts and
            may be ``None`` when the listing has none.

    Returns:
        ``True`` when couples are allowed, ``False`` when they are not, and
        ``None`` when the rules do not mention couples.
    """
    # A missing rule list (None) counts as "not specified" instead of raising
    item = data['rules'] or []
    # The scraped rules use the verb 'permiten'; the 'admiten' wording
    # checked before is still accepted for backward compatibility.
    # The negative rule is tested first because it contains the positive
    # wording as a suffix.
    if ('no se permiten parejas' in item) or ('no se admiten parejas' in item):
        return False
    if ('se permiten parejas' in item) or ('se admiten parejas' in item):
        return True
    return None
    
# Smoking allowed
def smoking_allowed(data):
    """Tell whether the listing's rules allow smoking.

    Args:
        data: Listing dict; ``data['rules']`` is a list of rule texts and
            may be ``None`` when the listing has none.

    Returns:
        ``True`` when smoking is allowed, ``False`` when it is not, and
        ``None`` when the rules do not mention smoking.
    """
    # A missing rule list (None) counts as "not specified" instead of raising
    item = data['rules'] or []
    if 'se permite fumar' in item:
        return True
    if 'no se permite fumar' in item:
        return False
    return None

In [103]:
# os.listdir returns entries in arbitrary order; sorting makes the file
# picked by index reproducible between runs and machines
files = sorted(os.listdir('include/pisocompartido/raw'))

In [104]:
# Work on the first entry of the directory listing
file = files[0]

In [105]:
# Load the raw records; the encoding is set explicitly so accented text is
# decoded the same way regardless of the platform's default encoding
with open(f'include/pisocompartido/raw/{file}', encoding='utf-8') as f:
    raw_data = json.load(f)

In [106]:
# Preview the raw structure: pretty-print the first 10 records, each followed by a blank line
for data in raw_data[:10]:
    print(json.dumps(data, indent=4), end='\n\n')

{
    "title": "habitaci\u00f3n en alquiler en raval-santa rosa-safaretjos",
    "price": "450 \u20ac",
    "address": "c/ pirineus n\u00fam. 104,                                                                                                                              raval-santa rosa-safaretjos,                                                                                                                             santa coloma de gramenet",
    "lat": "41.4464 ",
    "lon": "2.2184794",
    "description": "alquilo una habitaci\u00f3n individual para una persona preferiblemente chica que sea ordenada y limpia interesadas me pueden contactar",
    "room_info": [
        "",
        "mediana",
        "",
        "exterior"
    ],
    "flat_info": [
        "",
        "1",
        "hab",
        "",
        "1",
        "ba\u00f1o",
        "",
        "32",
        "m",
        "",
        "en buen estado"
    ],
    "conditions": [],
    "rules": [
        "",
        "no se ace

In [107]:
# Initializing new data structure: list that receives one cleaned dict per raw record
clean_data = []

In [108]:
# Normalise empty-string values to None, in place
for data in raw_data:
    for key, value in data.items():
        if value == '':
            data[key] = None

In [109]:
# First formatting pass: cleansing text fields and converting price and
# coordinates to numeric values
def _to_float(value):
    """Return ``value`` as a float, or NaN when it is missing or not a number."""
    if value is None:
        return np.nan
    try:
        # float() tolerates surrounding whitespace such as '41.4464 '
        return float(value)
    except (TypeError, ValueError):
        return np.nan


def _clean_list(values):
    """Return the cleaned strings of ``values``, or an empty list when there are none."""
    # An empty list (not None) keeps later loops and membership tests working
    return [clean_string(value) for value in values] if values else []


for data in raw_data:
    # Trace of the raw record being processed
    print(data)
    price = clean_prices(data['price']) if data['price'] else None
    clean_data.append({
        'title': clean_string(data['title']) if data['title'] else None,
        # Missing and unparseable prices are both stored as NaN
        'price': np.nan if price is None else price,
        'address': clean_string(data['address']) if data['address'] else None,
        'latitude': _to_float(data['lat']),
        'longitude': _to_float(data['lon']),
        'description': clean_string(data['description']) if data['description'] else None,
        'room_info': _clean_list(data['room_info']),
        'flat_info': _clean_list(data['flat_info']),
        'rules': _clean_list(data['rules']),
    })

{'title': 'habitación en alquiler en raval-santa rosa-safaretjos', 'price': '450 €', 'address': 'c/ pirineus núm. 104,                                                                                                                              raval-santa rosa-safaretjos,                                                                                                                             santa coloma de gramenet', 'lat': '41.4464 ', 'lon': '2.2184794', 'description': 'alquilo una habitación individual para una persona preferiblemente chica que sea ordenada y limpia interesadas me pueden contactar', 'room_info': ['', 'mediana', '', 'exterior'], 'flat_info': ['', '1', 'hab', '', '1', 'baño', '', '32', 'm', '', 'en buen estado'], 'conditions': [], 'rules': ['', 'no se aceptan mascotas', '', 'no se permiten parejas', '', 'no se permite fumar']}
{'title': 'habitación en alquiler en la creu de barberà', 'price': '300 €', 'address': 'c/ reis catòlics núm. 127,                           

# Structuring data

## Room info

The next steps check which unique items `room_info` contains, to see what information it provides.

In [110]:
# Collect the distinct, non-empty room_info tags in first-seen order.
# dict.fromkeys keeps insertion order and removes duplicates.
room_info_items = list(dict.fromkeys(
    item
    for data in clean_data
    # room_info can be None for a listing without tags; treat it as empty
    for item in (data['room_info'] or [])
    if item != ''
))

In [111]:
# Show every distinct room_info tag next to its position
for position, tag in enumerate(room_info_items):
    print(f'{position} {tag}')

0 mediana
1 exterior
2 amueblada
3 individual
4 grande
5 doble
6 llave propia
7 balcón propio
8 pequeña
9 baño propio
10 interior y exterior adaptados
11 exterior adaptado
12 interior adaptado


## What was seen in raw data for room info:

1) Room size
2) Room location: internal or external
3) Furnished
4) Individual or double
5) Personal key
6) private bathroom
7) private balcony

In [112]:
# Asking questions to raw data: one derived column per room_info question
room_questions = {
    'room_size': room_size,
    'furnished': furnished,
    'private_bathroom': private_bathroom,
    'individual_or_double': individual_or_double,
    'room_position': room_position,
}

for data in clean_data:
    # Each answer is stored on the record under its column name
    for column, question in room_questions.items():
        data[column] = question(data)

## Flat info

As before, the same process is applied to the flat info.



In [113]:
# Collect the distinct, non-empty flat_info tags in first-seen order.
# dict.fromkeys keeps insertion order and removes duplicates.
flat_info_items = list(dict.fromkeys(
    item
    for data in clean_data
    # flat_info can be None for a listing without tags; treat it as empty
    for item in (data['flat_info'] or [])
    if item != ''
))

In [114]:
# Show every distinct flat_info tag next to its position
for position, tag in enumerate(flat_info_items):
    print(f'{position} {tag}')

0 1
1 hab
2 baño
3 32
4 m
5 en buen estado
6 90
7 60
8 reformado
9 2
10 baños
11 120
12 104
13 8
14 0
15 80
16 10
17 4
18 95
19 lavadora
20 ascensor
21 wifi
22 tv
23 tendedero
24 plancha
25 3
26 54
27 75
28 balcón
29 85
30 77
31 aire acond.
32 jardín/terraza
33 100
34 trastero
35 200
36 piscina propia
37 secadora
38 65
39 105
40 114
41 67
42 a estrenar
43 55
44 150
45 calefacción
46 25
47 115
48 110
49 63
50 70
51 107
52 78
53 130
54 15
55 56
56 79
57 82
58 piscina comunitaria
59 87
60 109
61 9
62 5
63 145
64 6
65 180
66 68
67 140
68 66
69 125
70 40
71 88
72 155
73 50
74 74
75 73
76 230
77 12
78 127
79 175
80 225
81 52
82 7
83 98
84 160
85 93
86 92
87 83
88 64
89 270
90 300
91 315
92 30
93 164
94 135
95 165
96 11
97 76
98 400
99 72
100 42
101 179
102 a reformar
103 18
104 53
105 170
106 89
107 136
108 112
109 250
110 118
111 500
112 260
113 240
114 220
115 190
116 84
117 47
118 91
119 275
120 16
121 22
122 20
123 460
124 119
125 187
126 226
127 117
128 142
129 35
130 206
131 133
132 14

## What was seen in raw data for flat info

1) Flat size
2) Number of bathrooms
3) Rooms available
4) Flat conditions
5) Wifi
6) Washing machine
7) Dryer
8) Pool
9) Heater
10) iron
11) A/C
12) yard/rooftop
13) Elevator
14) TV
15) Storage

In [115]:
# Asking questions to raw data for flat info: one derived column per question
flat_questions = {
    'rooms_available': rooms_available,
    'flat_size': flat_size,
    'number_of_bathrooms': number_of_bathrooms,
    'elevator': elevator,
    'pool': pool,
    'terrace': terrace,
    'washer_machine': washer_machine,
    'dryer_machine': dryer_machine,
    'air_conditioning': air_conditioning,
    'heating': heating,
    'internet': internet,
    'tv': tv,
    'storage': storage,
    'flat_condition': flat_condition,
}

for data in clean_data:
    # Each answer is stored on the record under its column name
    for column, question in flat_questions.items():
        data[column] = question(data)

## Rules info

In this section rules will be checked and the options will be:
True: Accepted
False: Not accepted
None: Not specified

In [117]:
# Getting all unique, non-empty rules in first-seen order.
# dict.fromkeys keeps insertion order and removes duplicates.
rules = list(dict.fromkeys(
    rule
    for data in clean_data
    # rules can be None for a listing without rules; treat it as empty
    for rule in (data['rules'] or [])
    if rule != ''
))

In [118]:
# Show every distinct rule next to its position
for position, rule_text in enumerate(rules):
    print(f'{position} {rule_text}')

0 no se aceptan mascotas
1 no se permiten parejas
2 no se permite fumar
3 se aceptan mascotas
4 se permiten parejas
5 se permite fumar


## What rules users specify to rent:

1) Pets
2) Couples acceptance
3) Smoking


In [122]:
# Asking questions to raw data for rules: one derived column per question
rule_questions = {
    'pets_allowed': pets_allowed,
    'couples_allowed': couples_allowed,
    'smoking_allowed': smoking_allowed,
}

for data in clean_data:
    # Each answer is stored on the record under its column name
    for column, question in rule_questions.items():
        data[column] = question(data)

## Cleaning last items

Here the intermediate items that are no longer necessary are removed to produce a single, clean table.

In [124]:
# Build the final table and leave out the raw tag lists
helper_columns = ['room_info', 'flat_info', 'rules']
data = pd.DataFrame(clean_data).drop(columns=helper_columns)

In [127]:
# Adding some useful information to track the original data
# Name of the raw file this table was built from
data['original_file'] = file
# Second-to-last '_'-separated token of the file name, with any '.json'
# text removed.
# NOTE(review): presumably that token is the scrape date — confirm against
# the raw file naming scheme; a name without '_' raises IndexError here.
data['date'] = file.split('_')[-2].replace('.json', '')

In [129]:
# Saving data as csv in processed folder
processed_dir = 'include/pisocompartido/processed'
# Create the output folder when it does not exist yet so to_csv cannot fail on it
os.makedirs(processed_dir, exist_ok=True)
# os.path.splitext removes only the final extension, so a file name that
# contains extra dots keeps its full stem (split('.')[0] truncated it)
csv_name = os.path.splitext(file)[0] + '.csv'
data.to_csv(f'{processed_dir}/{csv_name}', index=False)