# Обработка загруженной информации о предложениях

В этом ноутбуке содержится код обработки данных о предложениях, полученных в результате запуска функций, описанных в ноутбуке `offers_grabber`. Загрузка производилась тремя частями: во время первой, в марте 2020 года, были получены данные о предложениях от застройщиков, а во время двух других, в конце мая 2020 года, — данные о новых предложениях от застройщиков и о предложениях от агентов.

Целью описываемых здесь функций является подготовка данных к загрузке в ClickHouse.

In [2]:
import numpy as np
import pandas as pd

In [121]:
# Function for fixing small mistakes in the data after parsing
def _clean_complex_name(raw_name):
    """Strip the 'ЖК «...»' wrapper from a complex name; non-strings pass through."""
    if not isinstance(raw_name, str):
        return raw_name
    return raw_name.replace('ЖК «', '').replace('»', '')


def _parse_rooms_count(raw_rooms):
    """Return the leading ASCII digit of ``raw_rooms`` as int, or 0 if there is none."""
    first_char = str(raw_rooms)[:1]
    return int(first_char) if '0' <= first_char <= '9' else 0


def _parse_minutes_to_subway(raw_time):
    """Return the integer before the first space of ``raw_time``, or -1 if missing."""
    if str(raw_time) == 'nan' or pd.isna(raw_time):
        return -1
    return int(str(raw_time).split(' ')[0])


def _match_complex(complex_name, known_complexes, cache):
    """Return ``(complex_id, builder_id)`` of the first known complex whose
    lower-cased name contains ``complex_name``; ``(-1, -1)`` if none does.

    ``cache`` maps already resolved lower-cased names to their result so the
    reference list is scanned only once per distinct name.
    """
    if not isinstance(complex_name, str):
        return -1, -1
    needle = complex_name.lower()
    # A blank name is a substring of every reference name and would otherwise
    # be attached to the first complex in the list.
    if not needle.strip():
        return -1, -1
    if needle not in cache:
        cache[needle] = next(
            ((complex_id, builder_id)
             for known_name, complex_id, builder_id in known_complexes
             if needle in known_name),
            (-1, -1),
        )
    return cache[needle]


def process_offers_info(df_new_complexes, df_complexes, is_dev):
    """Normalise parsed offers and attach complex and builder identifiers.

    ``df_new_complexes`` is modified in place: missing ``complex_deadline``
    values become "Сдан", the numeric columns are converted, ``complex_name``
    is cleaned, and the columns ``rooms_cnt``, ``complex_id``, ``builder_id``,
    ``minutes_to_subway``, ``offer_id`` and ``is_developers_offer`` are added.

    Args:
        df_new_complexes: offers frame with the columns ``square``, ``floor``,
            ``total_floors``, ``total_price``, ``price_for_meter``, ``rooms``,
            ``complex_name``, ``time_to_subway``, ``offer_link``,
            ``complex_deadline``, ``nearest_subway`` and ``is_walk``.
        df_complexes: reference frame with the columns ``complex_name``,
            ``complex_id`` and ``builder_id``.
        is_dev: truthy for developers' offers, falsy for agents' offers. It
            only sets ``is_developers_offer`` on ``df_new_complexes``; that
            column is not part of the returned frame.

    Returns:
        A new DataFrame holding only the offers whose complex was found in
        ``df_complexes`` (original row labels are kept), with the columns
        ``offer_id``, ``complex_id``, ``builder_id``, ``rooms_cnt``,
        ``square``, ``floor``, ``total_floors``, ``total_price``,
        ``price_for_meter``, ``nearest_subway``, ``minutes_to_subway``,
        ``is_walk``.

    Raises:
        ValueError: if a numeric field or the offer id cannot be parsed.
    """
    n_offers = len(df_new_complexes)

    # If the field "complex_deadline" is empty, it means that complex is already built
    # Fill Nan values in this column with values "Сдан"
    df_new_complexes['complex_deadline'] = df_new_complexes['complex_deadline'].fillna("Сдан")

    # Reference complexes, lower-cased once instead of once per offer.
    known_complexes = [
        (name.lower(), complex_id, builder_id)
        for name, complex_id, builder_id in zip(
            df_complexes['complex_name'].tolist(),
            df_complexes['complex_id'].tolist(),
            df_complexes['builder_id'].tolist(),
        )
        if isinstance(name, str)
    ]

    # Everything is parsed into fresh lists before any column is written, so
    # a parsing error does not leave the frame half-converted and nothing is
    # written into the (possibly read-only) arrays backing the frame.
    # str() lets the square column be either text ("42,6") or already numeric.
    squares = [float(str(v).replace(',', '.')) for v in df_new_complexes['square'].tolist()]
    floors = [int(v) for v in df_new_complexes['floor'].tolist()]
    total_floors = [int(v) for v in df_new_complexes['total_floors'].tolist()]
    total_prices = [int(v) for v in df_new_complexes['total_price'].tolist()]
    prices_for_meter = [int(v) for v in df_new_complexes['price_for_meter'].tolist()]
    # offer_id is the path segment right before the trailing slash of the link
    offer_ids = [int(link.split('/')[-2]) for link in df_new_complexes['offer_link'].tolist()]
    rooms_numb = [_parse_rooms_count(v) for v in df_new_complexes['rooms'].tolist()]
    minutes_to_sub_arr = [
        _parse_minutes_to_subway(v) for v in df_new_complexes['time_to_subway'].tolist()
    ]
    complex_names = [_clean_complex_name(v) for v in df_new_complexes['complex_name'].tolist()]

    match_cache = {}
    matches = [_match_complex(name, known_complexes, match_cache) for name in complex_names]
    new_complexes_ids = [complex_id for complex_id, _ in matches]
    new_builders_ids = [builder_id for _, builder_id in matches]

    df_new_complexes['square'] = squares
    df_new_complexes['floor'] = floors
    df_new_complexes['total_floors'] = total_floors
    df_new_complexes['total_price'] = total_prices
    df_new_complexes['price_for_meter'] = prices_for_meter
    df_new_complexes['complex_name'] = complex_names
    df_new_complexes['rooms_cnt'] = rooms_numb
    df_new_complexes['complex_id'] = new_complexes_ids
    df_new_complexes['builder_id'] = new_builders_ids
    df_new_complexes['minutes_to_subway'] = minutes_to_sub_arr
    df_new_complexes['offer_id'] = offer_ids

    # Add new parameter - is this a developers offer (0 for agents' offers and 1 for developers)
    df_new_complexes['is_developers_offer'] = np.ones(n_offers) if is_dev else np.zeros(n_offers)

    # Keep only offers whose complex is present in df_complexes. A positional
    # boolean mask is used because DataFrame.drop() interprets its argument as
    # index labels, which is wrong for any frame without a default RangeIndex.
    is_known_complex = np.array([complex_id != -1 for complex_id in new_complexes_ids], dtype=bool)
    output_columns = ['offer_id', 'complex_id', 'builder_id', 'rooms_cnt', 'square', 'floor',
                      'total_floors', 'total_price', 'price_for_meter', 'nearest_subway',
                      'minutes_to_subway', 'is_walk']

    return df_new_complexes.loc[is_known_complex, output_columns]

## Загрузка информации о жилых комплексах

In [122]:
# Reference table of residential complexes, read from a comma-separated file
df_complexes = pd.read_csv(
    'complexes_info.csv',
    sep=',',
)
df_complexes

Unnamed: 0,complex_id,complex_name,name_yandex,longitude,latitude,builder_id
0,96,Южная долина,ЖК Южная долина,30.642199,59.985329,8093
1,44193,Орловский парк,Орловский парк,30.269371,60.056063,9
2,25135,Дом на Космонавтов,ЖК Дом на Космонавтов,30.352259,59.839722,8110
3,14272,Цвета Радуги,ЖК Цвета Радуги,30.446569,60.069298,8115
4,38206,Жемчужный берег,ЖК Жемчужный Берег,30.151388,59.857277,1608
...,...,...,...,...,...,...
490,14847,Мендельсон,ЖК Мендельсон,30.282642,59.961807,1607
491,36510,Приоритет,Клубный дом Приоритет,30.350351,59.949722,9144
492,5715,Квартет,ЖК Квартет,30.371153,59.830730,8114
493,46598,Девятый Вал 2,ЖК Девятый вал,30.432632,60.049362,1245


## Обработка данных с предложениями от агентов

In [123]:
# Load the raw agents' offers (semicolon-separated file)
df_agent_offers = pd.read_csv(
    "agents_offers_upd.csv",
    sep=';',
)
# df_agent_offers  # uncomment to preview the raw frame

In [124]:
# Clean the agents' offers; is_dev=0 flags them as agents' (non-developer) offers
df_agent_offers = process_offers_info(
    df_agent_offers,
    df_complexes,
    is_dev=0,
)
df_agent_offers

Unnamed: 0,offer_id,complex_id,builder_id,rooms_cnt,square,floor,total_floors,total_price,price_for_meter,nearest_subway,minutes_to_subway,is_walk
0,229237248,96,8093,1,42.60,9,9,3151000,73967,Ладожская,15,0
1,223339377,96,8093,1,43.00,9,9,3400000,79070,Ладожская,30,0
2,229803237,96,8093,1,45.60,4,9,3490000,76535,Ладожская,19,0
3,227843106,20631,9117,1,35.00,2,3,2800000,80000,Ладожская,45,0
4,226886389,7723,2573,2,68.98,3,6,6495000,94158,Девяткино,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
19498,232418738,5673,5558,1,50.00,13,25,5600000,112000,Парнас,15,1
19499,232130909,8816,8077,1,42.10,4,11,8999999,213777,Площадь Восстания,8,1
19500,228376399,8764,8077,2,59.30,5,9,12900000,217538,Площадь Восстания,8,1
19501,222533458,8764,8077,3,106.30,4,9,26490000,249200,Площадь Восстания,5,1


In [128]:
# Store the processed agents' offers without the index column
df_agent_offers.to_csv(
    'agent_offers_processed.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)

## Обработка данных с предложениями от застройщиков, часть 1

In [129]:
# Load the first part of the developers' offers (semicolon-separated, UTF-8)
df_dev_offers = pd.read_csv(
    "all_flats_from_dev_upd.csv",
    sep=';',
    encoding='utf-8',
)
# df_dev_offers  # uncomment to preview the raw frame

In [130]:
# Clean the first part of the developers' offers; is_dev=1 flags them as developers' offers
df_dev_offers = process_offers_info(
    df_dev_offers,
    df_complexes,
    is_dev=1,
)
df_dev_offers

Unnamed: 0,offer_id,complex_id,builder_id,rooms_cnt,square,floor,total_floors,total_price,price_for_meter,nearest_subway,minutes_to_subway,is_walk
0,227902154,96,8093,2,68.5,1,9,4360000,63650,Ладожская,19,0
1,227357682,96,8093,3,90.2,1,9,5742000,63659,Ладожская,19,0
2,223165375,96,8093,2,68.9,1,9,4386000,63657,Ладожская,19,0
3,223165368,96,8093,2,69.7,1,9,4437000,63659,Ладожская,19,0
4,223165358,96,8093,2,79.9,2,9,5256000,65782,Ладожская,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...
42763,229664835,7430,8110,1,43.0,3,25,5391553,125385,Старая Деревня,4,0
42764,229664830,7430,8110,2,68.8,17,25,8800000,127907,Старая Деревня,4,0
42765,229664876,7430,8110,3,76.5,8,25,9700000,126797,Старая Деревня,4,0
42766,229664859,7430,8110,2,94.4,20,25,14924640,158100,Старая Деревня,4,0


In [131]:
# Store the processed first part of the developers' offers without the index column
df_dev_offers.to_csv(
    'dev_offers_p1_processed.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)

## Обработка данных с предложениями от застройщиков, часть 2

In [132]:
# Load the second part of the developers' offers (semicolon-separated file)
df_dev_offers2 = pd.read_csv(
    "devs_offers_new_info.csv",
    sep=';',
)
# df_dev_offers2  # uncomment to preview the raw frame

In [133]:
# Clean the second part of the developers' offers; is_dev=1 flags them as developers' offers
df_dev_offers2 = process_offers_info(
    df_dev_offers2,
    df_complexes,
    is_dev=1,
)
df_dev_offers2

Unnamed: 0,offer_id,complex_id,builder_id,rooms_cnt,square,floor,total_floors,total_price,price_for_meter,nearest_subway,minutes_to_subway,is_walk
0,223165375,96,8093,2,68.9,1,9,4386000,63657,Ладожская,19,0
1,227357682,96,8093,3,90.2,1,9,5742000,63659,Ладожская,19,0
2,223165368,96,8093,2,69.7,1,9,4437000,63659,Ладожская,19,0
3,223165363,96,8093,3,90.2,6,9,5938000,65831,Ладожская,19,0
4,223165366,96,8093,3,90.2,8,9,5933000,65776,Ладожская,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...
32020,231989004,39050,11848,3,92.5,4,4,11110000,120108,Проспект Большевиков,5,0
32021,231988978,39050,11848,3,92.5,4,4,11110000,120108,Проспект Большевиков,5,0
32022,231988997,39050,11848,4,95.6,4,4,11242000,117594,Проспект Большевиков,5,0
32023,231988927,39050,11848,4,95.6,4,4,11286000,118054,Проспект Большевиков,5,0


In [134]:
# Store the processed second part of the developers' offers without the index column
df_dev_offers2.to_csv(
    'dev_offers_p2_processed.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)

## Избавление от выбросов и формирование итогового датасета

In [135]:
# Processed parts in the order they are stacked: developers (parts 1 and 2), then agents
offers_df_lst = [
    df_dev_offers,
    df_dev_offers2,
    df_agent_offers,
]

In [136]:
# Stack all processed parts row-wise; each part keeps its own row labels.
# NOTE(review): the displayed outputs of developers' parts 1 and 2 share
# offer_ids (e.g. 223165375), so the combined frame may contain duplicate
# offers - confirm whether they should be de-duplicated before export.
final_offers_df = pd.concat(offers_df_lst, axis=0)

In [137]:
# Preview the combined offers frame
final_offers_df

Unnamed: 0,offer_id,complex_id,builder_id,rooms_cnt,square,floor,total_floors,total_price,price_for_meter,nearest_subway,minutes_to_subway,is_walk
0,227902154,96,8093,2,68.5,1,9,4360000,63650,Ладожская,19,0
1,227357682,96,8093,3,90.2,1,9,5742000,63659,Ладожская,19,0
2,223165375,96,8093,2,68.9,1,9,4386000,63657,Ладожская,19,0
3,223165368,96,8093,2,69.7,1,9,4437000,63659,Ладожская,19,0
4,223165358,96,8093,2,79.9,2,9,5256000,65782,Ладожская,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...
19498,232418738,5673,5558,1,50.0,13,25,5600000,112000,Парнас,15,1
19499,232130909,8816,8077,1,42.1,4,11,8999999,213777,Площадь Восстания,8,1
19500,228376399,8764,8077,2,59.3,5,9,12900000,217538,Площадь Восстания,8,1
19501,222533458,8764,8077,3,106.3,4,9,26490000,249200,Площадь Восстания,5,1


In [138]:
# Some offers had an unjustifiably high price that stood out strongly from the
# rest. Such rows are removed: only offers whose price_for_meter is strictly
# below the 99.9th percentile are kept.
treshold_value = np.percentile(
    final_offers_df['price_for_meter'].values,
    99.9,
)
final_offers_df = final_offers_df.loc[
    final_offers_df['price_for_meter'].lt(treshold_value)
]
final_offers_df

Unnamed: 0,offer_id,complex_id,builder_id,rooms_cnt,square,floor,total_floors,total_price,price_for_meter,nearest_subway,minutes_to_subway,is_walk
0,227902154,96,8093,2,68.5,1,9,4360000,63650,Ладожская,19,0
1,227357682,96,8093,3,90.2,1,9,5742000,63659,Ладожская,19,0
2,223165375,96,8093,2,68.9,1,9,4386000,63657,Ладожская,19,0
3,223165368,96,8093,2,69.7,1,9,4437000,63659,Ладожская,19,0
4,223165358,96,8093,2,79.9,2,9,5256000,65782,Ладожская,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...
19498,232418738,5673,5558,1,50.0,13,25,5600000,112000,Парнас,15,1
19499,232130909,8816,8077,1,42.1,4,11,8999999,213777,Площадь Восстания,8,1
19500,228376399,8764,8077,2,59.3,5,9,12900000,217538,Площадь Восстания,8,1
19501,222533458,8764,8077,3,106.3,4,9,26490000,249200,Площадь Восстания,5,1


In [2]:
# Save the final set of offers, which will later be loaded into ClickHouse.
# Missing nearest_subway values are replaced with empty strings first.
# `assign` returns a new frame instead of writing a column into a frame that
# was obtained by boolean filtering, which avoids pandas' SettingWithCopyWarning.
final_offers_df = final_offers_df.assign(
    nearest_subway=final_offers_df['nearest_subway'].fillna(''),
)
final_offers_df.to_csv('all_offers_processed.csv', sep=';', encoding='utf-8', index=False)