In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Folder holding the raw CSV inputs, and the base name (no extension) of the
# file this run processes; FILE is reused when the result is written out.
data_root = Path('./data/raw/')
FILE = 'test'

# Read the raw table and preview its first five rows.
raw_csv_path = data_root.joinpath(f'{FILE}.csv')
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,city,floor,id,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,...,reform_mean_year_building_1000,reform_mean_year_building_500,region,lat,lng,total_square,street,date,realty_type,price_type
0,Курск,1.0,COL_289284,7,55,85,117,0,0,0,...,1966.471591,1966.74026,Курская область,51.709255,36.147908,156.148996,S6983,2020-09-06,100,1
1,Сургут,1.0,COL_289305,8,70,112,140,0,0,0,...,1988.259259,1989.068182,Ханты-Мансийский АО,61.23324,73.462509,190.737943,S29120,2020-09-06,110,1
2,Тюмень,-1.0,COL_289318,3,28,67,122,0,0,0,...,1985.880282,1991.458333,Тюменская область,57.14311,65.554573,457.118051,S23731,2020-09-06,10,1
3,Иркутск,1.0,COL_289354,5,76,139,231,0,0,0,...,1947.073276,1941.657895,Иркутская область,52.28138,104.282975,66.503622,S14207,2020-09-06,100,1
4,Курск,,COL_289399,8,105,189,279,0,0,2,...,1948.764151,1946.689655,Курская область,51.729706,36.194019,23.864915,S20658,2020-09-06,10,1


In [3]:
from feature_extractors.floor import parse_floor
from feature_extractors.new_osm_features import (
    min_distance_to_city,
    min_distance_to_transport
)
from feature_extractors.ipoteka_feature import mortage_rate
from feature_extractors.stat_feature_processing import is_subway, subway_stations, preprocessing

In [4]:
# Boolean indicator columns derived from the population of the nearest city.
# Each comparison runs on the whole column at once instead of going through a
# per-row `.apply(lambda ...)`; the thresholds are unchanged.
# The "greater than" flags are cumulative rather than exclusive: a population
# above 1.2e7 also sets every lower "greater than" flag. `is_nano_city` is the
# only "less than" flag.
nearest_city_population = df['osm_city_nearest_population']
df = df.assign(is_huge_city=(nearest_city_population > 1.2*1e7),
               is_big_city=(nearest_city_population > 0.4*1e7),
               is_medium_city=(nearest_city_population > 1_000_000),
               is_small_city=(nearest_city_population > 400_000),
               is_micro_city=(nearest_city_population > 200_000),
               is_nano_city=(nearest_city_population < 100_000))

In [5]:
# Derived columns, built in a fixed order and attached with a single `assign`.
# `parse_floor`, `min_distance_to_city`, `min_distance_to_transport`,
# `is_subway` and `subway_stations` are project helpers from `feature_extractors`.
engineered_columns = {
    # Missing values in the parsed floor are filled with 0.
    'clear_floor': df['floor'].apply(parse_floor).fillna(0),
    # NOTE(review): the transport-stop distance column is handed to the *city*
    # helper here — confirm that this column choice is intended.
    'min_dist_to_city': min_distance_to_city(
        df, 'osm_city_closest_dist', 'osm_transport_stop_closest_dist'),
    'min_dist_to_transport': min_distance_to_transport(
        df, 'osm_subway_closest_dist', 'osm_transport_stop_closest_dist'),
    # Both subway features are looked up per row from the city name.
    'has_subway': df['city'].apply(is_subway),
    'n_subways': df['city'].apply(subway_stations),
}
df = df.assign(**engineered_columns)

In [6]:
# log features
from tqdm import tqdm

# Every float64 column is median-imputed in place (median of the currently
# loaded file) and gets a log1p-transformed companion named `log_<column>`.
# The `per_square_meter_price` column, when present, is left untouched.
# NOTE(review): np.log1p gives -inf at -1 and NaN below -1 — confirm that no
# float64 column processed here can hold such values.
EXCLUDED_FEATURE = 'per_square_meter_price'
numeric_features = df.select_dtypes(include='float64').columns.tolist()
for column in tqdm(numeric_features):
    if column != EXCLUDED_FEATURE:
        filled = df[column].fillna(df[column].median())
        df.loc[:, column] = filled
        df.loc[:, f'log_{column}'] = np.log1p(filled)
# Disabled experiment (inverse feature):
#     df.loc[:, f'1/{column}'] = 1 / (df[column] + 0.0001)

100%|██████████| 18/18 [00:00<00:00, 553.10it/s]


In [7]:
# Sanity check: how many missing values remain in the nearest-city population column.
# The explicit median fill for this column is kept disabled:
# df.loc[:, 'osm_city_nearest_population'] = df.osm_city_nearest_population.fillna(df.osm_city_nearest_population.median())
df['osm_city_nearest_population'].isna().sum()

0

In [8]:
# Collect and show the names of the columns that still contain at least one
# missing value.
has_missing = df.isna().any(axis=0)
na_cols = df.columns[has_missing].tolist()
print(na_cols)
# Dropping those columns is left disabled:
# df = df.drop(na_cols, axis=1)

['floor', 'street']


In [9]:
# Write the engineered table as a gzip-compressed CSV without the row index.
# The output directory is created first so that a checkout without
# `data/featured/` does not make `to_csv` fail on a missing directory.
featured_path = Path('data/featured') / f'{FILE}.csv.gz'
featured_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(featured_path, index=False, compression='gzip')