# Ноутбук для разведочного анализа данных

In [None]:
import numpy as np
import pandas as pd
# Make pandas treat +/-inf as missing values (presumably so that infinities
# produced by the log transforms used below are skipped like NaN).
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases —
# confirm it still exists in the pinned pandas version.
pd.options.mode.use_inf_as_na = True

import plotly.express as px
import plotly.graph_objects as go

## Данные

In [None]:
# Load the training set: `date` is parsed as datetimes and the `floor`
# column is skipped while reading.
df = pd.read_csv(
    'data/raifhack_train.csv',
    parse_dates=['date'],
    usecols=lambda column: column != 'floor',
)

In [None]:
# Two kinds of price labels exist:
#   0 = parsed from listings
#   1 = expert appraisals
# Listing prices are very noisy, so only the expert appraisals are kept.
# There are also fewer of them (about 5000), which shortens training time.
is_expert = df['price_type'].eq(1)
df = df.loc[is_expert]

In [None]:
# Rows dated in month 7 or 8 form the hold-out set; all other rows are
# used for training.
for_test = df['date'].dt.month.isin([7, 8])

# Share of rows that land in the hold-out set. It is printed explicitly:
# a bare expression in the middle of a cell is evaluated and then
# silently discarded, so the value was never shown.
print('test share:', for_test.mean())

df_train = df[~for_test]
df_test = df[for_test]

In [None]:
# Show the column labels of the filtered frame.
df.columns

## Признаки

**Корреляции отдельных переменных**

In [None]:
# Log-log scatter of total area against price per square metre,
# with an OLS trendline drawn in black.
log_total_square = df['total_square'].apply(np.log)
log_price_per_sqm = df['per_square_meter_price'].apply(np.log)

fig = px.scatter(
    x=log_total_square,
    y=log_price_per_sqm,
    trendline='ols',
    trendline_color_override='black',
    template='plotly_white',
)
# Use small markers on the first trace of the figure.
fig.data[0].marker.size = 3
fig.show()

**Двумерная гистограмма**

In [None]:
# Same pair of log-transformed variables as a 2-D histogram
# (100 bins along x, 50 bins along y).
log_total_square = df['total_square'].apply(np.log)
log_price_per_sqm = df['per_square_meter_price'].apply(np.log)

fig = px.density_heatmap(
    x=log_total_square,
    y=log_price_per_sqm,
    nbinsx=100,
    nbinsy=50,
    color_continuous_scale='blues',
    template='plotly_white',
)
fig.show()

**Нужен ли логарифм**

In [None]:
def should_use_func(x, y, func=np.log):
    """Print how the x/y correlation changes when ``func`` is applied.

    Four correlations are printed, one per line: with no transform, with
    ``func`` applied to ``y`` only, to ``x`` only, and to both. Each line
    is labelled with ``func.__name__``.

    Args:
        x: pandas Series holding the feature values.
        y: pandas Series holding the target values.
        func: callable applied element-wise via ``Series.apply``;
            defaults to ``np.log``.

    Returns:
        None. The results are only printed.
    """
    label = func.__name__
    transformed_x = x.apply(func)
    transformed_y = y.apply(func)

    # (caption, left-hand series, right-hand series) in print order.
    report = (
        (f'No {label}: ', y, x),
        (f' Y {label}: ', transformed_y, x),
        (f' X {label}: ', y, transformed_x),
        (f'XY {label}: ', transformed_y, transformed_x),
    )
    for caption, target, feature in report:
        print(caption, target.corr(feature))

In [None]:
# Features whose correlation with the price is compared with and
# without a log transform.
features = [
    'osm_city_closest_dist',
    'osm_city_nearest_population',
    'osm_crossing_closest_dist',
    'osm_subway_closest_dist',
    'osm_train_stop_closest_dist',
    'osm_transport_stop_closest_dist',
    'reform_mean_floor_count_1000',
    'total_square',
]

price_per_sqm = df['per_square_meter_price']
for feature_name in features:
    print(feature_name)
    should_use_func(x=df[feature_name], y=price_per_sqm)
    print('--')

**Смотрим категориальные переменные**

In [None]:
# Mean price per square metre for each region (training rows only).
region_mean_price = (
    df_train
    .groupby('region')
    .agg({'per_square_meter_price': 'mean'})
    .squeeze()
)
# Keep the 30 regions with the highest mean, in ascending order.
top_regions = region_mean_price.sort_values().tail(30)

fig = px.bar(top_regions, orientation='h', template='plotly_white')
fig.update_layout(height=600)
fig.show()

In [None]:
# Row count and mean price per square metre for each nearest city.
city_stats = df.groupby('osm_city_nearest_name').agg(
    count=('per_square_meter_price', 'count'),
    mean=('per_square_meter_price', 'mean'),
)
# Take the 50 cities with the most rows, then order them by mean price.
most_frequent_cities = city_stats.sort_values('count', ascending=False).head(50)
px.bar(most_frequent_cities['mean'].sort_values())

## Корреляции

In [None]:
# Boolean mask of the columns that are EXCLUDED from the correlation
# matrix: any column whose name contains a dot or the substring 'price'.
# (Despite its name, the mask is negated when it is used below.)
corr_features = (
    df_train.columns.str.contains('.', regex=False)
    | df_train.columns.str.contains('price', regex=False)
)

# Spearman rank correlation between the remaining columns.
# The frame is restricted to numeric dtypes first: it still contains
# non-numeric columns (at least the parsed `date`), and `DataFrame.corr`
# raises on those in pandas versions where `numeric_only` defaults to False.
corr = (
    df_train.loc[:, ~corr_features]
    .select_dtypes(include=np.number)
    .corr(method='spearman')
)

In [None]:
# px.box(corr.abs().values.flatten())

In [None]:
# Heatmap of the correlation matrix on a red-white-blue scale
# whose midpoint is fixed at zero.
px.imshow(
    corr.to_numpy(),
    color_continuous_scale=['red', 'white', 'blue'],
    color_continuous_midpoint=0,
)

## PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA input: numeric training columns whose name contains a dot.
# (An alternative that was tried and left disabled: columns whose name
# ends in '00'.)
numeric_train = df_train.select_dtypes(include=np.number)
has_dot_in_name = numeric_train.columns.str.contains('.', regex=False)
df_pca = numeric_train.loc[:, has_dot_in_name].apply(np.log1p)

# Standardise every column, then fill the remaining gaps with the
# mean of the standardised column.
column_means = df_pca.mean()
column_stds = df_pca.std()
df_pca = (df_pca - column_means) / column_stds
df_pca = df_pca.fillna(df_pca.mean())

In [None]:
# Display the prepared PCA input.
df_pca

In [None]:
# PCA with default settings (no explicit limit on the number of components).
pca = PCA()

In [None]:
# Fit the PCA and overwrite `df_pca` with the projected data.
# NOTE(review): by scikit-learn default the result is a plain array, so the
# pandas row index of the input is lost from here on — confirm that no
# pandas output configuration is active.
df_pca = pca.fit_transform(df_pca)

In [None]:
# Bar chart of the explained-variance ratio of each fitted component.
px.bar( pca.explained_variance_ratio_ )

In [None]:
# Wrap the PCA output in a DataFrame and attach two reference columns:
#   X = log1p of the 'osm_catering_points_in_0.01' feature
#   Y = log of the price per square metre
# The PCA output carries no row labels, so the frame is given
# `df_train.index` explicitly. Without it the frame gets a fresh 0..n-1
# index, and `assign` (which aligns Series by index) would pair the
# components with the wrong rows or fill X/Y with NaN.
df_pca = pd.DataFrame(df_pca, index=df_train.index).assign(
    X=df_train['osm_catering_points_in_0.01'].apply(np.log1p),
    Y=df_train['per_square_meter_price'].apply(np.log)
)

In [None]:
# Column 0 of the PCA frame against the log price, with an OLS trendline
# in black. At most 10,000 rows are plotted; the sample size is capped at
# the frame length because `sample` raises when asked for more rows than
# exist (the expert-only subset is only a few thousand rows).
px.scatter(
    df_pca.sample(min(10_000, len(df_pca))),
    x=0,
    y='Y',
    trendline='ols',
    trendline_color_override='black'
)

In [None]:
# The reference feature X against the log price Y, with an OLS trendline
# in black. At most 10,000 rows are plotted; the sample size is capped at
# the frame length because `sample` raises when asked for more rows than
# exist (the expert-only subset is only a few thousand rows).
px.scatter(
    df_pca.sample(min(10_000, len(df_pca))),
    x='X',
    y='Y',
    trendline='ols',
    trendline_color_override='black'
)

## Кластеризация по координатам

**Смотрим, какие признаки можно сделать по координатам**

**в модели не используется**

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Reproducible working sample of at most 20,000 rows. The size is capped
# at the frame length because `sample` raises when asked for more rows
# than the frame holds.
temp = df.sample(min(20_000, len(df)), random_state=0)

In [None]:
# Log of the price per square metre.
# NOTE(review): `y` is not read again in the visible part of the notebook.
temp['y'] = temp['per_square_meter_price'].apply(np.log)

In [None]:
# Clip both coordinates to their 0.1%-99.9% quantile range so that a few
# extreme points do not stretch the picture.
for coord in ('lat', 'lng'):
    lower, upper = temp[coord].quantile([0.001, 0.999]).to_numpy()
    temp[coord] = temp[coord].clip(lower=lower, upper=upper)

In [None]:
# Group the objects into 1000 clusters by their coordinates. The seed is
# fixed (like the seed of the sample above) so that cluster ids and the
# derived feature are the same on every run.
clust = KMeans(n_clusters=1000, random_state=0)
temp['cluster'] = clust.fit_predict(temp[['lat', 'lng']])

# Mean price per square metre inside each cluster, mapped back to the rows.
cluster_price = temp.groupby('cluster')['per_square_meter_price'].mean()
temp['cluster_mean'] = temp['cluster'].map(cluster_price)

In [None]:
# 3-D view: position on the map against the mean price of the cluster,
# which also drives the colour (scale runs from 'darkred' to 'lime').
price_colour_scale = ['darkred', 'red', 'orange', 'yellow', 'lime']
px.scatter_3d(
    temp,
    x='lng',
    y='lat',
    z='cluster_mean',
    color='cluster_mean',
    color_continuous_scale=price_colour_scale,
    template='plotly_white',
)

In [None]:
# Distinct per-cluster mean prices, sorted in ascending order, as a bar chart.
px.bar( np.sort(temp['cluster_mean'].unique()) )

In [None]:
# Price of each object against the mean price of its coordinate cluster,
# with an OLS trendline drawn in black.
px.scatter(
    temp,
    x='cluster_mean',
    y='per_square_meter_price',
    trendline='ols',
    trendline_color_override='black'
)