# Импорт библиотек и данных

In [1]:
import sys
import os
import pandas as pd
from pathlib import Path
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import json
import re

# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

# Use seaborn's 'darkgrid' style for all subsequent plots.
sns.set_style('darkgrid')

from IPython.display import display, display_html, HTML

# Project root, given relative to the current working directory. It is appended
# to sys.path before the project-local `src` package is imported below.
root_folder = '../'
sys.path.append(root_folder)
# from src.data import make_dataset
from src import utils
from src.features import build_features
from src.visualization import visualize

# CSV read by this notebook (interim) and CSV written by it (baseline).
interim_data = Path(root_folder) / 'data' / 'interim' / '1.0_first_process.csv'
baseline_data = Path(root_folder) / 'data' / 'processed' / '1.0_baseline.csv'

In [2]:
# Read the interim CSV (its first column is the index) and pass the frame
# through build_features.get_df_with_lists.
# NOTE(review): presumably the helper restores list-valued columns that the CSV
# stores as strings -- confirm in src/features. The original author flagged this
# step as required for the later list analysis.
df = (
    pd.read_csv(interim_data, index_col=0)
    .pipe(build_features.get_df_with_lists)
)
df.info()
display(df)

<class 'pandas.core.frame.DataFrame'>
Index: 374306 entries, 0 to 374305
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   status                333918 non-null  object 
 1   street                372469 non-null  object 
 2   baths                 267467 non-null  float64
 3   fireplace             374306 non-null  bool   
 4   city                  374272 non-null  object 
 5   sqft                  333384 non-null  float64
 6   zipcode               374306 non-null  object 
 7   beds                  277504 non-null  float64
 8   state                 374306 non-null  object 
 9   stories               221109 non-null  float64
 10  target                374306 non-null  int64  
 11  marked_interior_area  374306 non-null  bool   
 12  private_pool          374306 non-null  bool   
 13  mls                   374306 non-null  bool   
 14  property_type         333572 non-null  object 
 15  year_

Unnamed: 0,status,street,baths,fireplace,city,sqft,zipcode,beds,state,stories,...,school_rating,school_distance,school_grades,school_name,parking_type,parking_count,central_heating,heating_type,central_cooling,cooling_type
0,active,240 Heather Ln,3.5,True,Southern Pines,2900.0,28387,4.0,NC,,...,"[4, 4, 7, NR, 4, 7, NR, NR]","[2.7 mi, 3.6 mi, 5.1 mi, 4.0 mi, 10.5 mi, 12.6...","[3–5, 6–8, 9–12, PK–2, 6–8, 9–12, PK–5, K–12]","[Southern Pines Elementary School, Southern Mi...",,,True,pump,False,
1,sale,12911 E Heroy Ave,3.0,False,Spokane Valley,1947.0,99216,3.0,WA,2.0,...,"[4/10, null/10, 4/10]","[1.65mi, 1.32mi, 1.01mi]","[9-12, 3-8, PK-8]","[East Valley High School&Extension, Eastvalley...",,,False,,False,
2,sale,2005 Westridge Rd,2.0,True,Los Angeles,3000.0,90049,3.0,CA,1.0,...,"[8/10, 4/10, 8/10]","[1.19mi, 2.06mi, 2.63mi]","[6-8, K-5, 9-12]","[Paul Revere Middle School, Brentwood Science ...",garage,1.0,True,central,True,central
3,sale,4311 Livingston Ave,8.0,True,Dallas,6457.0,75205,5.0,TX,3.0,...,"[9/10, 9/10, 10/10, 9/10]","[1.05mi, 0.1mi, 1.05mi, 0.81mi]","[5-6, PK-4, 7-8, 9-12]","[Mcculloch Intermediate School, Bradfield Elem...",garage,1.0,True,central,True,central
4,sale,1524 Kiscoe St,,False,Palm Bay,,32908,,FL,,...,"[4/10, 5/10, 5/10]","[5.96mi, 3.25mi, 3.03mi]","[7-8, 9-12, PK-6]","[Southwest Middle School, Bayside High School,...",,,False,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374301,,20800 NE 23rd Ave,6.0,False,Miami,4017.0,33180,5.0,FL,,...,"[10/10, 5/10]","[32.1 mi, 1.1 mi]","[PK-8, 9-12]","[Air Base Elementary School, Dr Michael M. Kro...",other,2.0,False,other,True,central
374302,sale,3530 N Lake Shore Dr #4B,3.0,False,Chicago,2000.0,60657,3.0,IL,9.0,...,"[1/10, 5/10, 7/10]","[10.61mi, 1.42mi, 0.4mi]","[9-12, 9-12, PK-8]","[Hope College Prep High School, Lake View High...",,,False,radiant,False,
374303,sale,15509 Linden Blvd,3.0,False,Jamaica,1152.0,11434,3.0,NY,2.0,...,"[5/10, 4/10]","[0.48mi, 0.73mi]","[PK-5, 6-8]","[Ps 48 William Wordsworth, Jhs 8 Richard S Gro...",other,2.0,False,other,False,
374304,,7810 Pereida St,,False,Houston,,77028,,TX,,...,"[NA, NA, NA]","[1.3 mi, 0.5 mi, 1.9 mi]","[PK-5, 6-8, 9-12]","[Hiliard El, Forest Brook Middle, North Forest...",,,False,,False,


# Baseline

Подготовим "baseline"-датасет для линейной регрессии с минимумом предварительной обработки данных.
Оставим только числовые и логические признаки.

In [3]:
# Baseline feature set: exclude every object-dtype column (leaving the numeric
# and boolean ones), then drop two further columns by name.
cols_to_drop = ['marked_interior_area', 'remodeled_year']
base_df = (
    df
    .select_dtypes(exclude='object')
    .drop(columns=cols_to_drop)
)
base_df.info()

# Create the target directory if needed so the export also works on a fresh
# checkout where it does not exist yet.
baseline_data.parent.mkdir(parents=True, exist_ok=True)
base_df.to_csv(baseline_data)

# The preview must be the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded without being rendered.
base_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 374306 entries, 0 to 374305
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   baths            267467 non-null  float64
 1   fireplace        374306 non-null  bool   
 2   sqft             333384 non-null  float64
 3   beds             277504 non-null  float64
 4   stories          221109 non-null  float64
 5   target           374306 non-null  int64  
 6   private_pool     374306 non-null  bool   
 7   mls              374306 non-null  bool   
 8   year_built       311041 non-null  float64
 9   lotsize          279341 non-null  float64
 10  parking_count    183076 non-null  float64
 11  central_heating  374306 non-null  bool   
 12  central_cooling  374306 non-null  bool   
dtypes: bool(5), float64(7), int64(1)
memory usage: 27.5 MB
