<a href="https://colab.research.google.com/github/nikitazhuikov/Kaggle/blob/main/EDA_house_pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType

In [3]:
from pyspark.sql import SparkSession


# Build (or reuse) the SparkSession used by the rest of the notebook.
spark = (
    SparkSession.builder
    # Local mode, using all available cores.
    .master("local[*]")
    # Application name.
    .appName("My PySpark Application")
    # Memory settings for the driver and the executors.
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.debug.maxToStringFields", 1000)
    .getOrCreate()
)

# Confirm the session came up by printing its version.
print("Spark Version:", spark.version)

Spark Version: 3.5.5


### Загрузка данных

In [4]:
# Load the housing CSV: first row is the header, column types are inferred.
df = spark.read.options(header=True, inferSchema=True).csv("Housing.csv")

In [9]:
# Preview the first five rows.
df.show(n=5)

+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price|area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000|7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000|8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000|9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000|7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000|7420|       4|        1|      2

### Характеристика данных

In [12]:
# Summary statistics for every column; string columns have no mean,
# so they show NULL in that row.
df.summary().show()

+-------+------------------+------------------+------------------+------------------+------------------+--------+---------+--------+---------------+---------------+------------------+--------+----------------+
|summary|             price|              area|          bedrooms|         bathrooms|           stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|           parking|prefarea|furnishingstatus|
+-------+------------------+------------------+------------------+------------------+------------------+--------+---------+--------+---------------+---------------+------------------+--------+----------------+
|  count|               545|               545|               545|               545|               545|     545|      545|     545|            545|            545|               545|     545|             545|
|   mean| 4766729.247706422|  5150.54128440367|2.9651376146788992|1.2862385321100918|1.8055045871559634|    NULL|     NULL|    NULL|           NULL|           N

In [16]:
# Print the inferred schema, then the frame's dimensions.
df.printSchema()
num_rows, num_cols = df.count(), len(df.columns)
print(f"Number of rows: {num_rows}, Number of columns: {num_cols}")

root
 |-- price: integer (nullable = true)
 |-- area: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- stories: integer (nullable = true)
 |-- mainroad: string (nullable = true)
 |-- guestroom: string (nullable = true)
 |-- basement: string (nullable = true)
 |-- hotwaterheating: string (nullable = true)
 |-- airconditioning: string (nullable = true)
 |-- parking: integer (nullable = true)
 |-- prefarea: string (nullable = true)
 |-- furnishingstatus: string (nullable = true)

Number of rows: 545, Number of columns: 13


### Поиск отсутствующих значений

In [19]:
# Count NULLs per column: isNull() yields a boolean, which is cast to int
# and summed. The Spark aggregate is referenced as F.sum so this cell does
# not depend on Python's built-in `sum` having been shadowed by an import
# (the built-in would raise TypeError on a Column).
missing_values = df.select(
    [F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
missing_values.show()

+-----+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|price|area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+-----+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|    0|   0|       0|        0|      0|       0|        0|       0|              0|              0|      0|       0|               0|
+-----+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+



In [30]:
# Prints the unevaluated Column expression itself, not any row data.
print(col('price').isNull()) # TODO: think about this some more

Column<'(price IS NULL)'>


### Дополнение пустыми значениями

In [49]:
# Schema for the synthetic rows: same column names and types as `df`,
# with every field nullable.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ('price', IntegerType),
        ('area', IntegerType),
        ('bedrooms', IntegerType),
        ('bathrooms', IntegerType),
        ('stories', IntegerType),
        ('mainroad', StringType),
        ('guestroom', StringType),
        ('basement', StringType),
        ('hotwaterheating', StringType),
        ('airconditioning', StringType),
        ('parking', IntegerType),
        ('prefarea', StringType),
        ('furnishingstatus', StringType),
    ]
])

# Three hand-written rows with deliberate gaps (None becomes NULL).
df1 = spark.createDataFrame(
    [
        (4340000, None, 3, None, 2, 'yes', None, None, 'no', None, None, None, None),
        (4000000, None, 3, None, 2, None, None, None, None, None, None, None, None),
        (None, 5000, 3, None, 2, None, 'no', None, None, None, None, 'yes', 'yes'),
    ],
    schema=schema,
)
df1.show()

+-------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|  price|area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+-------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|4340000|NULL|       3|     NULL|      2|     yes|     NULL|    NULL|             no|           NULL|   NULL|    NULL|            NULL|
|4000000|NULL|       3|     NULL|      2|    NULL|     NULL|    NULL|           NULL|           NULL|   NULL|    NULL|            NULL|
|   NULL|5000|       3|     NULL|      2|    NULL|       no|    NULL|           NULL|           NULL|   NULL|     yes|             yes|
+-------+----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+



In [50]:
# Append the synthetic rows to the loaded data. unionByName matches columns
# by name rather than by position, so a difference in column order cannot
# silently misalign values.
# NOTE: this rebinds `df`, so re-running the cell appends the rows again.
df = df.unionByName(df1)
df.tail(5)

[Row(price=1750000, area=2910, bedrooms=3, bathrooms=1, stories=1, mainroad='no', guestroom='no', basement='no', hotwaterheating='no', airconditioning='no', parking=0, prefarea='no', furnishingstatus='furnished'),
 Row(price=1750000, area=3850, bedrooms=3, bathrooms=1, stories=2, mainroad='yes', guestroom='no', basement='no', hotwaterheating='no', airconditioning='no', parking=0, prefarea='no', furnishingstatus='unfurnished'),
 Row(price=4340000, area=None, bedrooms=3, bathrooms=None, stories=2, mainroad='yes', guestroom=None, basement=None, hotwaterheating='no', airconditioning=None, parking=None, prefarea=None, furnishingstatus=None),
 Row(price=4000000, area=None, bedrooms=3, bathrooms=None, stories=2, mainroad=None, guestroom=None, basement=None, hotwaterheating=None, airconditioning=None, parking=None, prefarea=None, furnishingstatus=None),
 Row(price=None, area=5000, bedrooms=3, bathrooms=None, stories=2, mainroad=None, guestroom='no', basement=None, hotwaterheating=None, aircond

### Замена пустых значений

In [None]:
# Replace NULLs column by column with a fixed default and display the result
# (the filled frame is not assigned back to `df`).
# The original cell did not parse: the dict stopped after 'hotwaterheating'.
# The values for 'hotwaterheating', 'airconditioning', 'parking', 'prefarea'
# and 'furnishingstatus' below are placeholders — TODO confirm against the data
# (e.g. use each column's most frequent value).
df.na.fill({
    'price': 4340000,
    'area': 4600,
    'bedrooms': 2,
    'bathrooms': 1,
    'stories': 2,
    'mainroad': 'yes',
    'guestroom': 'no',
    'basement': 'no',
    'hotwaterheating': 'no',
    'airconditioning': 'no',
    'parking': 0,
    'prefarea': 'no',
    'furnishingstatus': 'semi-furnished',
}).show()
