# Командная работа

По дисциплине "Машинное обучение и большие данные". Мухитова Азалия, Каспранов Камиль - 22П-2

### План выполнения работы

#### 1. Определение структуры данных  
- Выбор ключевых атрибутов
- Обоснование включения/исключения данных  

#### 2. Предобработка текста  
- Очистка текста:  
  - Удаление стоп-слов, спецсимволов, лишних пробелов
  - Приведение к нижнему регистру 
  - Лемматизация

#### 3. Тематическое моделирование  
- Анализ ключевых слов для каждого класса (положительные/отрицательные отзывы)
- Визуализация:  
  - Облака слов 
  - LDA, LSA модели

#### 4. Векторизация текста  
- Преобразование текста в числовые векторы:  
  - TF-IDF

#### 5. Классификация Ансамблевыми методами 
  - Стекинг  
  - Беггинг
  - Бустинг 




In [8]:
import pandas as pd
import string
import re
import numpy as np
import nltk
import pymorphy3

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df =pd.read_csv("real_estate_data.csv")


  df =pd.read_csv("real_estate_data.csv")


In [3]:
df.head()

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,furnished,heating_type,price,price_currency
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,3500.0,TRY
1,2,Konut,Daire,2/13/19,,1,14,0,20 ve üzeri,20 ve üzeri,1+0,43.0,İstanbul/Kartal/Kordonboyu,,Fancoil,490000.0,TRY
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0,1,Yüksek Giriş,2+1,,Tekirdağ/Çorlu/Reşadiye,,Fancoil,155000.0,TRY
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3,20 ve üzeri,20 ve üzeri,6+1,450.0,İstanbul/Beşiktaş/Levent,,Fancoil,32500000.0,TRY
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0,20 ve üzeri,2,2+1,90.0,İstanbul/Kartal/Kordonboyu,,Fancoil,1450000.0,TRY


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403487 entries, 0 to 403486
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 403487 non-null  int64  
 1   type               403487 non-null  object 
 2   sub_type           403487 non-null  object 
 3   start_date         403487 non-null  object 
 4   end_date           266298 non-null  object 
 5   listing_type       403487 non-null  int64  
 6   tom                403487 non-null  int64  
 7   building_age       376097 non-null  object 
 8   total_floor_count  375466 non-null  object 
 9   floor_no           368191 non-null  object 
 10  room_count         403487 non-null  object 
 11  size               257481 non-null  float64
 12  address            403487 non-null  object 
 13  furnished          0 non-null       float64
 14  heating_type       375517 non-null  object 
 15  price              402772 non-null  float64
 16  pr

### Описание структуры датасета недвижимости

| № | Столбец | Описание |
|---|---------|----------|
| 1 | `id` | Уникальный идентификатор объекта недвижимости |
| 2 | `type` | Тип свойства: `Konut` (Жилье) |
| 3 | `sub_type` | Подтип свойства: `Daire` (Квартира), `Rezidans` (Жилой комплекс) |
| 4 | `start_date` | Дата начала активности объявления |
| 5 | `end_date` | Дата окончания активности объявления (если применимо) |
| 6 | `listing_type` | Тип листинга: `1` - Продажа, `2` - Аренда |
| 7 | `tom` | Время на рынке (в днях) |
| 8 | `building_age` | Возраст здания (текстовое описание) |
| 9 | `total_floor_count` | Общее количество этажей в здании |
| 10 | `floor_no` | Номер этажа объекта |
| 11 | `room_count` | Количество комнат (формат: "2+1") |
| 12 | `size` | Площадь объекта в м² |
| 13 | `address` | Адрес объекта (город/район/окружение) |
| 14 | `furnished` | Меблировка (всегда пусто) |
| 15 | `heating_type` | Тип отопления |
| 16 | `price` | Цена объекта |
| 17 | `price_currency` | Валюта цены: `TRY`, `USD`, `EUR` |

In [5]:
# 1. Удаление полностью пустых колонок
df.drop(columns=['furnished'], inplace=True, errors='ignore')

In [9]:
# 2. Специфическая обработка текстовых полей
def parse_room_count(room_str):
    """Парсинг формата комнат: 2+1, 3+1 и т.д."""
    if pd.isna(room_str):
        return np.nan
    
    room_str = str(room_str).strip()
    # Ищем паттерн типа "2+1", "3+1" и т.д.
    match = re.search(r'(\d+)\s*\+\s*(\d+)', room_str)
    if match:
        rooms = int(match.group(1))
        living_rooms = int(match.group(2))
        return rooms + living_rooms  # Общее количество комнат
    else:
        # Пробуем извлечь просто число
        try:
            return float(room_str)
        except:
            return np.nan

def parse_floor(floor_str):
    """Парсинг информации о этаже"""
    if pd.isna(floor_str):
        return np.nan
    
    floor_str = str(floor_str).strip().lower()
    
    # Специальные случаи этажей
    special_floors = {
        'yüksek giriş': 1,  # 1-й этаж
        'giriş kat': 0, 'kot1': 0, 'zemin kat': 0,  # цокольный этаж
        'çatı katı': -1,  # мансарда (помечаем специальным значением)
        'bahçe katı': -2   # садовый этаж
    }
    
    for key, value in special_floors.items():
        if key in floor_str:
            return value
    
    # Пробуем извлечь числовое значение
    try:
        return float(re.search(r'\d+', floor_str).group())
    except:
        return np.nan

def parse_building_age(age_str):
    """Парсинг возраста здания"""
    if pd.isna(age_str):
        return np.nan
    
    age_str = str(age_str).strip().lower()
    
    # Специальные категории
    if 'sıfır' in age_str or '0' in age_str:
        return 0  # Новостройка
    elif '1-5' in age_str or '1 ile 5' in age_str:
        return 3  # Среднее значение диапазона
    elif '6-10' in age_str:
        return 8
    elif '11-15' in age_str:
        return 13
    elif '16-20' in age_str:
        return 18
    elif '20+' in age_str or '21' in age_str:
        return 25  # Приблизительное значение для старых зданий
    else:
        # Пробуем извлечь конкретное число
        try:
            return float(re.search(r'\d+', age_str).group())
        except:
            return np.nan

In [10]:
# Применяем парсинг к текстовым полям
df['room_count'] = df['room_count'].apply(parse_room_count)
df['floor_no'] = df['floor_no'].apply(parse_floor)
df['building_age_numeric'] = df['building_age'].apply(parse_building_age)

In [11]:
df.head()

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,room_count,size,address,heating_type,price,price_currency,building_age_numeric
0,1,Konut,Rezidans,12/10/18,1/9/19,2,30,0,20 ve üzeri,2.0,3.0,90.0,İstanbul/Kartal/Kordonboyu,Fancoil,3500.0,TRY,0.0
1,2,Konut,Daire,2/13/19,,1,14,0,20 ve üzeri,20.0,1.0,43.0,İstanbul/Kartal/Kordonboyu,Fancoil,490000.0,TRY,0.0
2,3,Konut,Daire,10/9/18,11/8/18,1,30,0,1,1.0,3.0,,Tekirdağ/Çorlu/Reşadiye,Fancoil,155000.0,TRY,0.0
3,4,Konut,Rezidans,9/10/18,10/10/18,1,30,3,20 ve üzeri,20.0,7.0,450.0,İstanbul/Beşiktaş/Levent,Fancoil,32500000.0,TRY,3.0
4,5,Konut,Rezidans,12/10/18,1/9/19,1,30,0,20 ve üzeri,2.0,3.0,90.0,İstanbul/Kartal/Kordonboyu,Fancoil,1450000.0,TRY,0.0


In [12]:
# 3. Обработка пропущенных значений
# Для числовых признаков
numeric_cols = ['size', 'price', 'room_count', 'floor_no', 'building_age_numeric']
for col in numeric_cols:
    if col in df.columns:
        df[col].fillna(df[col].median(), inplace=True)

# Для категориальных признаков
categorical_cols = ['heating_type', 'price_currency', 'building_age']
for col in categorical_cols:
    if col in df.columns:
        df[col].fillna('Unknown', inplace=True)

# Для дат
df['end_date'].fillna('Active', inplace=True)  # Активные объявления

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [13]:
# 4. Преобразование типов данных
# Дата-время
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')

# Категориальные переменные с учетом турецких значений
categorical_mappings = {
    'type': {'Konut': 'Housing'},
    'sub_type': {'Daire': 'Flat'},
    'listing_type': {1: 'Sale', 2: 'Rent'},  # Предполагая кодировку
    'price_currency': {'TL': 'TRY', 'USD': 'USD', 'EUR': 'EUR'}
}

for col, mapping in categorical_mappings.items():
    if col in df.columns:
        df[col] = df[col].replace(mapping)
        df[col] = df[col].astype('category')

  df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
  df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')


In [14]:
# 5. Обработка heating_type с группировкой похожих типов
heating_mapping = {
    'Kalorifer (Doğalgaz)': 'Central_Gas',
    'Kalorifer (Kömür)': 'Central_Coal', 
    'Kombi (Doğalgaz)': 'Combi_Gas',
    'Kombi (Elektrikli)': 'Combi_Electric',
    'Merkezi Sistem': 'Central',
    'Merkezi Sistem (Isı Payı Ölçer)': 'Central',
    'Yerden Isıtma': 'Floor_Heating',
    'Soba (Kömür)': 'Stove_Coal',
    'Soba (Doğalgaz)': 'Stove_Gas',
    'Klima': 'Air_Conditioning',
    'Fancoil': 'Air_Conditioning',
    'Kat Kaloriferi': 'Central',
    'Kalorifer (Akaryakıt)': 'Central_Fuel',
    'Güneş Enerjisi': 'Solar',
    'Jeotermal': 'Geothermal',
    'Yok': 'None'
}

if 'heating_type' in df.columns:
    df['heating_type_grouped'] = df['heating_type'].map(heating_mapping)
    df['heating_type_grouped'].fillna('Other', inplace=True)
    df['heating_type_grouped'] = df['heating_type_grouped'].astype('category')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['heating_type_grouped'].fillna('Other', inplace=True)


In [16]:
# 6. Извлечение географических признаков из адреса
if 'address' in df.columns:
    # Разбиваем адрес на компоненты (предполагая формат: Город, Район, Район)
    address_parts = df['address'].str.split(',', expand=True)
    
    if address_parts is not None:
        df['city'] = address_parts[0].str.strip()
        df['district'] = address_parts[1].str.strip() if len(address_parts.columns) > 1 else 'Unknown'
        df['neighborhood'] = address_parts[2].str.strip() if len(address_parts.columns) > 2 else 'Unknown'
        
        df['city'] = df['city'].astype('category')
        df['district'] = df['district'].astype('category')

In [20]:
df.head()

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,...,size,address,heating_type,price,price_currency,building_age_numeric,heating_type_grouped,city,district,neighborhood
0,1,Housing,Rezidans,2018-12-10,2019-01-09,Rent,30,0,20 ve üzeri,2.0,...,90.0,İstanbul/Kartal/Kordonboyu,Fancoil,3500.0,TRY,0.0,Air_Conditioning,İstanbul/Kartal/Kordonboyu,Unknown,Unknown
1,2,Housing,Flat,2019-02-13,NaT,Sale,14,0,20 ve üzeri,20.0,...,43.0,İstanbul/Kartal/Kordonboyu,Fancoil,490000.0,TRY,0.0,Air_Conditioning,İstanbul/Kartal/Kordonboyu,Unknown,Unknown
2,3,Housing,Flat,2018-10-09,2018-11-08,Sale,30,0,1,1.0,...,110.0,Tekirdağ/Çorlu/Reşadiye,Fancoil,155000.0,TRY,0.0,Air_Conditioning,Tekirdağ/Çorlu/Reşadiye,Unknown,Unknown
3,4,Housing,Rezidans,2018-09-10,2018-10-10,Sale,30,3,20 ve üzeri,20.0,...,450.0,İstanbul/Beşiktaş/Levent,Fancoil,32500000.0,TRY,3.0,Air_Conditioning,İstanbul/Beşiktaş/Levent,Unknown,Unknown
4,5,Housing,Rezidans,2018-12-10,2019-01-09,Sale,30,0,20 ve üzeri,2.0,...,90.0,İstanbul/Kartal/Kordonboyu,Fancoil,1450000.0,TRY,0.0,Air_Conditioning,İstanbul/Kartal/Kordonboyu,Unknown,Unknown


In [21]:
# 7. Создание новых признаков
# Время на рынке в днях (используем текущую дату через pd.Timestamp.now())
if 'start_date' in df.columns:
    current_date = pd.Timestamp.now()
    df['days_on_market'] = (current_date - df['start_date']).dt.days
    df['days_on_market'].fillna(df['days_on_market'].median(), inplace=True)

# Цена за квадратный метр
if all(col in df.columns for col in ['price', 'size']):
    df['price_per_sqm'] = df['price'] / df['size']
    # Убираем некорректные значения
    df = df[(df['price_per_sqm'] > 0) & (df['price_per_sqm'] != np.inf)]

# Признак активности объявления
df['is_active'] = df['end_date'] == 'Active'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['days_on_market'].fillna(df['days_on_market'].median(), inplace=True)


In [22]:
# 8. Обработка выбросов
def remove_price_outliers(df):
    """Удаление выбросов в цене с учетом типа недвижимости"""
    if 'price' not in df.columns or 'sub_type' not in df.columns:
        return df
    
    # Убираем явно ошибочные значения
    df = df[df['price'] > 100]  # Минимальная reasonable цена

        # Убираем крайние выбросы по квартирному типу
    if 'Flat' in df['sub_type'].values:
        flat_mask = df['sub_type'] == 'Flat'
        df_flat = df[flat_mask]
        if len(df_flat) > 0:
            Q1 = df_flat['price'].quantile(0.01)
            Q3 = df_flat['price'].quantile(0.99)
            df = df[~flat_mask | ((df['price'] >= Q1) & (df['price'] <= Q3))]
    
    return df

In [23]:
df = remove_price_outliers(df)

In [24]:
df.head()

Unnamed: 0,id,type,sub_type,start_date,end_date,listing_type,tom,building_age,total_floor_count,floor_no,...,price,price_currency,building_age_numeric,heating_type_grouped,city,district,neighborhood,days_on_market,price_per_sqm,is_active
0,1,Housing,Rezidans,2018-12-10,2019-01-09,Rent,30,0,20 ve üzeri,2.0,...,3500.0,TRY,0.0,Air_Conditioning,İstanbul/Kartal/Kordonboyu,Unknown,Unknown,2457,38.888889,False
1,2,Housing,Flat,2019-02-13,NaT,Sale,14,0,20 ve üzeri,20.0,...,490000.0,TRY,0.0,Air_Conditioning,İstanbul/Kartal/Kordonboyu,Unknown,Unknown,2392,11395.348837,False
2,3,Housing,Flat,2018-10-09,2018-11-08,Sale,30,0,1,1.0,...,155000.0,TRY,0.0,Air_Conditioning,Tekirdağ/Çorlu/Reşadiye,Unknown,Unknown,2519,1409.090909,False
3,4,Housing,Rezidans,2018-09-10,2018-10-10,Sale,30,3,20 ve üzeri,20.0,...,32500000.0,TRY,3.0,Air_Conditioning,İstanbul/Beşiktaş/Levent,Unknown,Unknown,2548,72222.222222,False
4,5,Housing,Rezidans,2018-12-10,2019-01-09,Sale,30,0,20 ve üzeri,2.0,...,1450000.0,TRY,0.0,Air_Conditioning,İstanbul/Kartal/Kordonboyu,Unknown,Unknown,2457,16111.111111,False
