In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import re
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.datasets import fetch_20newsgroups, load_files

import pandas as pd
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

from tqdm.auto import tqdm

# Load the raw dataset. It is kept as `df_orig` so later cells can copy it
# and compare shapes against the processed frame.
df_orig = pd.read_csv('./data/data.csv')

In [None]:
# Nominatim geocoding client that the geocoding cell passes to addr_to_coords.
# NOTE(review): addr_to_coords rebinds its `geolocator` parameter to a client
# for localhost:8080, so this instance appears to be unused in practice - confirm.
geolocator = Nominatim(user_agent="my_geocoder")

def convert(cell_val):
    """
    Extract the first number found in a text cell and return it as a float.

    Thousands separators (commas) are stripped before conversion, so
    "$1,215,000" becomes 1215000.0 and "1,234.5 sqft" becomes 1234.5.

    Args:
        cell_val: Text to scan. Non-string values are converted with str()
            first; None and float NaN are treated as "no number".

    Returns:
        float | None: The first number in the text, or None when the text
        contains no digits.
    """
    # Missing values carry no number (NaN is the only float that != itself).
    if cell_val is None:
        return None
    if isinstance(cell_val, float) and cell_val != cell_val:
        return None

    # Digits, ANY number of ",ddd" groups, then an optional decimal part.
    # The comma group must be repeatable ("*", not "?"): otherwise a value
    # such as "1,234,567" is cut off after the first group and read as 1234.
    pattern = r"\d+(?:,\d+)*(?:\.\d+)?"
    num_match = re.search(pattern, str(cell_val))

    if num_match is None:
        return None
    return float(num_match.group().replace(',', ''))
# end def

def addr_to_coords(addr, geolocator, max_retries=5, retry_delay=2):
    """
    Geocode an address string into a (latitude, longitude) pair.

    NOTE(review): as before, the `geolocator` argument is replaced by a client
    for the Nominatim instance at localhost:8080, so the object passed by the
    caller is not used. Confirm whether that is intended before changing it.

    Args:
        addr: Address text handed to the geocoder.
        geolocator: Accepted for interface compatibility; overridden (see note).
        max_retries: Number of additional attempts after a timeout or
            "unavailable" error before giving up.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        tuple | None: (latitude, longitude) when the address is found; None
        when the geocoder returns no result or every attempt failed.
    """
    # Build the client once per call instead of once per retry.
    geolocator = Nominatim(user_agent="my_geocoder", scheme='http', domain='localhost:8080', timeout=10)

    # Bounded loop instead of self-recursion: an unreachable server can no
    # longer recurse until RecursionError.
    for attempt in range(max_retries + 1):
        try:
            location = geolocator.geocode(addr)
        except (GeocoderTimedOut, GeocoderUnavailable) as e:
            if attempt == max_retries:
                print(f"Error: {e}. Giving up on {addr!r}.")
                return None
            print(f"Error: {e}. Retrying...")
            time.sleep(retry_delay)  # pause before the next attempt
        else:
            if location:
                return (location.latitude, location.longitude)
            return None
    return None
# end def

In [None]:
# Start from the raw data minus the identifier columns; `drop` returns a new
# frame, so `df_orig` itself is left untouched.
df = df_orig.drop(columns=['zipcode', 'mls-id', 'MlsId'])

# Normalise the address parts to stripped strings, then join them into the
# single address string used for geocoding.
df = df.assign(
    street=df['street'].astype(str).str.strip(),
    state=df['state'].astype(str).str.strip(),
)
df['full_addr'] = df['street'] + ', ' + df['state']

In [None]:
# Enable `progress_apply` on pandas objects, with a labelled tqdm progress bar.
tqdm.pandas(desc="Processing rows")

# Work on a copy so the geocoding columns are added to `df100`, not to `df`.
df100 = df.copy()

# Geocode every address; each cell becomes a (lat, lon) tuple or None.
df100['coords'] = df100['full_addr'].progress_apply(lambda x: addr_to_coords(x, geolocator))
# Split the tuple into two columns; rows without coordinates get None in both.
df100[['latitude', 'longitude']] = df100['coords'].apply(lambda x: pd.Series(x) if x is not None else pd.Series([None, None]))

# Append (mode='a+') without a header row, so re-running this cell adds the
# same rows to the file again.
# NOTE(review): presumably the file already exists with a header row from an
# earlier run, since the next cell reads it with named columns - confirm.
df100.to_csv('./data/data_with_coords50000.csv', mode='a+', header=False)

In [None]:
# Reload the geocoded data from disk; this rebinds `df`, replacing the
# in-memory frame built in the earlier cells.
df = pd.read_csv('data/data_with_coords50000.csv')

Удалим строки без координат и колонки, участвовавшие в геокодировании

In [None]:
# Discard rows that have no 'coords' value.
df = df[df['coords'].notna()]

# Address parts and the raw coordinate tuple are no longer needed.
labels = ['street', 'city', 'full_addr', 'coords']

# Drop them and shorten the coordinate column names in one pass.
df = (
    df.drop(columns=labels)
      .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
)

In [117]:
# Count the NaN entries in every column and print them under a heading.
missing_values_per_column = df.isnull().sum()
print("Количество значений NaN по каждой колонке:", missing_values_per_column, sep="\n")


Количество значений NaN по каждой колонке:
Unnamed: 0           0
status           24558
private pool         0
propertyType     20528
baths            70549
homeFacts            0
fireplace       176391
schools              0
sqft             26753
beds             57994
state                0
stories          96840
PrivatePool          0
target               0
lat                  0
lon                  0
dtype: int64


Строки с пропущенными (NaN) значениями в target можно смело удалить: от записей с неизвестной ценой толку не будет.

In [None]:
# Keep only rows that have a price.
df = df.dropna(subset=['target'])

# Cast the price to text and parse it into a float with `convert`
# (cells without any number become None).
df['target'] = df['target'].astype(str).map(convert)

Обработаем столбец 'private pool'

In [None]:
# Encode the pool flag as 0/1: 'Yes' -> 1, missing -> 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df['private pool']`: under pandas
# Copy-on-Write an in-place call on a column selection no longer updates `df`.
df['private pool'] = (
    df['private pool']
    .mask(df['private pool'] == 'Yes', 1)
    .fillna(0)
    .astype(int)
)

Обработаем столбец 'PrivatePool'

In [115]:
# Encode the pool flag as 0/1: 'Yes' or 'yes' -> 1, missing -> 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df['PrivatePool']`: under pandas
# Copy-on-Write an in-place call on a column selection no longer updates `df`.
df['PrivatePool'] = (
    df['PrivatePool']
    .mask(df['PrivatePool'].isin(['Yes', 'yes']), 1)
    .fillna(0)
    .astype(int)
)

In [None]:
# Dump the frequency of each distinct 'baths' value to temp.csv for inspection.
# The path is passed directly so pandas opens the file with the newline
# handling it expects; a handle from plain open(..., 'w') (no newline='')
# can produce doubled line endings on Windows.
df['baths'].value_counts().to_csv('temp.csv')


In [None]:
# Show rows whose 'beds' cell holds the literal text 'Baths'.
df.loc[df['beds'].eq('Baths')]

In [None]:
# Same filter as the previous cell: rows whose 'beds' value is the string 'Baths'.
# NOTE(review): this cell duplicates the one above and could be removed.
df[df['beds']=='Baths']

In [None]:
# Compare the processed frame's dimensions with those of the raw data.
print(df.shape, df_orig.shape, sep='\n')