In [None]:
import json
import os
import re
import time
from fractions import Fraction

import joblib
import numpy as np
import pandas as pd
import wandb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
def contains_digits(s: str):
    """Check whether a string contains at least one digit character.

    Args:
        s (str): string to inspect

    Returns:
        bool: True if any character satisfies ``str.isdigit``, otherwise False
    """
    for symbol in s:
        if symbol.isdigit():
            return True
    return False

def code_rare_categories(df, column: str, threshold: float, new_val):
    """Collapse rare categories of a column into one replacement value.

    A category is rare when its share of all rows (in percent) is strictly
    below ``threshold``. The result is written to a new column named
    ``<column>_cleaned``; the source column is left untouched.

    Keyword arguments:
    df -- dataframe; it is modified in place (the new column is added to it)
    column -- name of the column to clean
    threshold -- percentage share below which a category counts as rare
    new_val -- value stored instead of every rare category
    Return: the same df object with the extra column
    """
    # Share of each category relative to the total number of rows.
    shares = df[column].value_counts() / len(df) * 100
    rare_categories = shares[shares < threshold].index

    def _recode(value):
        return new_val if value in rare_categories else value

    df[column + '_cleaned'] = df[column].apply(_recode)
    return df

# end def

def replace_based_on_pattern(value, pattern, new_value):
    """Return ``new_value`` when ``pattern`` is found anywhere in ``value``.

    If the regular expression does not match, ``value`` is returned unchanged.
    """
    return new_value if re.search(pattern, value) else value


def convert_target(cell_val):
    """Convert a raw target cell into a float.

    Every character other than the ASCII digits and '.' is removed (so
    currency signs, thousands separators, suffixes and minus signs are all
    dropped) and the remainder is parsed as a float.

    Args:
        cell_val: raw cell value; expected to be a string.

    Returns:
        float | None: the parsed number, or None when the cell is not a
        string (e.g. None or NaN), contains no digits, or the cleaned text
        is not a valid number (for example "1.2.3").
    """
    # Non-string cells would make re.sub raise TypeError; treat them as
    # "no value" instead.
    if not isinstance(cell_val, str):
        return None

    num_str = re.sub(r'[^0-9.]', '', cell_val)
    if not num_str:
        return None

    try:
        return float(num_str)
    except ValueError:
        # e.g. "." or "1.2.3" left after cleaning
        return None

def addr_to_coords(addr, geolocator=None, max_retries=5, retry_delay=2):
    """Geocode an address into a (latitude, longitude) tuple.

    Args:
        addr: address passed to ``geolocator.geocode``.
        geolocator: object exposing ``geocode(addr)``. When None, a Nominatim
            client pointing at the local server (http://localhost:8080) is
            created.
        max_retries: how many times to retry after a timeout / unavailable
            error before giving up.
        retry_delay: pause in seconds between retries.

    Returns:
        tuple | None: ``(latitude, longitude)`` when the address was found,
        None when the geocoder returned nothing or every retry failed.

    NOTE(review): Nominatim, GeocoderTimedOut and GeocoderUnavailable are not
    imported in the visible part of this file; they presumably come from
    geopy (geopy.geocoders / geopy.exc) - confirm the import exists.
    """
    # Use the geolocator supplied by the caller; previously the argument was
    # overwritten with a fresh client on every call.
    if geolocator is None:
        geolocator = Nominatim(user_agent="my_geocoder", scheme='http', domain='localhost:8080', timeout=10)

    # A bounded loop replaces the unbounded recursion, which could never
    # terminate normally while the geocoding server stayed down.
    for attempt in range(max_retries + 1):
        try:
            location = geolocator.geocode(addr)
        except (GeocoderTimedOut, GeocoderUnavailable) as e:
            if attempt == max_retries:
                print(f"Error: {e}. Giving up after {max_retries} retries.")
                return None
            print(f"Error: {e}. Retrying...")
            time.sleep(retry_delay)  # back off before the next attempt
        else:
            if location:
                return (location.latitude, location.longitude)
            return None
    return None

def convert_bath(cell_val):
    """Extract the first number found in a bathrooms cell as a float.

    The number may contain one ',' or '.' after its leading digits; a comma
    is removed before conversion, so "1,000" becomes 1000.0 while "2.5"
    stays 2.5.

    Returns:
        float | None: the parsed number, or None when the text has no digits.
    """
    found = re.search(r"\d+[,.]?\d*", cell_val)
    if found is None:
        return None
    return float(found.group().replace(',', ''))

def value_percentage(df, column):
    """
    Purpose: share of each value of a column as a percentage of all rows.

    Missing values are counted as their own category (``dropna=False``).

    Args:
        df: source dataframe.
        column: name of the column to summarise.

    Returns:
        DataFrame with three columns: ``column`` (the distinct values),
        'count' (number of rows with that value) and 'percent'
        (count / total rows * 100).
    """
    counts = df[column].value_counts(dropna=False)
    # Name the counts and the index explicitly: only pandas >= 2.0 labels the
    # result of value_counts 'count', so relying on that name breaks on
    # older versions.
    name_df = counts.rename('count').rename_axis(column).reset_index()
    name_df['percent'] = name_df['count'] / df.shape[0] * 100
    return name_df

# end def

# Row-wise helper that merges the two pool columns into a single flag
def combine_columns(row):
    """Return 0 when 'private_pool' + 'privatepool' is below 1, otherwise 1."""
    pool_total = row['private_pool'] + row['privatepool']
    if pool_total < 1:
        return 0
    return 1
    
def missing_values_per_column(df):
    """
    Purpose: print the number of missing (NaN) values in every column.

    A header line is printed first, followed by the per-column counts.
    Nothing is returned.
    """
    nan_counts = df.isna().sum()
    print("Количество значений NaN по каждой колонке:", nan_counts, sep="\n")

# end def

# Safe deserialization of JSON strings
def safe_json_loads(json_str):
    """Parse a JSON string, returning None instead of raising on bad input.

    None is returned when the text is not valid JSON or when the argument is
    of a type ``json.loads`` does not accept.
    """
    try:
        parsed = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        parsed = None
    return parsed
    

# Extract a single value from the 'atAGlanceFacts' list
def extract_home_facts(home_facts, label):
    """Return the 'factValue' of the first fact whose 'factLabel' equals label.

    None is returned when ``home_facts`` is empty, has no 'atAGlanceFacts'
    key, or contains no fact with the requested label.
    """
    if home_facts and 'atAGlanceFacts' in home_facts:
        matching_values = (
            fact.get('factValue')
            for fact in home_facts['atAGlanceFacts']
            if fact.get('factLabel') == label
        )
        return next(matching_values, None)
    return None



def quote_replace(cell_val):
    """
    Purpose: replace a double quote that sits directly between two ASCII
    letters with a single quote (e.g. O"Neil -> O'Neil).

    Double quotes in any other position are left as they are.
    """
    pattern = r'([a-zA-Z])(\")([a-zA-Z])'
    # Keep the surrounding letters (groups 1 and 3), swap only the quote.
    return re.sub(pattern, r"\1'\3", cell_val)

def word_in_quote_replace(cell_val):
    """
    Purpose: turn a single word wrapped in double quotes into the same word
    wrapped in single quotes (e.g. "best" -> 'best').

    Only runs of word characters are affected; quoted text containing spaces
    or punctuation is left unchanged.
    """
    pattern = r'(\")(\w+)(\")'
    # Group 2 is the quoted word; both surrounding quotes are replaced.
    return re.sub(pattern, r"'\2'", cell_val)

def get_school_rating(cell_val):
    """
    Purpose: average the school ratings stored under the 'rating' key.

    Each rating is parsed with ``Fraction`` (so both "7" and "4/10" are
    accepted) and converted to float; a rating that cannot be parsed
    contributes 0 to the average.

    Returns the mean rounded to 2 decimals, or None when ``cell_val`` is
    empty or has no ratings.
    """
    if not cell_val:
        return None

    rating = cell_val.get('rating')
    if not rating:
        return None

    scores = []
    for raw in rating:
        try:
            score = float(Fraction(raw))
        except Exception:
            # unparseable rating is counted as zero
            score = 0
        scores.append(score)
    return round(np.mean(scores), 2)

def get_schools_qty(cell_val):
    """
    Purpose: count the entries stored under the 'rating' key.

    Returns the length of that list, or None when ``cell_val`` is empty or
    the list is missing or empty.
    """
    rating = cell_val.get('rating') if cell_val else None
    return len(rating) if rating else None

def get_avg_school_dist(cell_val):
    """
    Purpose: average the distances listed under cell_val['data']['Distance'].

    The first number found in each distance string is used. Whole numbers
    ("2 mi") and decimals with '.' or ',' ("1.5 mi", "1,5 mi") are accepted;
    a comma is read as a decimal separator. A string without any number
    contributes 0 to the average.

    Returns the mean rounded to 2 decimals, or None when ``cell_val``, its
    'data' entry or the 'Distance' list is missing or empty.
    """
    if not cell_val:
        return None

    data = cell_val.get('data')
    if not data:
        return None

    distances = data.get('Distance')
    if not distances:
        # A missing list used to raise TypeError; an empty one produced NaN.
        return None

    # Optional sign, then either digits with an optional fractional part or
    # a bare fractional part such as ".5".
    number_re = re.compile(r"[-+]?(?:\d+(?:[.,]\d+)?|[.,]\d+)")

    dist_list = []
    for item in distances:
        found = number_re.search(item)
        if found:
            # float() cannot parse "1,5", so normalise the separator first.
            dist_list.append(float(found.group().replace(',', '.')))
        else:
            dist_list.append(0.0)
    return round(np.mean(dist_list), 2)



In [None]:
# Load the prepared dataset and drop the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call - verify).
df = pd.read_csv('./data/df_for_ML_new.csv').drop(columns='Unnamed: 0')

In [None]:
# Replace every missing value in the frame with the sentinel -1, then store
# 'year_built' as integers.
df = df.fillna(-1)
df = df.astype({'year_built': int})

In [None]:
# Separate the 'target' column (y) from the remaining feature columns (X).
y = df['target']
X = df.drop(columns='target')

In [None]:
# Read the W&B API key from a local JSON file and expose it to wandb.
with open('/home/dwarf/diplom/wandb.json', 'r') as f:
    data = json.load(f)
os.environ['WANDB_API_KEY'] = data["key"]

wandb.login()
run = wandb.init(entity=None, project="diplom", name='knn_run')

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# k-NN regressor with k = 3
model = KNeighborsRegressor(n_neighbors=3)

# Fit on the training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Quality metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log all metrics in one call so they share a single W&B step
# (three separate wandb.log calls would spread them over three steps).
wandb.log({"MAE": mae, "MSE": mse, "R2": r2})

print("Средняя абсолютная ошибка (MAE):", mae)
print("Среднеквадратичная ошибка (MSE):", mse)
print("Коэффициент детерминации (R²):", r2)

# Save the model. The file used to be called "random_forest_model.pkl"
# (copy-paste from the random-forest cell) although it holds a k-NN model.
model_filename = "knn_model.pkl"
joblib.dump(model, model_filename)

# Upload the saved model file to W&B
wandb.save(model_filename)

# Close the W&B run
wandb.finish()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Read the W&B API key from a local JSON file and expose it to wandb.
with open('/home/dwarf/diplom/wandb.json', 'r') as f:
    data = json.load(f)
os.environ['WANDB_API_KEY'] = data["key"]

wandb.login()

n_estimators = [50, 100, 150, 200]
max_depths = [5, 10, 15, 20, 25, 30]

# Grid over forest size and tree depth: one W&B run per combination.
for estimators in n_estimators:
    for depth in max_depths:
        run = wandb.init(
            entity=None,
            project="diplom",
            name=f'rand_for_reg est={estimators} depth={depth}',
            # Record the hyper-parameters so runs can be compared in the UI.
            config={"n_estimators": estimators, "max_depth": depth},
        )

        # Build and fit the random-forest regressor
        model = RandomForestRegressor(n_estimators=estimators, max_depth=depth, random_state=42)
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Quality metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log all metrics in one call so they share a single W&B step
        wandb.log({"MAE": mae, "MSE": mse, "R2": r2})

        # Save the model (the same file is overwritten on every iteration)
        model_filename = "random_forest_model.pkl"
        joblib.dump(model, model_filename)

        # Upload the saved model file to W&B
        wandb.save(model_filename)

        # Close this run before the next wandb.init: finish() used to be
        # called only once, after both loops.
        wandb.finish()




In [None]:
# Read the W&B API key from a local JSON file and expose it to wandb.
with open('/home/dwarf/diplom/wandb.json', 'r') as f:
    data = json.load(f)
os.environ['WANDB_API_KEY'] = data["key"]

wandb.login()

run = wandb.init(entity=None, project="diplom", name='linear_regression')

# Split first, then fit the scaler on the training part only. Fitting it on
# the whole dataset (as before) lets test-set minima/maxima leak into the
# preprocessing that is saved together with the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Quality metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Log all metrics in one call so they share a single W&B step
wandb.log({"MAE": mae, "MSE": mse, "R2": r2})

# Save the model and the fitted scaler.
# NOTE(review): "regerssion" looks like a typo, but the name is kept because
# other code may load the file by this exact name - confirm before renaming.
model_filename = "linear_regerssion.pkl"
joblib.dump(model, model_filename)
scaler_filename = 'min_max_scaler.pkl'
joblib.dump(scaler, scaler_filename)

# Upload both files to W&B
wandb.save(model_filename)
wandb.save(scaler_filename)

# Close the W&B run
wandb.finish()

In [None]:
# Read the W&B API key from a local JSON file and expose it to wandb.
with open('/home/dwarf/diplom/wandb.json', 'r') as f:
    data = json.load(f)
os.environ['WANDB_API_KEY'] = data["key"]

wandb.login()

# Split into training and test sets. The seed makes the split reproducible
# and matches the other experiments in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

max_depths = [5, 10, 15, 20, 25, 30]

# One W&B run per tree depth.
for depth in max_depths:
    run = wandb.init(
        entity=None,
        project="diplom",
        name=f'Decision Tree depth={depth}',
        # Record the hyper-parameter so runs can be compared in the UI.
        config={"max_depth": depth},
    )

    # Build and fit the decision tree. max_depth was never passed before,
    # so every run trained the same unrestricted tree despite its name.
    model = DecisionTreeRegressor(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Quality metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log all metrics in one call so they share a single W&B step
    wandb.log({"MAE": mae, "MSE": mse, "R2": r2})

    # Save the model (the same file is overwritten on every iteration)
    model_filename = "decision_tree_model.pkl"
    joblib.dump(model, model_filename)

    # Upload the saved model file to W&B
    wandb.save(model_filename)

    # Close this run before the next wandb.init: finish() used to be called
    # only once, after the loop.
    wandb.finish()