In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import sqlite3

# Step 1: Load the data
# The CSV is read as WINDOWS-1251, with ';' as the field separator and ',' as
# the decimal mark; lines that cannot be parsed are skipped.
credit_df = pd.read_csv(
    'credit_train.csv',
    sep=';',
    decimal=',',
    encoding='WINDOWS-1251',
    on_bad_lines='skip',
)
print(credit_df.head())
print("\nКоличество строк и столбцов:")
print(credit_df.shape)

In [None]:
# Step 2: Split the sample into training and test parts
# 10% of the rows go to the test part; the fixed seed keeps the split reproducible.
credit_df_train, credit_df_test = train_test_split(
    credit_df,
    test_size=0.1,
    random_state=0,
)


In [None]:
# Step 3: Open an in-memory SQLite database and load the training split into it
conn = sqlite3.connect(':memory:')

# Move client_id into the index only while it is still a regular column.
# The previous in-place set_index raised KeyError when this cell was re-run,
# because the column had already been consumed by the first run.
if 'client_id' in credit_df_train.columns:
    credit_df_train = credit_df_train.set_index('client_id')
credit_df_train.to_sql('credit_train', conn, index=True, index_label='client_id', if_exists='replace')
# Remaining (non-index) columns; later cells iterate over this to build SQL.
columns = credit_df_train.columns


In [None]:
# Step 4: Analyse missing values
# COUNT(*) counts every row while COUNT(<column>) skips NULLs, so their
# difference is the number of NULLs in that column.
null_counts_list = []
for column in columns:
    query = f"SELECT COUNT(*) - COUNT({column}) AS null_count FROM credit_train"
    result = pd.read_sql_query(query, conn)
    null_counts_list.append(
        dict(column_name=column, null_count=result.iloc[0]['null_count'])
    )

null_counts = pd.DataFrame(null_counts_list)
print(null_counts)

In [None]:
# Step 5: Delete rows that contain missing values
# A row is kept only when every column in `columns` is non-NULL; the DELETE
# removes the complement of that condition.
conditions = " AND ".join(f"{col} IS NOT NULL" for col in columns)
delete_query = f"DELETE FROM credit_train WHERE NOT ({conditions})"
conn.execute(delete_query)
conn.commit()

# Read the surviving rows back to report the new table size.
cleaned_data = pd.read_sql_query("SELECT * FROM credit_train", conn)
print(cleaned_data.shape)


In [None]:
# Step 6: Encode categorical variables
# A single UPDATE rewrites four columns in place. For each column two specific
# codes are mapped to 1 and 0, and every other value is mapped to -1.
# NOTE(review): this cell is presumably not safe to re-run - after the first
# run the columns no longer hold the original codes, so a second run would
# likely send the already-encoded values to the ELSE branch (-1). Confirm.
# NOTE(review): if these columns were created with TEXT affinity, SQLite may
# store the 1/0/-1 results as text rather than integers - TODO confirm.
convert_to_numerical = """
UPDATE credit_train
SET gender = CASE gender
    WHEN 'M' THEN 1
    WHEN 'F' THEN 0
    ELSE -1
END,
marital_status = CASE marital_status
    WHEN 'MAR' THEN 1
    WHEN 'UNM' THEN 0
    ELSE -1
END,
job_position = CASE job_position
    WHEN 'SPC' THEN 1
    WHEN 'UMN' THEN 0
    ELSE -1
END,
education = CASE education
    WHEN 'GRD' THEN 1
    WHEN 'SCH' THEN 0
    ELSE -1
END;
"""
conn.execute(convert_to_numerical)
conn.commit()
# Preview the first five rows of the four encoded columns.
converted_data = pd.read_sql_query("SELECT gender, marital_status, job_position, education FROM credit_train LIMIT 5", conn)
print(converted_data)


In [None]:
# Step 7: Create and populate the new is_moscow_or_piter column
# SQLite's built-in LIKE (and upper()) fold case only for ASCII characters, so
# a mixed-case Cyrillic value such as 'Москва' would not match '%МОСК%'.
# Register a Python-backed upper-case function and compare against that.
conn.create_function(
    'unicode_upper',
    1,
    lambda value: value.upper() if isinstance(value, str) else value,
)

# Inspect the current schema so the cell can be re-run without failing on
# "duplicate column name" (ADD COLUMN) or "no such column" (DROP COLUMN).
# PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
existing_columns = {row[1] for row in conn.execute("PRAGMA table_info(credit_train)")}

if 'is_moscow_or_piter' not in existing_columns:
    conn.execute("ALTER TABLE credit_train ADD COLUMN is_moscow_or_piter INTEGER;")

if 'living_region' in existing_columns:
    # Flag is 1 when the region name contains 'МОСК' or 'ПЕТЕР' in any letter
    # case, otherwise 0 (NULL regions also fall into the ELSE branch).
    conn.execute("""
    UPDATE credit_train
    SET is_moscow_or_piter = CASE
    WHEN unicode_upper(living_region) LIKE '%МОСК%' OR unicode_upper(living_region) LIKE '%ПЕТЕР%' THEN 1
    ELSE 0
    END;
    """)
    # The raw region text is no longer needed once the flag is derived.
    conn.execute("ALTER TABLE credit_train DROP COLUMN living_region;")
conn.commit()
converted_data = pd.read_sql_query("SELECT * FROM credit_train LIMIT 5", conn)
print(converted_data)


In [None]:
# Step 8: Pre-process the columns before normalization
# Columns that are cleaned and then min-max scaled below.
columns_to_normalize = [
    'age',
    'credit_sum',
    'credit_month',
    'tariff_id',
    'monthly_income',
    'credit_count',
]

# Helper that converts numeric-looking text to REAL and replaces invalid text with NULL
def preprocess_column(conn, table, column):
    """Clean one column in place so that it holds only numbers or NULL.

    For every row of ``table``:

    * text that looks numeric (after trimming spaces it contains at least one
      digit and only digits, ``.``, ``+`` or ``-``) is cast to REAL;
    * any other text is replaced with NULL;
    * non-text values (integers, reals, NULL) are left untouched.

    The previous version had the first two branches swapped: numeric text was
    nulled out and non-numeric text was cast to REAL, which SQLite turns
    into 0.0.

    Args:
        conn: open ``sqlite3`` connection.
        table: table name; interpolated into the SQL as-is, so it must be a
            trusted identifier.
        column: column name; interpolated into the SQL as-is, same caveat.

    Returns:
        None. The change is committed before returning.

    Note:
        The numeric check is deliberately simple; exponent notation such as
        ``1e5`` is treated as invalid. If the column was declared with TEXT
        affinity, SQLite converts the stored REAL back to text on write.
    """
    conn.execute(f"""
    UPDATE {table}
    SET {column} = CASE
        WHEN typeof({column}) = 'text'
             AND TRIM({column}) GLOB '*[0-9]*'
             AND TRIM({column}) NOT GLOB '*[^0-9.+-]*'
            THEN CAST(TRIM({column}) AS REAL)
        WHEN typeof({column}) = 'text' THEN NULL
        ELSE {column}
    END;
    """)
    conn.commit()

# Clean every column that is about to be normalized.
for column in columns_to_normalize:
    preprocess_column(conn, table='credit_train', column=column)

# Column normalization
def normalize_column(conn, table, column):
    """Min-max scale one column of ``table`` in place to the range [0, 1].

    Each non-NULL value ``v`` becomes ``(v - min) / (max - min)``, where min
    and max are taken over the non-NULL values of the column. NULLs stay NULL.
    Nothing is changed when the column has no non-NULL values or when all
    values are equal (the range would be zero).

    The subtraction is done on ``CAST(column AS REAL)``: with integer operands
    SQLite's ``/`` performs integer division, which previously collapsed
    integer columns to 0 (and 1 for the maximum) instead of fractions.
    The bounds are passed as bound parameters rather than formatted into the
    SQL text.

    Args:
        conn: open ``sqlite3`` connection.
        table: table name; interpolated into the SQL as-is, so it must be a
            trusted identifier.
        column: column name; interpolated into the SQL as-is, same caveat.

    Returns:
        None. The update is committed when it is performed.
    """
    # MIN/MAX ignore NULLs, so a single query yields both bounds.
    min_value, max_value = conn.execute(
        f"SELECT MIN({column}), MAX({column}) FROM {table}"
    ).fetchone()
    if min_value is None or max_value is None or min_value == max_value:
        return

    low = float(min_value)
    span = float(max_value) - low
    if span == 0:
        # Guard against a zero denominator after float conversion.
        return

    conn.execute(
        f"UPDATE {table} SET {column} = (CAST({column} AS REAL) - ?) / ?",
        (low, span),
    )
    conn.commit()

# Scale every selected column, then preview the first rows of the table.
for column in columns_to_normalize:
    normalize_column(conn, table='credit_train', column=column)

normalized_data = pd.read_sql_query("SELECT * FROM credit_train LIMIT 5", conn)
print(normalized_data)

In [None]:
# Step 9: Remove outliers
# Rows whose credit_sum lies more than 1.5 * IQR below the first quartile or
# above the third quartile are deleted.
credit_sum_df = pd.read_sql_query("SELECT credit_sum FROM credit_train", conn)
Q1 = credit_sum_df['credit_sum'].quantile(0.25)
Q3 = credit_sum_df['credit_sum'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Bounds are bound as parameters instead of being formatted into the SQL text.
delete_outliers_query = """
DELETE FROM credit_train
WHERE credit_sum < ? OR credit_sum > ?
"""
# With an empty or all-NULL column the quartiles are NaN; formatting NaN into
# the statement used to yield invalid SQL, so the delete is skipped instead.
if pd.notna(IQR):
    conn.execute(delete_outliers_query, (float(lower_bound), float(upper_bound)))
    conn.commit()

# Check the result
cleaned_data = pd.read_sql_query("SELECT * FROM credit_train LIMIT 5", conn)
print(cleaned_data)