# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Load the source spreadsheet (hard-coded local path) and work on a copy so
# the raw frame stays untouched.
file_path = r'C:\Users\Oleg\Documents\bank\table1.xlsx'
data = pd.read_excel(file_path)

df = data.copy()

# Normalize column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Drop the identifier column so it is not used as a feature.
df = df.drop(columns=['student_id'])

# Replace every missing value with 0.
df = df.fillna(0)

# 60% train / 20% validation / 20% test, reproducible via random_state.
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=1)
validation_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=1)

# The target must not be part of the feature matrices: leaving it in leaks
# the answer into the features and gives X_test a column set that does not
# match the matrices the models are trained on.
target = 'jamb_score'
features = [col for col in df.columns if col != target]

vectorizer = DictVectorizer(sparse=True)
X_train = vectorizer.fit_transform(train_df[features].to_dict(orient='records'))
X_validation = vectorizer.transform(validation_df[features].to_dict(orient='records'))
X_test = vectorizer.transform(test_df[features].to_dict(orient='records'))

print(f"Train: {X_train.shape}, Validation: {X_validation.shape}, Test: {X_test.shape}")
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.feature_extraction import DictVectorizer

# Fit a depth-1 regression tree and print its single split rule.
target = 'jamb_score'
features = [col for col in df.columns if col != target]

# Dense output: the feature matrix is rebuilt here without the target column.
vectorizer = DictVectorizer(sparse=False)
X_train = vectorizer.fit_transform(train_df[features].to_dict(orient='records'))
y_train = train_df[target]

tree_model = DecisionTreeRegressor(max_depth=1, random_state=1)
tree_model.fit(X_train, y_train)

# Pass a plain list of names: a list is accepted by every scikit-learn
# version, whereas a NumPy array may be rejected by older releases
# (NOTE(review): confirm against the installed version).
tree_rules = export_text(
    tree_model,
    feature_names=list(vectorizer.get_feature_names_out()),
)

print("Правило разбиения в дереве решений (глубина=1):")
print(tree_rules)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline random forest: 10 trees, scored by RMSE on the validation split.
X_validation = vectorizer.transform(
    validation_df[features].to_dict(orient='records')
)
y_validation = validation_df[target]

baseline_settings = {'n_estimators': 10, 'random_state': 1, 'n_jobs': -1}
random_forest = RandomForestRegressor(**baseline_settings)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_validation)

# RMSE is the square root of the mean squared error.
validation_mse = mean_squared_error(y_validation, y_pred)
rmse = np.sqrt(validation_mse)

print(f"RMSE на валидационных данных: {rmse:.2f}")

# Sweep the forest size from 10 to 200 trees (step 10) and record the
# validation RMSE for each size.
n_estimators_values = list(range(10, 201, 10))

rmse_values = {}

for n_estimators in n_estimators_values:
    random_forest = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=1,
        n_jobs=-1
    )
    random_forest.fit(X_train, y_train)

    y_pred = random_forest.predict(X_validation)
    rmse = np.sqrt(mean_squared_error(y_validation, y_pred))

    rmse_values[n_estimators] = rmse

for n_estimators, rmse in rmse_values.items():
    print(f"n_estimators = {n_estimators}, RMSE = {rmse:.3f}")

# Compare at the same 3-decimal precision that is printed above, so that
# floating-point noise below that precision cannot pick the winner.
# min() returns the first minimal key, and the dict preserves insertion
# order, so the smallest n_estimators reaching the best rounded RMSE wins.
best_n_estimators = min(rmse_values, key=lambda n: round(rmse_values[n], 3))
print(f"Значение n_estimators после которого RMSE перестает улучшаться: {best_n_estimators}")

# Grid over tree depth and forest size; for every depth, average the
# validation RMSE across all forest sizes and report the depth with the
# lowest average.
max_depth_values = [10, 15, 20, 25]
n_estimators_values = list(range(10, 201, 10))

mean_rmse_values = {}

for max_depth in max_depth_values:
    # Validation RMSE for each forest size at the current depth.
    rmse_list = []
    for n_estimators in n_estimators_values:
        forest_settings = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'random_state': 1,
            'n_jobs': -1,
        }
        random_forest = RandomForestRegressor(**forest_settings)
        random_forest.fit(X_train, y_train)

        y_pred = random_forest.predict(X_validation)

        validation_mse = mean_squared_error(y_validation, y_pred)
        rmse = np.sqrt(validation_mse)
        rmse_list.append(rmse)

    mean_rmse = np.mean(rmse_list)
    mean_rmse_values[max_depth] = mean_rmse

for depth, average_rmse in mean_rmse_values.items():
    print(f"max_depth = {depth}, Среднее RMSE = {average_rmse:.3f}")

# Depth with the smallest average RMSE (first one wins on ties).
best_max_depth, _ = min(mean_rmse_values.items(), key=lambda item: item[1])
print(f"Лучшее значение max_depth: {best_max_depth}")
from sklearn.ensemble import RandomForestRegressor

# Train a 10-tree forest capped at depth 20 and report which feature the
# model ranks as most important.
importance_forest_settings = {
    'n_estimators': 10,
    'max_depth': 20,
    'random_state': 1,
    'n_jobs': -1,
}
random_forest = RandomForestRegressor(**importance_forest_settings)
random_forest.fit(X_train, y_train)

feature_names = vectorizer.get_feature_names_out()
feature_importances = random_forest.feature_importances_

# Map each vectorized feature name to its importance score.
feature_importance_dict = {
    name: score for name, score in zip(feature_names, feature_importances)
}

# Highest importance first.
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

top_feature, top_score = sorted_feature_importance[0]
print(f"Самый важный признак: {top_feature} с важностью {top_score:.3f}")
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Compare two XGBoost learning rates (eta) by validation RMSE.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)

# Both sets are evaluated every round; early stopping is enabled below.
watchlist = [(dtrain, 'train'), (dvalidation, 'eval')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model_0_3 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Predict with the trees up to the best early-stopping round only. The range
# is passed explicitly so the score does not depend on whether the installed
# XGBoost version's native predict() defaults to the best iteration or to
# every trained tree.
y_pred_0_3 = model_0_3.predict(
    dvalidation,
    iteration_range=(0, model_0_3.best_iteration + 1),
)
rmse_0_3 = np.sqrt(mean_squared_error(y_validation, y_pred_0_3))

# Same configuration with a smaller learning rate.
xgb_params['eta'] = 0.1
model_0_1 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

y_pred_0_1 = model_0_1.predict(
    dvalidation,
    iteration_range=(0, model_0_1.best_iteration + 1),
)
rmse_0_1 = np.sqrt(mean_squared_error(y_validation, y_pred_0_1))

print(f"RMSE с eta=0.3: {rmse_0_3:.3f}")
print(f"RMSE с eta=0.1: {rmse_0_1:.3f}")

if rmse_0_3 < rmse_0_1:
    print("Лучшее значение eta: 0.3")
elif rmse_0_1 < rmse_0_3:
    print("Лучшее значение eta: 0.1")
else:
    print("Обе модели дают одинаковое значение RMSE")
