In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

df = pd.read_csv('jamb_exam_results.csv')

df.columns = df.columns.str.lower().str.replace(' ', '_')

df = df.drop(columns=['student_id'])

df = df.fillna(0)

X = df.drop(columns=['jamb_score'])
y = df['jamb_score']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

dv = DictVectorizer(sparse=True)

X_train_dict = X_train.to_dict(orient='records')
X_val_dict = X_val.to_dict(orient='records')
X_test_dict = X_test.to_dict(orient='records')

X_train_encoded = dv.fit_transform(X_train_dict)
X_val_encoded = dv.transform(X_val_dict)
X_test_encoded = dv.transform(X_test_dict)


Вопрос 1

In [2]:
from sklearn.tree import DecisionTreeRegressor

model_tree = DecisionTreeRegressor(max_depth=1, random_state=1)
model_tree.fit(X_train_encoded, y_train)

best_split_feature = model_tree.feature_importances_
best_split_idx = best_split_feature.argmax()
best_feature_name = dv.get_feature_names_out()[best_split_idx]

print(f"Признак для разбиения: {best_feature_name}")

Признак для разбиения: study_hours_per_week


Вопрос 2

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

model_rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model_rf.fit(X_train_encoded, y_train)

y_val_pred_rf = model_rf.predict(X_val_encoded)

rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
print(f"RMSE на валидационном наборе: {rmse_rf:.2f}")

RMSE на валидационном наборе: 43.16


Вопрос 3

In [4]:
n_estimators_values = range(10, 201, 10)
rmse_values = []

for n_estimators in n_estimators_values:
    model_rf = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    model_rf.fit(X_train_encoded, y_train)

    y_val_pred_rf = model_rf.predict(X_val_encoded)
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
    rmse_values.append(rmse)

optimal_n_estimators = n_estimators_values[np.argmin(rmse_values)]
print(f"Лучшее значение n_estimators: {optimal_n_estimators} с RMSE: {min(rmse_values):.3f}")

Лучшее значение n_estimators: 180 с RMSE: 40.136


Вопрос 4

In [5]:
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

best_rmse = float('inf')
best_max_depth = None
best_n_estimators = None

for max_depth in max_depth_values:
    for n_estimators in n_estimators_values:
        model_rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        model_rf.fit(X_train_encoded, y_train)

        y_val_pred_rf = model_rf.predict(X_val_encoded)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))

        if rmse < best_rmse:
            best_rmse = rmse
            best_max_depth = max_depth
            best_n_estimators = n_estimators

print(f"Лучшее значение max_depth: {best_max_depth}, n_estimators: {best_n_estimators} с RMSE: {best_rmse:.3f}")


Лучшее значение max_depth: 10, n_estimators: 180 с RMSE: 39.823


Вопрос 5

In [6]:
model_rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
model_rf.fit(X_train_encoded, y_train)

feature_importances = model_rf.feature_importances_
most_important_feature_idx = feature_importances.argmax()
most_important_feature_name = dv.get_feature_names_out()[most_important_feature_idx]

print(f"Самый важный признак: {most_important_feature_name}")

Самый важный признак: study_hours_per_week


Вопрос 6

In [7]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

dtrain = xgb.DMatrix(X_train_encoded, label=y_train)
dval = xgb.DMatrix(X_val_encoded, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'eval')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

y_val_pred_xgb = model_xgb.predict(dval)
rmse_xgb_0_3 = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
print(f"RMSE для eta=0.3: {rmse_xgb_0_3:.3f}")

xgb_params['eta'] = 0.1
model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

y_val_pred_xgb = model_xgb.predict(dval)
rmse_xgb_0_1 = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
print(f"RMSE для eta=0.1: {rmse_xgb_0_1:.3f}")

if rmse_xgb_0_3 < rmse_xgb_0_1:
    print("Лучшее значение eta: 0.3")
else:
    print("Лучшее значение eta: 0.1")

[0]	train-rmse:42.84835	eval-rmse:44.52338
[1]	train-rmse:39.96423	eval-rmse:42.83406
[2]	train-rmse:37.91231	eval-rmse:41.62607
[3]	train-rmse:36.51126	eval-rmse:41.25491
[4]	train-rmse:35.52212	eval-rmse:40.84075
[5]	train-rmse:34.77126	eval-rmse:40.71677
[6]	train-rmse:34.03898	eval-rmse:40.72669
[7]	train-rmse:33.62820	eval-rmse:40.68822
[8]	train-rmse:32.94729	eval-rmse:40.81273
[9]	train-rmse:32.27703	eval-rmse:40.84939
[10]	train-rmse:31.73818	eval-rmse:40.83759
[11]	train-rmse:31.31360	eval-rmse:40.80575
[12]	train-rmse:30.72949	eval-rmse:40.84238
[13]	train-rmse:30.11486	eval-rmse:40.96020
[14]	train-rmse:29.43538	eval-rmse:40.98775
[15]	train-rmse:29.23018	eval-rmse:41.04798
[16]	train-rmse:28.64113	eval-rmse:41.08375
RMSE для eta=0.3: 41.160
[0]	train-rmse:45.64414	eval-rmse:46.63724
[1]	train-rmse:44.26862	eval-rmse:45.58724
[2]	train-rmse:43.08569	eval-rmse:44.76209
[3]	train-rmse:42.05227	eval-rmse:44.02498
[4]	train-rmse:41.10533	eval-rmse:43.40640
[5]	train-rmse:40.2830