In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
# Load the exam-results table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='jamb_exam_results.csv')
# Preview the first five rows (five is also the default for head()).
data.head(n=5)

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [13]:
# Normalise column names to lower snake_case so every later cell can rely on
# a single spelling. Repeating this on already-normalised names is a no-op.
data.columns = data.columns.str.lower().str.replace(' ', '_')

# Remove `student_id` from the frame (presumably a row identifier rather than
# a predictor - confirm against the dataset description).
# errors='ignore' keeps this cell re-runnable: `data` is overwritten here, so
# on a second execution the column is already gone and a plain drop() would
# raise KeyError.
data = data.drop(columns=['student_id'], errors='ignore')

# Replace every missing value with 0. This applies to all columns, including
# text ones such as the blank parent_education_level visible in the preview.
data = data.fillna(0)

# Separate the target (`jamb_score`) from the predictors.
X = data.drop(columns=['jamb_score'])
y = data['jamb_score']

# Two-stage split: 60% train, then the remaining 40% is halved into
# validation and test (20% / 20%). random_state=1 makes the split repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Encode the row dicts into a sparse feature matrix. The vectorizer is fitted
# on the training rows only; validation and test rows are merely transformed,
# so the feature vocabulary comes from the training split alone.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val = dv.transform(X_val.to_dict(orient='records'))
X_test = dv.transform(X_test.to_dict(orient='records'))

In [14]:
# Fit a regression tree limited to depth 1, i.e. a single split.
dt_model = DecisionTreeRegressor(random_state=1, max_depth=1)
dt_model.fit(X_train, y_train)

# Read the feature index stored for node 0 of the fitted tree and map it
# back to the column name produced by the vectorizer.
root_feature_idx = dt_model.tree_.feature[0]
split_feature = dv.feature_names_[root_feature_idx]
print("Признак, использованный для разбиения данных:", split_feature)

Признак, использованный для разбиения данных: study_hours_per_week


In [15]:
# Train a baseline random forest with 10 trees (n_jobs=-1: use all cores).
rf_model = RandomForestRegressor(random_state=1, n_jobs=-1, n_estimators=10)
rf_model.fit(X_train, y_train)

# Score the forest on the validation split: RMSE is the square root of MSE.
y_val_pred = rf_model.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse_val)
print("RMSE на валидационных данных:", rmse)

RMSE на валидационных данных: 43.157758977963624


In [None]:
# Sweep n_estimators over 10, 20, ..., 200 and record the validation RMSE
# of each forest, keyed by the number of trees.
rmse_scores = {}
n_estimators_grid = range(10, 201, 10)
for n in n_estimators_grid:
    rf_model = RandomForestRegressor(random_state=1, n_jobs=-1, n_estimators=n)
    rf_model.fit(X_train, y_train)
    y_val_pred = rf_model.predict(X_val)
    mse_val = mean_squared_error(y_val, y_val_pred)
    rmse_scores[n] = np.sqrt(mse_val)

# Print the whole curve so the point where RMSE stops improving can be read
# off by eye.
for n in rmse_scores:
    score = rmse_scores[n]
    print(f"n_estimators: {n}, RMSE: {score:.3f}")

In [None]:
# For each candidate max_depth, train forests with n_estimators = 10..200
# (step 10) and store the mean validation RMSE across that sweep.
depth_scores = {}
for max_depth in (10, 15, 20, 25):
    rmse_list = []
    for n in range(10, 201, 10):
        rf_model = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1,
        )
        rf_model.fit(X_train, y_train)
        y_val_pred = rf_model.predict(X_val)
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_list.append(val_rmse)
    depth_scores[max_depth] = np.mean(rmse_list)

# Pick the depth whose mean RMSE is lowest (first one wins on ties).
best_max_depth = min(depth_scores, key=lambda depth: depth_scores[depth])
print("Лучшее значение max_depth:", best_max_depth)

In [None]:
# Train a forest with fixed settings: 10 trees, max depth 20.
rf_model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Find the position of the largest importance value and translate it into
# the corresponding vectorizer feature name.
importances = rf_model.feature_importances_
top_feature_idx = importances.argmax()
important_feature = dv.feature_names_[top_feature_idx]
print("Самый важный признак:", important_feature)

In [8]:
# Wrap the train and validation matrices (with their labels) in DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets evaluated after every boosting round. Per the xgboost docs, the
# last entry ('eval') is the one early stopping monitors.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Booster settings shared by both runs; only `eta` changes between them.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Run 1: eta = 0.3, up to 100 rounds, stopping once the validation metric
# has not improved for 10 consecutive rounds.
model_eta_03 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)

# Run 2: same settings with eta lowered to 0.1 (the dict is updated in place).
xgb_params.update(eta=0.1)
model_eta_01 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)

# Compare the best validation RMSE reached by each run.
print("Лучшее значение RMSE при eta=0.3:", model_eta_03.best_score)
print("Лучшее значение RMSE при eta=0.1:", model_eta_01.best_score)

[0]	train-rmse:42.84835	eval-rmse:44.52338
[1]	train-rmse:39.96423	eval-rmse:42.83406
[2]	train-rmse:37.91231	eval-rmse:41.62607
[3]	train-rmse:36.51126	eval-rmse:41.25491
[4]	train-rmse:35.52212	eval-rmse:40.84075
[5]	train-rmse:34.77126	eval-rmse:40.71677
[6]	train-rmse:34.03898	eval-rmse:40.72669
[7]	train-rmse:33.62820	eval-rmse:40.68822
[8]	train-rmse:32.94729	eval-rmse:40.81273
[9]	train-rmse:32.27703	eval-rmse:40.84939
[10]	train-rmse:31.73818	eval-rmse:40.83759
[11]	train-rmse:31.31360	eval-rmse:40.80575
[12]	train-rmse:30.72949	eval-rmse:40.84238
[13]	train-rmse:30.11486	eval-rmse:40.96020
[14]	train-rmse:29.43538	eval-rmse:40.98775
[15]	train-rmse:29.23018	eval-rmse:41.04798
[16]	train-rmse:28.64113	eval-rmse:41.08375
[17]	train-rmse:28.42128	eval-rmse:41.15979
[0]	train-rmse:45.64414	eval-rmse:46.63724
[1]	train-rmse:44.26862	eval-rmse:45.58724
[2]	train-rmse:43.08569	eval-rmse:44.76209
[3]	train-rmse:42.05227	eval-rmse:44.02498
[4]	train-rmse:41.10533	eval-rmse:43.40640
[5]