## 1) Установка окружения и загрузка данных

Подключаем библиотеки и загружаем CSV.

In [2]:
# Импорты
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Load the dataset from the CSV file and show its dimensions plus the first rows.
csv_path = 'Campus_Selection.csv'
df = pd.read_csv(csv_path)
frame_shape = df.shape
print('Shape:', frame_shape)
df.head(8)


Shape: (215, 14)


Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed
5,6,M,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.0,Mkt&Fin,51.58,Not Placed
6,7,F,46.0,Others,49.2,Others,Commerce,79.0,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,Not Placed
7,8,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed


## 2) Первичный осмотр данных (EDA)

Проверим типы, пропуски и уникальные значения для категориальных признаков.

In [3]:
# First look at the raw frame: dtypes, missing values, and cardinality per column.
print('Info:')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output. Call it directly.
df.info()

# Number of missing values in every column.
print('\nПропуски по колонкам:')
display(df.isnull().sum())

# Number of distinct values in every column.
print('\nУникальные значения по колонкам:')
display(df.nunique())


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
dtypes: float64(5), int64(1), object(8)
memory usage: 23.6+ KB


None


Пропуски по колонкам:


Unnamed: 0,0
sl_no,0
gender,0
ssc_p,0
ssc_b,0
hsc_p,0
hsc_b,0
hsc_s,0
degree_p,0
degree_t,0
workex,0



Уникальные значения по колонкам:


Unnamed: 0,0
sl_no,215
gender,2
ssc_p,103
ssc_b,2
hsc_p,97
hsc_b,2
hsc_s,3
degree_p,89
degree_t,3
workex,2


## 3) План предобработки

1. Уберём `sl_no`.
2. Закодируем `status` в `status_binary` (Placed=1, Not Placed=0).
3. Определим числовые и категориальные признаки.
4. Построим `ColumnTransformer` для масштабирования числовых и OneHot кодирования категорий.

In [4]:
# Work on a copy so the raw frame `df` stays untouched; errors='ignore' makes
# the drop a no-op when the `sl_no` column is absent.
data = df.copy().drop(columns=['sl_no'], errors='ignore')

# Binary target: Placed -> 1, Not Placed -> 0.
status_codes = {'Placed': 1, 'Not Placed': 0}
data['status_binary'] = data['status'].map(status_codes)

# Feature groups used by the preprocessing pipeline.
numeric_features = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
categorical_features = [
    'gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation',
]

# Report which of the expected feature columns actually exist in the frame.
available_columns = set(data.columns)
numeric_present = [col for col in numeric_features if col in available_columns]
categorical_present = [col for col in categorical_features if col in available_columns]
print('Numeric features present:', numeric_present)
print('Categorical features present:', categorical_present)

# Preview the selected features next to both target encodings, then the
# per-column missing-value counts.
preview_columns = [*numeric_features, *categorical_features, 'status', 'status_binary']
display(data[preview_columns].head())
display(data.isnull().sum())


Numeric features present: ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p']
Categorical features present: ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']


Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation,status,status_binary
0,67.0,91.0,58.0,55.0,58.8,M,Others,Others,Commerce,Sci&Tech,No,Mkt&HR,Placed,1
1,79.33,78.33,77.48,86.5,66.28,M,Central,Others,Science,Sci&Tech,Yes,Mkt&Fin,Placed,1
2,65.0,68.0,64.0,75.0,57.8,M,Central,Central,Arts,Comm&Mgmt,No,Mkt&Fin,Placed,1
3,56.0,52.0,52.0,66.0,59.43,M,Central,Central,Science,Sci&Tech,No,Mkt&HR,Not Placed,0
4,85.8,73.6,73.3,96.8,55.5,M,Central,Central,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed,1


Unnamed: 0,0
gender,0
ssc_p,0
ssc_b,0
hsc_p,0
hsc_b,0
hsc_s,0
degree_p,0
degree_t,0
workex,0
etest_p,0


## 4) ColumnTransformer и Pipeline

Числовые признаки: `SimpleImputer(strategy='median')` + `StandardScaler()`.
Категориальные: `SimpleImputer(strategy='most_frequent')` + `OneHotEncoder(handle_unknown='ignore')`.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown='ignore' is passed to the encoder as in the cell text).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each feature group to its own branch.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

print('Preprocessor ready.')


Preprocessor ready.


## 5) Классификация: модели и K-Fold кросс-валидация

Сравним: Logistic Regression (L2, L1 и ElasticNet), kNN, SVM, DecisionTree и RandomForest. Оценки: Accuracy, Precision, Recall, F1. Используем StratifiedKFold(n_splits=5).

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# pandas is already imported as `pd` in the first cell; this re-import is
# redundant and is kept only to mirror the original cell.
import pandas as pd

# Target and feature matrix for classification. Selecting the columns by name
# fixes their order and raises KeyError if any expected feature is missing.
feature_columns = numeric_features + categorical_features
y = data['status_binary']
X = data.drop(columns=['status', 'status_binary'])[feature_columns]

# Candidate classifiers, in the order they are evaluated.
classifiers = {
    'LogReg_L2': LogisticRegression(
        penalty='l2', solver='liblinear', max_iter=1000, random_state=42
    ),
    'LogReg_L1': LogisticRegression(
        penalty='l1', solver='liblinear', max_iter=1000, random_state=42
    ),
    'LogReg_ElasticNet': LogisticRegression(
        penalty='elasticnet', solver='saga', l1_ratio=0.5,
        max_iter=1000, random_state=42
    ),
    'kNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'DecisionTree': DecisionTreeClassifier(max_depth=4, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Every classifier gets the same preprocessing step in front of it.
pipelines_clf = {
    label: Pipeline([('pre', preprocessor), ('clf', estimator)])
    for label, estimator in classifiers.items()
}

# 5-fold stratified, shuffled cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Collect per-model summaries: mean and std of accuracy, means of the rest.
results_clf = {}
for model_name, model in pipelines_clf.items():
    print('Running CV for', model_name)
    fold_scores = cross_validate(
        model, X, y, cv=cv, scoring=scoring, return_train_score=False
    )
    fold_accuracy = fold_scores['test_accuracy']
    results_clf[model_name] = {
        'accuracy_mean': fold_accuracy.mean(),
        'accuracy_std': fold_accuracy.std(),
        'precision_mean': fold_scores['test_precision'].mean(),
        'recall_mean': fold_scores['test_recall'].mean(),
        'f1_mean': fold_scores['test_f1'].mean(),
    }

# One row per model, best mean F1 first.
results_table = pd.DataFrame(results_clf).T
results_clf_df = results_table.sort_values(by='f1_mean', ascending=False)
results_clf_df


Running CV for LogReg_L2
Running CV for LogReg_L1
Running CV for LogReg_ElasticNet
Running CV for kNN
Running CV for SVM
Running CV for DecisionTree
Running CV for RandomForest


Unnamed: 0,accuracy_mean,accuracy_std,precision_mean,recall_mean,f1_mean
RandomForest,0.874419,0.047887,0.870733,0.95908,0.912532
SVM,0.869767,0.023716,0.877425,0.946207,0.908883
LogReg_L2,0.865116,0.040011,0.885187,0.925747,0.904547
LogReg_ElasticNet,0.855814,0.047433,0.879267,0.91908,0.898186
LogReg_L1,0.851163,0.043133,0.878158,0.912184,0.894132
kNN,0.832558,0.034179,0.837938,0.938851,0.885021
DecisionTree,0.786047,0.063091,0.841152,0.852414,0.843694


## 6) Регрессия (status как 0/1): Linear, Ridge, Lasso

Для демонстрации Lasso/Ridge в регрессии закодируем `status` как 0/1 и посчитаем MAE/RMSE/R².

In [7]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np

# Regression target: the 0/1 placement flag.
y_reg = data['status_binary']

# Regressors to compare; each is placed behind the shared preprocessor.
regressors = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=0.01, random_state=42, max_iter=5000),
}
pipelines_reg = {
    label: Pipeline([('pre', preprocessor), ('reg', estimator)])
    for label, estimator in regressors.items()
}

# Plain (non-stratified) 5-fold, shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scoring_reg = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'R2': 'r2',
}

results_reg = {}
for model_name, model in pipelines_reg.items():
    print('Running CV for (regression) ', model_name)
    fold_scores = cross_validate(
        model, X, y_reg, cv=kf, scoring=scoring_reg, return_train_score=False
    )
    # The MAE/MSE scorers are negated ("neg_*"), so flip the sign back.
    mean_mae = -fold_scores['test_MAE'].mean()
    mean_mse = -fold_scores['test_MSE'].mean()
    results_reg[model_name] = {
        'MAE': mean_mae,
        # RMSE is the square root of the fold-averaged MSE.
        'RMSE': np.sqrt(mean_mse),
        'R2': fold_scores['test_R2'].mean(),
    }

# One row per model, lowest RMSE first.
regression_table = pd.DataFrame(results_reg).T
results_reg_df = regression_table.sort_values(by='RMSE')
results_reg_df


Running CV for (regression)  Linear
Running CV for (regression)  Ridge
Running CV for (regression)  Lasso


Unnamed: 0,MAE,RMSE,R2
Ridge,0.270888,0.330081,0.464334
Lasso,0.272783,0.330269,0.467717
Linear,0.27109,0.3306,0.462368


## 7) Сравнение результатов и рекомендации

Сводные таблицы, выбор лучшей модели и рекомендации по следующему шагу (GridSearch, баланс классов, отбор признаков).

In [8]:
# Show both summary tables again, side by side in one cell.
print('--- Classification results (mean across folds) ---')
display(results_clf_df)

print('\n--- Regression results (binary target) ---')
display(results_reg_df)

# Best classifier: the row label with the highest mean F1.
f1_column = results_clf_df['f1_mean']
best_clf = f1_column.idxmax()
best_f1 = results_clf_df.loc[best_clf, 'f1_mean']
print(f"Лучший классификатор по F1: {best_clf} (F1={best_f1:.4f})")

# Best regressor: the row label with the lowest RMSE.
rmse_column = results_reg_df['RMSE']
best_reg = rmse_column.idxmin()
best_rmse = results_reg_df.loc[best_reg, 'RMSE']
print(f"Лучшая регрессия по RMSE: {best_reg} (RMSE={best_rmse:.4f})")


--- Classification results (mean across folds) ---


Unnamed: 0,accuracy_mean,accuracy_std,precision_mean,recall_mean,f1_mean
RandomForest,0.874419,0.047887,0.870733,0.95908,0.912532
SVM,0.869767,0.023716,0.877425,0.946207,0.908883
LogReg_L2,0.865116,0.040011,0.885187,0.925747,0.904547
LogReg_ElasticNet,0.855814,0.047433,0.879267,0.91908,0.898186
LogReg_L1,0.851163,0.043133,0.878158,0.912184,0.894132
kNN,0.832558,0.034179,0.837938,0.938851,0.885021
DecisionTree,0.786047,0.063091,0.841152,0.852414,0.843694



--- Regression results (binary target) ---


Unnamed: 0,MAE,RMSE,R2
Ridge,0.270888,0.330081,0.464334
Lasso,0.272783,0.330269,0.467717
Linear,0.27109,0.3306,0.462368


Лучший классификатор по F1: RandomForest (F1=0.9125)
Лучшая регрессия по RMSE: Ridge (RMSE=0.3301)


## Выводы по сравнению моделей

1. **RandomForest** показал наилучшие результаты среди всех алгоритмов:
   - Самая высокая средняя Accuracy (≈ **87.4%**); при этом разброс по фолдам (std ≈ 0.048) выше, чем у SVM (≈ 0.024).  
   - Лучшая сбалансированность между Precision (**87.1%**), Recall (**95.9%**) и F1 (**91.3%**).  
   - Высокий Recall особенно важен в задаче прогнозирования трудоустройства — модель почти не пропускает студентов, которые будут трудоустроены.  

2. **SVM** и **Logistic Regression (L2)** показали схожие результаты, немного уступая RandomForest:  
   - Accuracy около **86–87%**,  
   - F1 около **90–91%**.  
   Эти модели также достаточно надёжны; Logistic Regression к тому же легко интерпретируется (в отличие от SVM с RBF-ядром).  

3. **Logistic Regression (ElasticNet и L1)** показали чуть более слабые результаты:  
   - ElasticNet (F1 ≈ **89.8%**) и L1 (F1 ≈ **89.4%**) показали снижение качества по сравнению с L2.  
   - Вероятно, это связано с тем, что L1-составляющая регуляризации зануляет коэффициенты части признаков, что иногда ведёт к потере информации.  

4. **kNN** уступил и ансамблю (RandomForest), и SVM, и линейным моделям:  
   - Accuracy ≈ **83.3%**,  
   - F1 ≈ **88.5%**,  
   - Несмотря на высокий Recall, модель хуже сбалансирована по метрикам.  

5. **DecisionTree** показал наименее устойчивые результаты (Accuracy ≈ **78.6%**, F1 ≈ **84.4%**) и высокую дисперсию, что указывает на склонность к переобучению.  

---

### Общий итог
- **Лучшая модель для данной задачи — RandomForest**, так как она даёт наивысшие средние Accuracy и F1; при этом разброс её Accuracy по фолдам выше, чем у SVM.  
- **SVM и Logistic Regression (L2)** можно рассматривать как хорошие альтернативы; если важна интерпретируемость модели, предпочтительна Logistic Regression.  
- Для улучшения качества можно дополнительно провести **GridSearchCV** для подбора гиперпараметров и рассмотреть балансировку классов (если распределение целевой переменной неравномерное).  
