1. 机器学习面试题：
任务：
使用线性回归、多项式回归和决策树回归预测学生期末成绩，并比较模型性能。数据集采用UCI机器学习库中的"学生表现数据集"。

实现步骤：
1. 将数据加载到Pandas DataFrame
2. 分离特征(X)和目标变量(y)
3. 按80-20划分训练测试集(random_state=42)
4. 使用StandardScaler标准化特征
5. 训练以下模型：
   - 多项式回归(degree=2)
   - 决策树回归
   - 线性回归
6. 计算各模型的MSE和R2分数
7. 输出评估指标

In [167]:
import numpy as np
import pandas as pd

# Load the semicolon-delimited student performance data.
student_info = pd.read_csv('student-mat.csv', sep=';')

# Features: everything except the final grade G3 and the period grades G1/G2.
X = student_info.drop(['G3', 'G1', 'G2'], axis=1)
# Target: the final grade.
y = student_info['G3']

# Column groups used later to route columns to different encoders.
cat_cols = ["school", "Mjob", "Fjob", "guardian"]
binary_cols = ["famsize", "schoolsup", "higher"]
num_cols = list(X.select_dtypes(include=[np.number]).columns)
# NOTE(review): X also holds non-numeric columns such as sex, address,
# Pstatus, internet and romantic that appear in none of the three lists above
# -- confirm that leaving them out of the model is intended.
X

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,4
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,2
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,yes,no,no,5,5,4,4,5,4,11
391,MS,M,17,U,LE3,T,3,1,services,services,...,yes,yes,no,2,4,5,3,4,2,3
392,MS,M,21,R,GT3,T,1,1,other,other,...,yes,no,no,5,5,3,3,3,3,3
393,MS,M,18,R,LE3,T,3,2,services,other,...,yes,yes,no,4,4,1,3,4,5,0


In [168]:
binary_cols

['famsize', 'schoolsup', 'higher']

In [169]:
cat_cols

['school', 'Mjob', 'Fjob', 'guardian']

In [170]:
num_cols

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'Dalc',
 'Walc',
 'health',
 'absences']

In [171]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Route each column group to its own transformer:
#   num    -> standardised
#   cat    -> one-hot encoded
#   binary -> mapped to ordinal codes
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        # handle_unknown="ignore": a category that only occurs in the test
        # split is encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ('binary', OrdinalEncoder(), binary_cols)
    ])

In [172]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Baseline model: the preprocessor followed by ordinary least squares.
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear_reg', LinearRegression()),
])

In [173]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor on the preprocessed features; the seed is fixed so
# repeated fits build the same tree.
tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tree_reg', DecisionTreeRegressor(random_state=42)),
])

In [174]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial regression. The expansion is applied to the complete
# preprocessor output (scaled numeric columns and the encoded columns alike),
# then a plain linear regression is fitted on the expanded features.
poly_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('regressor', LinearRegression()),
])

In [175]:
# Fit all three pipelines on the training split.
linear_pipeline.fit(X_train, y_train)
poly_pipeline.fit(X_train, y_train)
tree_pipeline.fit(X_train, y_train)

In [176]:
# NOTE(review): this repeats the three fits of the preceding cell on the same
# training split; it looks redundant -- confirm and drop one of the two cells.
linear_pipeline.fit(X_train, y_train)
poly_pipeline.fit(X_train, y_train)
tree_pipeline.fit(X_train, y_train)

In [177]:
# Predict the held-out test split with each fitted pipeline, in the order
# linear, polynomial, tree.
linear_pred, poly_pred, tree_pred = (
    fitted.predict(X_test)
    for fitted in (linear_pipeline, poly_pipeline, tree_pipeline)
)

In [178]:
from sklearn.metrics import mean_squared_error, r2_score

# Test-set mean squared error and R^2 for each model.
linear_mse, linear_r2 = (
    mean_squared_error(y_test, linear_pred),
    r2_score(y_test, linear_pred),
)
poly_mse, poly_r2 = (
    mean_squared_error(y_test, poly_pred),
    r2_score(y_test, poly_pred),
)
tree_mse, tree_r2 = (
    mean_squared_error(y_test, tree_pred),
    r2_score(y_test, tree_pred),
)

In [179]:
linear_mse, linear_r2

(16.462142764908982, 0.19716630985061623)

In [180]:
poly_mse, poly_r2

(477.28479059096753, -22.276454053060263)

In [181]:
tree_mse, tree_r2

(22.481012658227847, -0.09636482980652006)

In [182]:
# Print a side-by-side table of the first ten actual grades and the three
# models' predictions for them.
print("\n前10个预测结果对比:")
print("="*60)
print(f"{'实际值':<8} {'线性回归':<10} {'多项式回归':<12} {'决策树回归':<12}")
print("-"*60)
# Slicing plus zip stops at the shortest input, so a test split with fewer
# than ten rows prints what is available instead of raising IndexError.
for actual, linear_val, poly_val, tree_val in zip(
        y_test.iloc[:10], linear_pred[:10], poly_pred[:10], tree_pred[:10]):
    print(f"{actual:<8} {linear_val:<10.2f} {poly_val:<12.2f} {tree_val:<12.2f}")


前10个预测结果对比:
实际值      线性回归       多项式回归        决策树回归       
------------------------------------------------------------
10       2.37       -50.61       9.00        
12       8.05       -24.40       6.00        
5        8.93       11.21        8.00        
10       11.11      8.03         7.00        
9        9.71       56.66        8.00        
13       9.48       -4.59        15.00       
18       12.78      25.16        13.00       
6        11.25      21.69        7.00        
0        11.43      -18.09       10.00       
14       12.17      8.61         11.00       


2. 处理泰坦尼克号数据集，在data/目录里

像之前课程里读取加州房价数据时所做的那样，解压缩此压缩包。

这将提供两个CSV文件，train.csv和test.csv，可以使用pandas.read_csv()加载它。

最终目标：训练一个可以根据其他列预测Survived（是否存活）列的分类器

In [183]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request


def load_housing_data():
    """Extract the Titanic tarball into ``homework/HW3/data``.

    The function name is kept as-is so existing callers keep working, even
    though the archive it handles is ``titanic.tgz``. If the tarball is not
    present it is downloaded first; the archive is then unpacked next to it.

    Returns:
        None. The effect is the extracted files on disk.
    """
    data_dir = Path("homework/HW3/data")
    tarball_path = data_dir / "titanic.tgz"
    if not tarball_path.is_file():
        data_dir.mkdir(parents=True, exist_ok=True)
        # urlretrieve needs a real URL; a bare relative file path is rejected
        # as an unknown URL type.
        # NOTE(review): this is the upstream location of the course dataset
        # as far as known -- confirm it is the intended source.
        url = "https://github.com/ageron/data/raw/main/titanic.tgz"
        urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as titanic_tarball:
        # Extract with the same forward-slash path used everywhere else;
        # a backslash-separated string is a single odd directory name on
        # non-Windows systems and relies on invalid escape sequences.
        titanic_tarball.extractall(path=data_dir)

load_housing_data()

In [184]:
# Load the extracted Titanic training and test tables.
train_data, test_data = (
    pd.read_csv(path)
    for path in (
        'homework/HW3/data/titanic/train.csv',
        'homework/HW3/data/titanic/test.csv',
    )
)

In [185]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen 'Carrie'",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [186]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [187]:
# Target: the Survived column.
y = train_data['Survived']
# Features: drop the target plus the identifier / free-text columns.
X = train_data.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])

In [188]:
from sklearn.impute import SimpleImputer

# Standalone median imputer; the pipelines defined below create their own
# imputers and do not reference this object.
imputer = SimpleImputer(strategy="median")

# Column groups for the Titanic features.
cat_cols = ["Embarked"]
binary_cols = ["Sex"]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Categorical: fill gaps with the most frequent value, then one-hot encode.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category absent from the training data is
    # encoded as all zeros at predict time instead of raising.
    ('onehot', OneHotEncoder(handle_unknown="ignore"))
])
# Binary: fill gaps with the most frequent value, then map to ordinal codes.
binary_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('ordinal', OrdinalEncoder())
])
# Numeric: fill gaps with the median, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [189]:
# Send each column group through its dedicated sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
    ('binary', binary_pipe, binary_cols),
])

In [190]:
from sklearn.ensemble import RandomForestClassifier

# Preprocessing followed by a 100-tree random forest with a fixed seed.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rf_model', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [191]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [192]:
# Fit on all labelled training rows; no validation split is held out here.
rf_pipeline.fit(X, y)

In [193]:
# Test features: drop the same identifier / free-text columns as for training.
X_test = test_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [194]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0000,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,,0,0,8.0500,S
414,1,female,39.0,0,0,108.9000,C
415,3,male,38.5,0,0,7.2500,S
416,3,male,,0,0,8.0500,S


In [195]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [196]:
# Predict the Survived label for every row of the test features.
pred = rf_pipeline.predict(X_test)

In [197]:
# Pair each test passenger id with its predicted Survived label.
result = pd.DataFrame(
    dict(PassengerId=test_data['PassengerId'], Survived=pred)
)

In [198]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


1. 精确率是什么，评估分类的性能为什么不能只用精确率
2. 简述下混淆矩阵是什么
3. 简述下各个性能指标的意思： 准确率，召回率，F1分数，假阳性，PR曲线，ROC曲线，AUC分数
4. 简述下精确率-召回率权衡
5. 如何用二元分类器 去解决多元分类的问题
6. 什么是 多标签-多分类问题？ 

1.精确率 = TP / (TP+FP)

  精确率 = 预测为正类的样本中，实际为正类的样本所占比例。
  
  精确率只关注预测为正的样本，忽略了预测为负的样本
  
  可能导致片面的评估结果，例如一个保守的分类器可能有很高的精确率，但漏掉了大量正类样本

2.混淆矩阵中的每一行代表一个实际类，而每一列代表一个预测类
  是评估分类模型性能的表格形式

3.准确率: 分类正确的样本占总样本的比例

  召回率: 实际为正类的样本中，被正确预测为正类的样本所占比例，即 TP / (TP+FN)
  
  F1分数: 精确率和召回率的调和平均数，平衡两者的关系
  
  假阳性: 实际为负类却被错误预测为正类的样本(FP)；假阳性率 FPR = FP / (FP+TN)，即实际为负类的样本中被误判为正类的比例
  
  PR曲线: 以召回率为横轴，精确率为纵轴绘制的曲线，以评估分类器的性能
  
  ROC曲线: 以假阳性率(False Positive Rate，FPR)为横轴、真阳性率（召回率的另一个名称）为纵轴绘制的曲线
  
  AUC分数: ROC曲线的曲线下面积，表示分类器的性能

4.高精确率意味着分类器在预测为正类的样本中很少犯错，但可能会漏掉一些真正的正类样本(低召回率)

  高召回率意味着分类器能找出大部分正类样本，但可能会将一些负类样本错误分类为正类(低精确率)
  
  提高分类阈值 → 减少FP → 提高精确率，但可能增加FN → 降低召回率
  
  降低分类阈值 → 减少FN → 提高召回率，但可能增加FP → 降低精确率

5.一对一(OvO): 为每两个类别训练一个二元分类器，最终通过投票决定分类结果

  一对其余(OvR): 为每个类别训练一个二元分类器，将该类别作为正类，其余所有类别作为负类，预测时选择决策分数最高的类别。

6.多标签分类: 每个样本可以同时属于多个类别(标签)，即分类器为每个样本输出多个标签；多输出-多类分类是它的推广: 每个标签本身可以取两个以上的类别值

In [5]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

# 1. Load the data (this dataset is semicolon-separated).
csv_path = "student-mat.csv"
df = pd.read_csv(csv_path, sep=";")

# 2. Separate the target (y) from the features (X).
y = df["G3"]
X = df.drop(columns=["G1", "G2", "G3"])

# 3. 80/20 train/test split (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 4. Preprocessing. Numeric columns get the degree-2 polynomial expansion and
#    are then standardised; all non-numeric columns are one-hot encoded.
preprocess_poly2 = ColumnTransformer(transformers=[
    (
        "num",
        Pipeline(steps=[
            ("poly", PolynomialFeatures(degree=2, include_bias=False)),
            ("scaler", StandardScaler()),
        ]),
        make_column_selector(dtype_include=np.number),
    ),
    (
        "cat",
        OneHotEncoder(handle_unknown="ignore"),
        make_column_selector(dtype_exclude=np.number),
    ),
])

# 5. Train: preprocessing followed by ordinary least squares.
poly2_pipeline = make_pipeline(preprocess_poly2, LinearRegression())
poly2_pipeline.fit(X_train, y_train)

In [6]:
# Predictions on both splits from the fitted degree-2 pipeline.
y_train_pred, y_test_pred = (
    poly2_pipeline.predict(split) for split in (X_train, X_test)
)

# NOTE(review): only the training R^2 is shown; y_test_pred is computed but
# not scored in this cell -- confirm whether the test R^2 was intended.
r2_score(y_train, y_train_pred)

0.5163689462955683

In [7]:
from sklearn.linear_model import RidgeCV

# Same degree-2 preprocessing, with ridge regression whose alpha is chosen by
# 5-fold cross-validation over the listed candidates.
ridge_pipeline = Pipeline(steps=[
    ('preprocess', preprocess_poly2),
    ('ridge', RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0, 1000.0], cv=5)),
])
ridge_pipeline.fit(X_train, y_train)

y_train_pred, y_test_pred = (
    ridge_pipeline.predict(split) for split in (X_train, X_test)
)
# NOTE(review): only the training R^2 is shown; y_test_pred is computed but
# not scored in this cell -- confirm whether the test R^2 was intended.
r2_score(y_train, y_train_pred)

0.20128242654058637

In [8]:
from sklearn.linear_model import LassoCV

# Same degree-2 preprocessing, with lasso regression whose alpha is chosen by
# 5-fold cross-validation over the listed candidates (at most 2000 iterations).
lasso_pipeline = Pipeline(steps=[
    ('preprocess', preprocess_poly2),
    ('lasso', LassoCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0], cv=5, max_iter=2000)),
])
lasso_pipeline.fit(X_train, y_train)

y_train_pred, y_test_pred = (
    lasso_pipeline.predict(split) for split in (X_train, X_test)
)
# NOTE(review): only the training R^2 is shown; y_test_pred is computed but
# not scored in this cell -- confirm whether the test R^2 was intended.
r2_score(y_train, y_train_pred)

0.2900800552440117