## 0.环境

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest

df = pd.read_csv('train-NBA.csv')

### 0.1 显示文件信息

In [4]:
df.head()

Unnamed: 0,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,...,fta,ft,oreb,dreb,reb,ast,stl,blk,tov,target_5yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


In [5]:
df.columns

Index(['name', 'gp', 'min', 'pts', 'fgm', 'fga', 'fg', '3p_made', '3pa', '3p',
       'ftm', 'fta', 'ft', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'tov',
       'target_5yrs'],
      dtype='object')

特征名含义：
- 'name': 球员姓名
- 'gp': 出场比赛次数
- 'min': 平均每场比赛上场时间（分钟）
- 'pts': 平均每场比赛得分
- 'fgm': 平均每场比赛命中次数
- 'fga': 平均每场比赛出手次数
- 'fg': 场均命中率
- '3p_made': 平均每场比赛三分球命中次数
- '3pa': 平均每场比赛三分球出手次数
- '3p': 三分球命中率
- 'ftm': 平均每场比赛罚球命中次数
- 'fta': 平均每场比赛罚球出手次数
- 'ft': 罚球命中率
- 'oreb': 平均每场比赛进攻篮板球数
- 'dreb': 平均每场比赛防守篮板球数
- 'reb': 平均每场比赛篮板球数
- 'ast': 平均每场比赛助攻数
- 'stl': 平均每场比赛抢断数
- 'blk': 平均每场比赛盖帽数
- 'tov': 平均每场比赛失误数
- 'target_5yrs': 五年后是否仍在NBA联赛中，目标标签

## 1.数据预处理

### 1.1 数据清洗

#### 1.1.1 查看文件信息

In [6]:
df['gp'] = df['gp'].apply(np.floor)

In [7]:
df.isnull().sum()

name            0
gp              0
min             0
pts             0
fgm             0
fga             0
fg              0
3p_made         0
3pa             0
3p             18
ftm             0
fta             0
ft              0
oreb            0
dreb            0
reb             0
ast             0
stl             0
blk             0
tov             0
target_5yrs     0
dtype: int64

由上段代码可以知道，三分球命中率的缺失值有22项，其他没有缺失值

#### 1.1.2 处理缺失值

In [8]:
threePNull = df[df['3p'].isnull()==True]
threePNull

Unnamed: 0,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,...,fta,ft,oreb,dreb,reb,ast,stl,blk,tov,target_5yrs
68,Jeff Wilkins,56.0,18.9,4.7,2.1,4.6,45.0,0.0,0.0,,...,0.7,67.5,1.1,3.8,4.9,0.7,0.6,0.8,1.1,1
239,Ken Johnson,64.0,12.7,4.1,1.8,3.3,52.8,0.0,0.0,,...,1.3,43.5,1.4,2.4,3.8,0.3,0.2,0.3,0.9,0
240,Ken Johnson,64.0,12.7,4.1,1.8,3.3,52.8,0.0,0.0,,...,1.3,43.5,1.4,2.4,3.8,0.3,0.2,0.3,0.9,0
241,Pete Williams,53.0,10.8,2.8,1.3,2.1,60.4,0.0,0.0,,...,0.8,42.5,0.9,1.9,2.8,0.3,0.4,0.4,0.4,0
259,Melvin Turpin,79.0,24.7,10.6,4.6,9.0,51.1,0.0,0.0,,...,1.8,78.4,2.0,3.8,5.7,0.5,0.5,1.1,1.5,1
287,Jim Petersen,60.0,11.9,3.2,1.2,2.4,48.6,0.0,0.0,,...,1.1,75.8,0.7,1.7,2.5,0.5,0.2,0.5,1.2,1
298,Tom Scheffler,39.0,6.9,1.3,0.5,1.3,41.2,0.0,0.0,,...,0.5,50.0,0.5,1.5,1.9,0.3,0.2,0.3,0.4,0
408,Sam Williams,59.0,18.2,6.1,2.6,4.7,55.6,0.0,0.0,,...,1.5,55.1,1.5,3.7,5.2,0.6,0.8,1.3,1.1,0
410,Kurt Nimphius,63.0,17.2,5.3,2.2,4.7,46.1,0.0,0.0,,...,1.7,58.3,1.5,3.2,4.7,1.0,0.3,1.3,0.9,1
411,Pete Verhoeven,71.0,17.0,4.9,2.1,4.2,50.3,0.0,0.0,,...,1.0,70.8,1.5,2.1,3.6,0.7,0.6,0.3,0.8,1


因为3p = 3p_made / 3pa ，所以3p的空缺大多是因为3p_made,3pa数值为零导致的,使用dropna（）删除

In [9]:
df = df.dropna()

#### 1.1.3 数据去重

In [10]:
df[df.duplicated(subset=['name']).values == True]

Unnamed: 0,name,gp,min,pts,fgm,fga,fg,3p_made,3pa,3p,...,fta,ft,oreb,dreb,reb,ast,stl,blk,tov,target_5yrs
19,Larry Johnson,82.0,37.2,19.2,7.5,15.3,49.0,0.1,0.3,22.7,...,5.0,82.9,3.9,7.0,11.0,3.6,1.0,0.6,1.9,1
73,Dee Brown,82.0,23.7,8.7,3.5,7.5,46.4,0.1,0.4,20.6,...,1.9,87.3,0.5,1.7,2.2,4.2,1.0,0.2,1.7,1
74,Dee Brown,49.0,9.2,1.9,0.7,2.0,32.7,0.1,0.6,21.4,...,0.8,64.9,0.2,0.7,0.8,1.7,0.5,0.1,0.6,0
75,Dee Brown,49.0,9.2,1.9,0.7,2.0,32.7,0.1,0.6,21.4,...,0.8,64.9,0.2,0.7,0.8,1.7,0.5,0.1,0.6,1
110,Mark Bryant,36.0,7.0,1.6,0.7,1.6,43.1,0.0,0.0,0.0,...,0.4,69.2,0.7,1.2,1.9,0.2,0.1,0.1,0.3,1
125,Mark Davis,33.0,7.8,3.8,1.5,3.1,48.0,0.0,0.3,10.0,...,1.0,82.4,0.5,0.6,1.1,0.4,0.4,0.1,0.4,0
126,Mark Davis,57.0,10.0,3.3,1.0,2.6,36.9,0.1,0.2,30.8,...,2.0,63.8,1.0,1.2,2.2,0.8,0.7,0.4,1.2,1
127,Mark Davis,57.0,10.0,3.3,1.0,2.6,36.9,0.1,0.2,30.8,...,2.0,63.8,1.0,1.2,2.2,0.8,0.7,0.4,1.2,0
144,Reggie Williams,35.0,24.5,10.4,4.3,12.2,35.6,0.4,1.7,22.4,...,1.9,72.7,1.6,1.8,3.4,1.7,0.8,0.6,1.8,1
213,Bob Martin,53.0,10.1,2.1,0.8,1.7,45.5,0.0,0.0,0.0,...,1.0,60.8,0.7,1.5,2.2,0.3,0.1,0.6,0.6,0


保留第一次出现的项

In [11]:
df = df.drop_duplicates(subset=['name'], keep='first', inplace=False)

#### 1.1.4 特征工程

根据资料可知，上述基础数据可以通过公式得到高阶数据：
- 真实投篮命中率=全场得分/[2×（全场出手次数+0.44×罚球出手次数）]
- 投篮效率 = (投篮命中数+0.5×三分命中数)÷投篮出手数
- 篮板率 = 球员篮板数×(球队所有球员上场时间÷5)÷球员上场时间÷(球队总篮板+对手总篮板)
- 助攻率 = 球员助攻数÷(球员上场时间÷(球队所有球员上场时间÷5)×球队总进球数-球员进球数)
- 抢断率 = 球员抢断数×(球队所有球员上场时间÷5)÷球员上场时间÷对手进攻次数
- 盖帽率 = 球员盖帽数×(球队所有球员上场时间÷5)÷球员上场时间÷对手两分球出手次数
- 失误率 = 球员失误数÷(球员两分球出手次数+0.44×球员罚球次数+球员失误数)
- 使用率 = (球员出手次数+0.44×球员罚球次数+球员失误次数)×(球队所有球员上场时间÷5)÷球员上场时间÷(球队所有总球员出手次数+0.44×球队所有球员罚球次数+球队所有球员失误次数)
- PER = [(得分数+助攻数+总篮板数+抢断数+盖帽数)-(投篮出手数-投篮命中数)-(罚球出手数-罚球命中数)-失误数]/球员的比赛场次

In [12]:
# # 真实投篮命中率
# df['真实投篮命中率'] = df['pts'] / (2 * (df['fga'] + 0.44 * df['fta']))

# # 投篮效率
# df['投篮效率'] = (df['fgm'] + 0.5 * df['3p_made']) / df['fga']

# # 篮板率 x 数据不够

# # 助攻率 x 数据不够

# # 失误率
# df['失误率'] = df['tov'] / (df['fga'] + 0.44 * df['fta'] + df['tov']) 

# # 使用率
# # nba['使用率'] = (df['场均投射次数'] + 0.44 * df['平均每场比赛罚球出手次数'] + df['平均每场比赛失误数'] ) 

# # 效率值
# df['效率值'] = ((df['pts'] + df['ast'] + df['reb'] + df['stl'] + df['blk'])
#              - (df['fga'] - df['fgm']) - 
#              (df['fta'] - df['ftm']) - df['tov']) / df['gp']

### 1.2特征选择

#### 1.2.1相关性分析

特征的相关性

In [13]:
cols = df.columns.drop(['name'])
for col in cols:
    f = df['target_5yrs'].corr(df[col])
    print(f'{col} has {f:.3f} correlation with target')

gp has 0.415 correlation with target
min has 0.322 correlation with target
pts has 0.316 correlation with target
fgm has 0.319 correlation with target
fga has 0.289 correlation with target
fg has 0.250 correlation with target
3p_made has 0.013 correlation with target
3pa has -0.005 correlation with target
3p has -0.020 correlation with target
ftm has 0.304 correlation with target
fta has 0.308 correlation with target
ft has 0.088 correlation with target
oreb has 0.313 correlation with target
dreb has 0.303 correlation with target
reb has 0.318 correlation with target
ast has 0.166 correlation with target
stl has 0.227 correlation with target
blk has 0.218 correlation with target
tov has 0.274 correlation with target
target_5yrs has 1.000 correlation with target


#### 1.2.2创建新数据集

In [14]:
selector = SelectKBest(k=16)
df_new = selector.fit_transform(df.drop(['name', 'target_5yrs'], axis=1), df['target_5yrs'])
selected_features_indices = selector.get_support(indices=True)
df.drop(['name', 'target_5yrs'], axis=1).columns[selected_features_indices]

Index(['gp', 'min', 'pts', 'fgm', 'fga', 'fg', 'ftm', 'fta', 'ft', 'oreb',
       'dreb', 'reb', 'ast', 'stl', 'blk', 'tov'],
      dtype='object')

### 1.3特征缩放

#### 1.3.1标准化

In [15]:
scaler = StandardScaler()
df_new = scaler.fit_transform(df_new)

### 1.4拆分数据集

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df_new, df['target_5yrs'], test_size=0.3)

## 2.模型训练

### 2.1环境导入

In [17]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
et = ExtraTreesClassifier()

from sklearn.svm import SVC
svc = SVC()

from sklearn.naive_bayes import GaussianNB
bayes = GaussianNB()

# from catboost import CatBoostClassifier
# catboost = CatBoostClassifier()

In [18]:
lr.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
et.fit(X_train, y_train)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)
bayes.fit(X_train, y_train)
# catboost.fit(X_train, y_train)

In [19]:
lrpred = lr.predict(X_test)
dtpred = dt.predict(X_test)
rfpred = rf.predict(X_test)
gbpred = gb.predict(X_test)
etpred = et.predict(X_test)
knnpred = knn.predict(X_test)
svcpred = svc.predict(X_test)
bayespred = bayes.predict(X_test)
# catboostpred = catboost.predict(X_test)

### 2.2模型评价

In [20]:
from sklearn.metrics import classification_report

print(f"Logistic: {classification_report(y_test, lrpred)}\n----------------------------------------------------")
print(f"\nDecision Tree: {classification_report(y_test, dtpred)}\n----------------------------------------------------")
print(f"\nForest: {classification_report(y_test, rfpred)}\n----------------------------------------------------")
print(f"\nGradient Boost: {classification_report(y_test, gbpred)}\n----------------------------------------------------")
print(f"\nExtra Trees: {classification_report(y_test, etpred)}\n----------------------------------------------------")
print(f"\nsvc: {classification_report(y_test, etpred)}\n----------------------------------------------------")
print(f"\nbayes: {classification_report(y_test, bayespred)}\n----------------------------------------------------")
# print(f"\ncatboost: {classification_report(y_test, catboostpred)}\n----------------------------------------------------")

Logistic:               precision    recall  f1-score   support

           0       0.69      0.57      0.62       123
           1       0.76      0.84      0.80       203

    accuracy                           0.74       326
   macro avg       0.72      0.71      0.71       326
weighted avg       0.73      0.74      0.73       326

----------------------------------------------------

Decision Tree:               precision    recall  f1-score   support

           0       0.59      0.56      0.57       123
           1       0.74      0.76      0.75       203

    accuracy                           0.69       326
   macro avg       0.67      0.66      0.66       326
weighted avg       0.68      0.69      0.69       326

----------------------------------------------------

Forest:               precision    recall  f1-score   support

           0       0.66      0.58      0.61       123
           1       0.76      0.82      0.79       203

    accuracy                           0.

### 2.3模型训练

In [21]:
X_train, y_train = df_new, df['target_5yrs'].values


lr = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
et = ExtraTreesClassifier()
svm = SVC()
# cb = CatBoostClassifier()

lr.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
et.fit(X_train, y_train)
knn.fit(X_train, y_train)
svm.fit(X_train, y_train)
# cb.fit(X_train, y_train)

## 3.预测

### 3.1测试集处理

In [22]:
df_test = pd.read_csv('test-NBA.csv')

df_test['gp'] = df_test['gp'].apply(np.floor)
df_test = df_test.dropna()
df_test = df_test.drop_duplicates(subset=['name'], keep='first', inplace=False)
cols = df_test.columns.drop(['name'])

df_new = df_test[df.drop(['name'], axis=1).columns[selected_features_indices]].values
df_new = scaler.transform(df_new)

### 3.2选择预测模型

In [23]:
my_result = pd.DataFrame(et.predict(df_new))

### 3.3生成预测结果

In [24]:
my_result.columns = ["target_5yrs"]
my_result = pd.concat((df_test['name'], my_result), axis=1)
my_result.to_csv("et.csv", index=False)

my_result.info

<bound method DataFrame.info of                       name  target_5yrs
0              Darnell Mee            0
1               Josh Grant            0
2            Ervin Johnson            1
3              Brian Davis            0
4               Eric Riley            1
..                     ...          ...
318           James Harden            1
319      Terrence Williams            1
320  Taurean Waller-Prince            0
321          DeMar DeRozan            1
322        Wayne Ellington            1

[323 rows x 2 columns]>