In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,SGDRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import Ridge, ElasticNet, Lasso

In [40]:
# 获取数据
data = pd.read_csv("./data/train_V2.csv")

In [41]:
# 数据基本处理
data.shape

(4446966, 29)

In [42]:
# 处理缺失值
np.any(pd.isnull(data))

True

In [43]:
data.dropna(inplace=True)

In [44]:
# 判断缺失值是否处理完
np.any((np.any(pd.isna(data))) or (np.any(pd.isnull(data))))


False

In [45]:
# 缩小数据范围
data = data[:50000]

In [33]:
data.shape

(50000, 29)

In [36]:
data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [37]:
# 确定特征值和目标值
y = data['winPlacePerc']
x_data = ["assists","boosts",
          "damageDealt","DBNOs",
          "headshotKills","heals",
          "killPlace","killPoints",
          "kills","killStreaks","longestKill",
          "matchDuration","maxPlace","numGroups",
          "rankPoints","teamKills"]
x_dict = {}
for i in range(len(x_data)):
    if i > len(x_data) - 2:
        break
    if i < len(x_data)-1:
        for j in (i+1,len(x_data)-1):
            x_list = []
#             print(len(x_data))
#             print(j)
            x_list.append(x_data[i])
            x_list.append(x_data[j])
            
            x = data[x_list]
            # 数据切割
            x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=22)
            # 特征工程
            # 数据标准化
            transfer = StandardScaler()
            x_train = transfer.fit_transform(x_train)
            x_test = transfer.transform(x_test)

            # 建立模型
            estimator = Ridge(alpha=0.1)
            # estimator.fit(x_train,y_train.astype('int'))
            estimator.fit(x_train,y_train)

            # 5.模型评估
            # 5.1 获取系数等值
            y_predict = estimator.predict(x_test)
            print("预测值为:\n", y_predict)
            print("模型中的系数为:\n", estimator.coef_)
            print("模型中的偏置为:\n", estimator.intercept_)

            # 5.2 评价
            # 平均绝对误差（MAE）
            error = mean_absolute_error(y_test, y_predict)
            print("--{}--".format(i))
            print("平均绝对误差:\n", error)
            print()
            x_dict[tuple(x_list)] = error
    

预测值为:
 [0.34059117 0.55322188 0.34059117 ... 0.72234577 0.34059117 0.50971506]
模型中的系数为:
 [0.03644161 0.18350896]
模型中的偏置为:
 0.4745538
--0--
平均绝对误差:
 0.1950344357599949

预测值为:
 [0.43693344 0.43693344 0.43693344 ... 0.5957226  0.43693344 0.5957226 ]
模型中的系数为:
 [0.09212972 0.00477384]
模型中的偏置为:
 0.4745538
--0--
平均绝对误差:
 0.2532945450846023

预测值为:
 [0.36450775 0.57130974 0.32766872 ... 0.68765306 0.35547176 0.48438205]
模型中的系数为:
 [0.16972031 0.0474155 ]
模型中的偏置为:
 0.4745538
--1--
平均绝对误差:
 0.19450095739585746

预测值为:
 [0.34758852 0.57299458 0.34758852 ... 0.68569762 0.34758852 0.46029155]
模型中的系数为:
 [0.19453461 0.00303892]
模型中的偏置为:
 0.4745538
--1--
平均绝对误差:
 0.19717203174181586

预测值为:
 [0.42997778 0.53383379 0.36916078 ... 0.59697468 0.46660043 0.50549351]
模型中的系数为:
 [ 0.16617427 -0.03968121]
模型中的偏置为:
 0.4745538
--2--
平均绝对误差:
 0.23576914534207352

预测值为:
 [0.47491661 0.50409785 0.36898469 ... 0.55590455 0.44893331 0.53687678]
模型中的系数为:
 [0.13634494 0.00426801]
模型中的偏置为:
 0.4745538
--2--
平均绝对误差:
 0.23681

In [28]:
print(len(x_dict))
x_dict_sort = dict(sorted(x_dict.items(),key=lambda item:item[1]))
# print(x_dict_sort)
for k,v in x_dict_sort.items():
    print(k,"-----",v)

27
('heals', 'killPlace') ----- 0.16196351448395574
('killPlace', 'killPoints') ----- 0.16530046844263951
('killPlace', 'rankPoints') ----- 0.16530358280272303
('boosts', 'damageDealt') ----- 0.19512285919906966
('assists', 'boosts') ----- 0.1958174338418712
('boosts', 'rankPoints') ----- 0.19772914618003043
('headshotKills', 'heals') ----- 0.23073724223742087
('killStreaks', 'longestKill') ----- 0.23378711406923755
('damageDealt', 'DBNOs') ----- 0.23511618376394963
('damageDealt', 'rankPoints') ----- 0.2359738283571998
('heals', 'rankPoints') ----- 0.2371909704133221
('kills', 'killStreaks') ----- 0.23816253716153848
('kills', 'rankPoints') ----- 0.23901266938521476
('killPoints', 'kills') ----- 0.23902734874563544
('longestKill', 'matchDuration') ----- 0.24022445116653357
('longestKill', 'rankPoints') ----- 0.24040110494172057
('killStreaks', 'rankPoints') ----- 0.2448222473379177
('DBNOs', 'headshotKills') ----- 0.25085789852343654
('assists', 'rankPoints') ----- 0.2530446646193457


In [47]:
# 确定特征值和目标值
y = data['winPlacePerc']
x = data[["assists","boosts",
          "damageDealt","DBNOs",
          "headshotKills","heals",
          "killPlace","killPoints",
          "kills","killStreaks","longestKill",
          "matchDuration","maxPlace","numGroups",
          "rankPoints"]]

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=22)
# 特征工程
# 数据标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

# 建立模型
estimator = SGDRegressor(max_iter=1000)
# estimator.fit(x_train,y_train.astype('int'))
estimator.fit(x_train,y_train)

# 5.模型评估
# 5.1 获取系数等值
y_predict = estimator.predict(x_test)
print("预测值为:\n", y_predict)
print("模型中的系数为:\n", estimator.coef_)
print("模型中的偏置为:\n", estimator.intercept_)

# 5.2 评价
# 平均绝对误差（MAE）
error = mean_absolute_error(y_test, y_predict)
print("平均绝对误差:\n", error)



预测值为:
 [0.38363469 0.65259892 0.14725143 ... 0.74306106 0.3347454  0.29891614]
模型中的系数为:
 [ 0.02259406  0.0853314   0.01094456 -0.00575535 -0.00050466  0.01293691
 -0.30251378  0.00122647 -0.03744688 -0.15445413  0.0160755  -0.01579416
 -0.29999905  0.314187    0.00053449]
模型中的偏置为:
 [0.47472513]
平均绝对误差:
 0.11960445582469485
