In [None]:
# 逻辑回归 位于main.ipynb文件 第10个代码块中

# 导入本程序所需要的库

In [None]:
import csv
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import seaborn as sns
import random
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.datasets import make_classification

# 初始化列表数据等，以备使用

In [None]:
# Starting Elo score given to any team that has no rating yet.
base_elo_ratings = 1600

# Elo ratings keyed by team; filled in and updated as games are processed.
team_elo_ratings = dict()
# Recent team performance; the main cell later rebinds this to the merged stats table.
team_stats = dict()

# Training samples (feature rows) and their labels.
x, y = [], []

# 定义函数：读取csv文件中的球队数据，将其导入至变量中

In [None]:
def get_raw_data(Mstat, Ostat, Tstat):
    """Combine the three per-team statistics tables into one frame indexed by team.

    Parameters
    ----------
    Mstat : pandas.DataFrame
        Miscellaneous stats; its 'Rk' and 'Arena' columns are discarded.
    Ostat : pandas.DataFrame
        Opponent per-game stats; its 'Rk', 'G' and 'MP' columns are discarded.
    Tstat : pandas.DataFrame
        Team per-game stats; its 'Rk', 'G' and 'MP' columns are discarded.

    Returns
    -------
    pandas.DataFrame
        Left join of the three trimmed tables on 'Team', with 'Team' moved
        into the index. The input frames are not modified.
    """
    per_game_unused = ['Rk', 'G', 'MP']
    misc = Mstat.drop(columns=['Rk', 'Arena'])
    opponent = Ostat.drop(columns=per_game_unused)
    per_game = Tstat.drop(columns=per_game_unused)

    # Left joins keep every team listed in the miscellaneous table.
    merged = misc.merge(opponent, how='left', on='Team')
    merged = merged.merge(per_game, how='left', on='Team')
    return merged.set_index('Team')

# 定义函数：获取指定球队当前的Elo等级分；若该队尚无记录，则先赋予初始分数再返回。

In [None]:
def get_team_elo(team):
    """Return the current Elo rating of ``team``, seeding it on first lookup.

    Parameters
    ----------
    team : hashable
        Key identifying the team in the module-level ``team_elo_ratings`` dict.

    Returns
    -------
    The rating stored for ``team``. A team that has no entry yet is first
    registered in ``team_elo_ratings`` with ``base_elo_ratings``.
    """
    # setdefault only handles the "missing key" case; unlike the previous
    # bare ``except`` it does not hide unrelated errors.
    return team_elo_ratings.setdefault(team, base_elo_ratings)

# 定义函数：根据一场比赛的胜负结果，计算胜方与负方更新后的Elo等级分

In [None]:
def elo_rating(winner,loser):
    """Compute both teams' Elo ratings after a game that ``winner`` won.

    The ratings are read through ``get_team_elo`` (which seeds unknown teams
    with the base rating). This function does not store the new ratings; the
    caller is expected to write them back.

    Parameters
    ----------
    winner : hashable
        Key of the team that won the game.
    loser : hashable
        Key of the team that lost the game.

    Returns
    -------
    tuple
        ``(new_winner_rank, new_loser_rank)``, each rounded to an integer.
    """
    winner_rank = get_team_elo(winner)
    loser_rank = get_team_elo(loser)

    # Expected score of the winner: 1 / (1 + 10 ** (-(Rw - Rl) / 400)).
    rank_diff = winner_rank - loser_rank
    exp = -rank_diff / 400
    odds = 1 / (1 + math.pow(10, exp))

    # K shrinks as the winner's rating grows, so top teams move more slowly.
    if winner_rank < 2100:
        k = 32
    elif winner_rank < 2400:
        k = 24
    else:
        # The original assigned to ``c`` here, leaving ``k`` unbound for
        # ratings of 2400 and above.
        k = 16

    # Elo update: R' = R + K * (actual - expected).
    # The winner scored 1 against an expectation of ``odds``. The loser scored
    # 0 against an expectation of ``1 - odds`` (the two expectations sum to 1);
    # the original subtracted ``k * odds``, i.e. used the winner's expectation
    # for the loser.
    loser_expected = 1 - odds
    new_winner_rank = round(winner_rank + k * (1 - odds))
    new_loser_rank = round(loser_rank + k * (0 - loser_expected))
    return new_winner_rank, new_loser_rank

# 定义函数：遍历比赛结果，构建训练数据集（特征矩阵 x 与标签 y）

In [None]:
def dataset(all_data):
    """Build the training set (feature rows and labels) from past game results.

    Each game yields one sample made of the two teams' feature vectors
    concatenated. A feature vector is the team's Elo rating followed by every
    value of that team's row in the module-level ``team_stats`` table. After
    a game is turned into a sample, both teams' entries in
    ``team_elo_ratings`` are updated with the result of ``elo_rating``.

    Parameters
    ----------
    all_data : pandas.DataFrame
        One row per game, with the columns 'WTeam' (winner), 'LTeam' (loser)
        and 'WLoc' (read to decide which side gets the home bonus).

    Returns
    -------
    tuple
        ``(features, labels)``: ``features`` is the sample matrix passed
        through ``np.nan_to_num``; ``labels`` is a list where 0 means the
        winner's features come first in the row and 1 means the loser's do.

    Notes
    -----
    Samples are accumulated in local lists rather than appended to the
    module-level ``x``/``y``. The main cell rebinds ``x`` to the returned
    array, so appending to the global made a second run of that cell fail.
    ``team_elo_ratings`` is still updated in place, so a second call starts
    from the ratings left by the first.
    """
    print("Working on building data set.")
    features = []
    labels = []
    first_sample_shown = False
    for index, row in all_data.iterrows():
        Wteam = row['WTeam']
        Lteam = row['LTeam']

        # Ratings of both teams before this game is taken into account.
        team1_elo = get_team_elo(Wteam)
        team2_elo = get_team_elo(Lteam)

        # Home advantage: +100 Elo. Any 'WLoc' value other than 'H' gives the
        # bonus to the losing team.
        if row['WLoc'] == 'H':
            team1_elo += 100
        else:
            team2_elo += 100

        # The Elo rating is the first feature, followed by the team's stats.
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        team1_features = [team1_elo]
        team2_features = [team2_elo]
        for key, value in team_stats.loc[Wteam].items():
            team1_features.append(value)
        for key, value in team_stats.loc[Lteam].items():
            team2_features.append(value)

        # Randomise which side of the row the winner is on, so that both
        # labels appear in the training set.
        if random.random() > 0.5:
            features.append(team1_features + team2_features)
            labels.append(0)
        else:
            features.append(team2_features + team1_features)
            labels.append(1)

        # Show the first sample once as a sanity check.
        if not first_sample_shown:
            print('x', features)
            first_sample_shown = True

        # Update both teams' ratings with the outcome of this game.
        new_winner_rank, new_loser_rank = elo_rating(Wteam, Lteam)
        team_elo_ratings[Wteam] = new_winner_rank
        team_elo_ratings[Lteam] = new_loser_rank
    return np.nan_to_num(features), labels

# 定义函数：预测哪一个队伍会胜利

In [None]:
def who_win(teamA, teamB, model):
    """Predict the outcome probabilities for one game between two teams.

    The feature row has the same layout as the training samples: teamA's Elo
    rating and stats, followed by teamB's Elo rating (plus the 100-point home
    bonus) and stats.

    Parameters
    ----------
    teamA : hashable
        Visiting team; must be present in the index of ``team_stats``.
    teamB : hashable
        Home team; must be present in the index of ``team_stats``.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    The result of ``model.predict_proba`` for the single feature row.
    """
    features = []
    # Visiting team A: current Elo followed by its stats.
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    features.append(get_team_elo(teamA))
    for key, value in team_stats.loc[teamA].items():
        features.append(value)
    # Home team B: Elo with the home-court bonus, followed by its stats.
    features.append(get_team_elo(teamB) + 100)
    for key, value in team_stats.loc[teamB].items():
        features.append(value)
    # Replace NaNs the same way the training data was cleaned.
    features = np.nan_to_num(features)
    return model.predict_proba([features])

# 定义函数：绘制学习曲线

In [None]:
def plot_learning_curve():
    """Plot training and cross-validation scores of a linear SVC against training-set size.

    The data is a synthetic 5-class problem produced by
    ``make_classification``; the curve is computed by ``learning_curve`` with
    5-fold cross-validation at ten training sizes from 10% to 100%.

    NOTE(review): this figure is built from synthetic data and an SVC, not
    from the game samples or the logistic-regression model fitted in the main
    cell - confirm that this is intended.
    """
    # Seeded so that the generated data set, and therefore the figure, is the
    # same on every run.
    data_x, data_y = make_classification(
        n_samples=1400, n_classes=5, n_features=10, n_informative=8, random_state=0
    )

    # Compute the learning curve.
    model = SVC(kernel="linear")
    train_sizes, train_scores, valid_scores = learning_curve(
        model, data_x, data_y, train_sizes=np.linspace(0.1, 1.0, 10), cv=5, random_state=0
    )
    # Average the per-fold scores for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)

    # Draw both curves on one figure ("Traning" typo fixed in the labels).
    sns.set()
    sns.lineplot(x=train_sizes, y=train_scores_mean, label="Training score")
    sns.lineplot(x=train_sizes, y=valid_scores_mean, label="Cross-Validation score")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.title("Learning Curve")
    plt.show()

# main函数：使用该模型来判断一场新的比赛并返回其获胜的概率

In [None]:
if __name__ == '__main__':
    # Load the three 2018-2019 per-team statistics tables and merge them into
    # the module-level table read by dataset() and who_win().
    Mstat = pd.read_csv("2018-2019Miscellaneous Stats.csv")
    Ostat = pd.read_csv("2018-2019Opponent Per Game Stats.csv")
    Tstat = pd.read_csv("2018-2019Team Per Game Stats.csv")

    team_stats = get_raw_data(Mstat, Ostat, Tstat)
    result_data = pd.read_csv("2018-2019_result.csv")
    x, y = dataset(result_data)

    # Fit the logistic-regression model on the game samples.
    # print("Fitting on %d game Samples." % len(x))
    model = linear_model.LogisticRegression()
    model.fit(x,y)

    # Optional: estimate accuracy with 10-fold cross-validation.
    # (Kept disabled, as in the original, but as comments instead of a bare
    # string statement.)
    # print("Doing cross-validation.")
    # print(cross_val_score(model, x, y, cv = 10, scoring='accuracy', n_jobs = -1).mean())
    # print("predicting on new schedule")

    # Predict every game of the 2019-2020 schedule.
    schedule1920 = pd.read_csv("2019-2020_schedule.csv")
    result = []
    for index, row in schedule1920.iterrows():
        teamA = row["Vteam"]
        teamB = row["Hteam"]
        pred = who_win(teamA, teamB, model)
        # First column of predict_proba: probability of label 0, which in the
        # training set means "the team listed first (here teamA) won".
        prob = pred[0][0]
        if prob > 0.5:
            winner = teamA
            loser = teamB
            result.append([winner, loser, prob])
        else:
            winner = teamB
            loser = teamA
            result.append([winner, loser, 1 - prob])

    # newline='' is required by the csv module; without it the writer's own
    # line endings are translated again and blank rows appear on Windows.
    with open("19-20Result.csv", 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['win', 'lose', 'probability'])
        writer.writerows(result)
        print('done')

# 绘制曲线并将结果可视化

In [None]:
# Draw the learning-curve figure.
plot_learning_curve()
# Load the predictions written by the main cell; this must stay the last bare
# expression so that the notebook renders the frame as the cell output.
pd.read_csv('19-20Result.csv',header=0)