# Final Model Pipeline for Deployment

In [None]:
# dependencies

import copy
import datetime
import math
import os
import pickle
import re
from datetime import date, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
import shap
import xgboost as xgb
from bs4 import BeautifulSoup
from numpy import sort
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn.svm import SVC
from sklearn_pandas import DataFrameMapper
from xgboost import XGBClassifier

In [None]:
# use if you already have final_df.csv
def get_data(filepath, file_name):
    """Read a CSV file into a DataFrame.

    `filepath` and `file_name` are joined by plain concatenation, so
    `filepath` must already end with a path separator.
    """
    csv_path = filepath + file_name
    return pd.read_csv(csv_path)

# otherwise, use the following to scrape data
def get_days():
    """Return every Monday from 2015-10-01 through 2021-01-01.

    Returns:
        list[datetime.datetime]: the Mondays in ascending order, each one
        exactly once.
    """
    first = pd.Timestamp(year=2015, month=10, day=1)
    last = pd.Timestamp(year=2021, month=1, day=1)
    # One continuous range instead of one range per calendar year: the
    # per-year ranges include their end date, so a Monday that falls on
    # 1 January (2018-01-01) used to be returned twice.
    mondays = pd.date_range(start=first, end=last, freq='W-MON')
    return [monday.to_pydatetime() for monday in mondays]


# Month number -> lowercase English month name. rank_data uses this to build
# the month segment of the ranking page URL.
months = {
    1:'january',
    2:'february',
    3:'march',
    4:'april',
    5:'may',
    6:'june',
    7:'july',
    8:'august',
    9:'september',
    10:'october',
    11:'november',
    12:'december'
}

# Request headers passed to every requests.get call in this file; sets a
# browser-style User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

def rank_data(data_filepath=None):
    """Scrape the weekly hltv.org team ranking pages into a daily table.

    One ranking page is requested per Monday returned by get_days(). Each
    ranking is repeated for every day from that Monday up to the day before
    the next Monday (the most recent one runs up to today), so the result can
    be joined on an exact match date.

    Args:
        data_filepath: optional directory prefix (must end with a path
            separator). When given, or when a module-level `data_filepath`
            exists, the table is also written to `<data_filepath>rank_df.csv`.

    Returns:
        pandas.DataFrame with columns ['date', 'team', 'rank', 'player_names'];
        `rank` is an int and `player_names` is a list of nicknames.
    """
    days = get_days()
    rank_pattern = re.compile(r'\#(\d+)')  # position text looks like "#12"
    data = []
    for i, day in enumerate(days):
        next_day = days[i + 1] - datetime.timedelta(days=1) if i < len(days) - 1 else date.today()
        url = 'https://www.hltv.org/ranking/teams/' + \
            str(day.year) + '/' + \
            months[day.month] + '/' + str(day.day)

        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        # A page without ranking boxes (e.g. a failed request) simply
        # contributes no rows.
        team_ranks = soup.findAll("div", class_="ranked-team standard-box")
        date_range = pd.date_range(start=day, end=next_day)
        for team in team_ranks:
            # Stored as int so ranks can be compared and subtracted directly.
            rank = int(rank_pattern.match(team.find("span", class_="position").text).group(1))
            name = team.find("span", class_="name").text
            playernames = [player.text for player in team.findAll("div", class_="nick")]
            # have to emit every day of the week, otherwise only the Monday
            # itself would match a given match date
            for d in date_range:
                data.append([d, name, rank, playernames])

    df = pd.DataFrame(data=data, columns=['date', 'team', 'rank', 'player_names'])

    # The original body called rank_data() again at this point, which recursed
    # without end; the frame built above is what gets saved.
    if data_filepath is None:
        data_filepath = globals().get('data_filepath')
    if data_filepath is not None:
        df.to_csv(data_filepath + 'rank_df.csv', index=False)

    return df



# Player listing page requested by get_links; the query string asks for
# startDate=all, matchType=Lan and rankingFilter=Top30.
players = 'https://www.hltv.org/stats/players?startDate=all&matchType=Lan&rankingFilter=Top30'

def get_links():
    """Return a dict mapping each listed player's name to their stats URL.

    The `players` listing page is fetched and every row of its
    'stats-table player-ratings-table' table contributes one entry; the
    relative href of the row's player link is prefixed with
    'https://www.hltv.org'.
    """
    response = requests.get(players, headers=headers)
    page = BeautifulSoup(response.content, 'html.parser')
    table = page.find('table', class_='stats-table player-ratings-table')
    table_rows = table.find('tbody').findAll('tr')

    anchors = [table_row.find('td', class_='playerCol').find('a') for table_row in table_rows]
    return {anchor.text: 'https://www.hltv.org' + anchor['href'] for anchor in anchors}

def get_players(links, data_filepath=None):
    """Scrape per-match statistics for every player in `links`.

    For each player two pages are requested: the stats page (for the value in
    its second 'summaryStatBreakdownRow') and the matching '/players/matches'
    page, whose table yields one output row per match row.

    Args:
        links: dict mapping player name to stats page URL, as returned by
            get_links().
        data_filepath: optional directory prefix (must end with a path
            separator). When given, or when a module-level `data_filepath`
            exists, the table is also written to `<data_filepath>match_df.csv`.

    Returns:
        pandas.DataFrame with columns ['player', 'date', 'team', 'team_rounds',
        'opposing_team', 'opposing_team_rounds', 'map', 'kills', 'deaths',
        'differential', 'rating', 'avg_impact']. Round, kill and death counts
        are ints; the remaining scraped values are kept as page text.
    """
    # how score is formatted on this page (score = how many rounds team won vs lost)
    score_re = re.compile(r"\((\d+)\)")
    # how kill/death ratio is formatted on this page
    kd_re = re.compile(r"(\d+) - (\d+)")

    data = []
    for player, link in links.items():
        res = requests.get(link, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        impact = soup.findAll('div', class_='summaryStatBreakdownRow')[1].find(
            'div', class_='summaryStatBreakdownDataValue').text.strip()

        match_link = link.replace('/players', '/players/matches')
        res2 = requests.get(match_link, headers=headers)
        soup2 = BeautifulSoup(res2.content, 'html.parser')
        rows = soup2.find('table').find('tbody').findAll('tr')

        for row in rows:
            cells = row.findAll('td')
            match_date = cells[0].find('div', class_='time').text.strip()

            team_spans = cells[1].findAll('span')
            team = team_spans[0].text.strip()
            team_rounds = int(score_re.match(team_spans[1].text.strip()).group(1))

            opponent_spans = cells[2].findAll('span')
            opposing_team = opponent_spans[0].text.strip()
            opposing_team_rounds = int(score_re.match(opponent_spans[1].text.strip()).group(1))

            map_played = cells[3].text.strip()

            # Match the "kills - deaths" cell once and read both groups.
            kd_match = kd_re.match(cells[4].text.strip())
            kills = int(kd_match.group(1))
            deaths = int(kd_match.group(2))

            differential = cells[5].text.strip()
            rating = cells[6].text.strip()
            data.append([player, match_date, team, team_rounds, opposing_team,
                         opposing_team_rounds, map_played, kills, deaths,
                         differential, rating, impact])

    columns = ["player", "date", "team", "team_rounds", "opposing_team",
               "opposing_team_rounds", "map", "kills", "deaths", "differential", "rating", "avg_impact"]
    df = pd.DataFrame(data=data, columns=columns)

    # The original body re-ran get_links() and get_players() here, which
    # recursed without end; the frame built above is what gets saved.
    if data_filepath is None:
        data_filepath = globals().get('data_filepath')
    if data_filepath is not None:
        df.to_csv(data_filepath + 'match_df.csv', index=False)
    return df



def get_month():
    """Return the first day of every month from 2015-10-01 through 2021-01-01.

    Returns:
        list[datetime.datetime]: month starts in ascending order, each one
        exactly once.
    """
    first = pd.Timestamp(year=2015, month=10, day=1)
    last = pd.Timestamp(year=2021, month=1, day=1)
    # One continuous range instead of one range per calendar year: the
    # per-year ranges include their end date, so every 1 January from 2016 to
    # 2020 used to be returned twice.
    month_starts = pd.date_range(start=first, end=last, freq='MS')
    return [month_start.to_pydatetime() for month_start in month_starts]

def team_data(data_filepath=None):
    """Scrape the monthly team statistics table into a daily table.

    One stats page is requested per month start returned by get_month(),
    covering that month start up to the next one. Each team's values are
    repeated for every day of that month so the result can be joined on an
    exact match date.

    Args:
        data_filepath: optional directory prefix (must end with a path
            separator). When given, or when a module-level `data_filepath`
            exists, the table is also written to `<data_filepath>team_df.csv`.

    Returns:
        pandas.DataFrame with columns ['month', 'team', 'p_rounds_won',
        'opening_duels', 'multi_kills', 'team_5v4', 'team_4v5', 'team_traded',
        'utility_adr', 'utility_flash']; 'month' holds the individual day and
        the statistics are kept as page text.
    """
    month_starts = get_month()
    data = []
    for i, month in enumerate(month_starts):
        is_last = i == len(month_starts) - 1
        nextmonth = month if is_last else month_starts[i + 1]
        # Stop the daily expansion the day before the next month starts (as
        # rank_data does for weeks); including the next month's first day
        # produced two rows per team for every month boundary.
        last_day = month if is_last else nextmonth - datetime.timedelta(days=1)
        date_range = pd.date_range(start=month, end=last_day)
        if len(date_range) == 0:
            # Repeated month start in the schedule: nothing to emit, so skip
            # the request as well.
            continue

        url = 'https://www.hltv.org/stats/teams/ftu?startDate={}&endDate={}&rankingFilter=Top30'.format(month.date(), nextmonth.date())
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'html.parser')
        stats = soup.find('table').find('tbody').findAll('tr')
        for stat in stats:
            cells = stat.findAll('td')
            team = cells[0].find('a').text.strip()
            # cells[2]..cells[9] map, in order, onto the eight statistic columns
            values = [cells[k].text.strip() for k in range(2, 10)]
            for m in date_range:
                data.append([m, team] + values)

    columns = ['month', 'team', 'p_rounds_won', 'opening_duels', 'multi_kills',
               'team_5v4', 'team_4v5', 'team_traded', 'utility_adr', 'utility_flash']
    df = pd.DataFrame(data=data, columns=columns)

    # The original body called team_data() again at this point, which recursed
    # without end; the frame built above is what gets saved.
    if data_filepath is None:
        data_filepath = globals().get('data_filepath')
    if data_filepath is not None:
        df.to_csv(data_filepath + 'team_df.csv', index=False)
    return df


# helper functions
def get_rank(opponent, date, rankings=None):
    """Return a team's rank on a given date, or -1 when it is not listed.

    Args:
        opponent: team name to look up in the 'team' column.
        date: value compared for equality against the 'date' column.
        rankings: DataFrame with 'date', 'team' and 'rank' columns. When
            omitted, the module-level `ranking_df` is used, which keeps the
            original two-argument call working wherever that global exists.

    Returns:
        The 'rank' value of the first matching row, or -1 if no row matches.
    """
    if rankings is None:
        rankings = ranking_df
    matches = rankings[(rankings['date'] == date) & (rankings['team'] == opponent)]
    if matches.empty:
        return -1
    return matches['rank'].iloc[0]

# using https://www.desmos.com/calculator to model the weights
def rank_weight(row):
    """Return the performance weight for a row's 'rank_differential'.

    The weight is 1.5 below -2, 1 above 2, and in between follows the
    logistic curve 1.75 - 1 / (1 + exp(-0.5 * differential)), which is 1.25 at
    a differential of 0.

    Args:
        row: mapping or Series with a numeric 'rank_differential' entry.

    Returns:
        The weight as a number between 1 and 1.5.
    """
    # Requires the module-level `import math`; without it the logistic branch
    # raised NameError.
    differential = row['rank_differential']
    if differential < -2:
        return 1.5
    if differential > 2:
        return 1
    return 1.75 - (1 / (1 + math.exp(-0.5 * differential)))

def _percent_to_fraction(series, strip_dash=False):
    """Turn percentage text such as '55.3%' into a 0-1 fraction (blank -> NaN)."""
    if strip_dash:
        series = series.replace(to_replace=r'\-', value='', regex=True)
    series = series.replace(to_replace=r'\%', value='', regex=True)
    return pd.to_numeric(series) / 100


def _fill_with_team_mean(df, column):
    """Fill NaNs in `column` with the team's mean, then interpolate any leftovers."""
    # transform('mean') keeps the row index, so fillna lines up row by row;
    # a plain groupby mean is indexed by team name and would fill nothing.
    team_mean = df.groupby('team')[column].transform('mean')
    filled = df[column].fillna(team_mean)
    return filled.interpolate(method='linear', limit_direction='both')


def _attach_rank(df, ranking_df, team_column, rank_column):
    """Add `rank_column`: rank of `team_column` on the row's date, -1 if unlisted."""
    ranks = (ranking_df[['date', 'team', 'rank']]
             .drop_duplicates(subset=['date', 'team'])  # first listing wins
             .rename(columns={'team': team_column, 'rank': rank_column}))
    df = df.merge(ranks, on=['date', team_column], how='left')
    df[rank_column] = df[rank_column].fillna(-1).astype(int)
    return df


def prep_scraped_data(data_filepath=None):
    """Scrape rankings, player matches and team stats and build the model table.

    The three scraped tables are joined on date and team, text columns are
    converted to numbers, and the derived columns used for modelling are
    added: 'target' (1 when the player's team won more rounds), the team and
    opponent ranks, 'kdr', 'kpr', 'n_impact', 'performance',
    'rank_differential', 'rank_weight', 'w_performance', 'perf_resid' and
    'perf_resid_lag'.

    Args:
        data_filepath: optional directory prefix (must end with a path
            separator). When given, or when a module-level `data_filepath`
            exists, the table is also written to `<data_filepath>final_df.csv`.

    Returns:
        pandas.DataFrame with one row per player per match.
    """
    ranking_df = rank_data()
    player_links = get_links()
    match_df = get_players(player_links)
    team_df = team_data()

    ranking_df['date'] = pd.to_datetime(ranking_df.date)
    ranking_df['rank'] = pd.to_numeric(ranking_df['rank'])
    match_df['date'] = pd.to_datetime(match_df.date)
    team_df = team_df.rename(columns={'month': 'date'})
    team_df['date'] = pd.to_datetime(team_df.date)

    df = copy.deepcopy(match_df)
    df = df.merge(team_df, on=['date', 'team'])

    # Freshly scraped values are text; make them numeric before comparing or
    # dividing ('9' > '16' as strings).
    for column in ('team_rounds', 'opposing_team_rounds', 'kills', 'deaths', 'avg_impact'):
        df[column] = pd.to_numeric(df[column])
    df['rating'] = pd.to_numeric(df['rating'].replace(to_replace=r'\*', value='', regex=True))

    # Explicit 0/1 encoding: a label encoder would map True to 0 whenever the
    # data contained wins only.
    df['target'] = (df['team_rounds'] > df['opposing_team_rounds']).astype(int)

    for column in ('p_rounds_won', 'opening_duels', 'team_5v4', 'team_4v5'):
        df[column] = _percent_to_fraction(df[column])
    df['team_traded'] = _percent_to_fraction(df['team_traded'], strip_dash=True)
    for column in ('utility_adr', 'utility_flash'):
        df[column] = pd.to_numeric(df[column].replace(to_replace=r'\-', value='', regex=True))

    for column in ('team_traded', 'utility_adr', 'utility_flash'):
        df[column] = _fill_with_team_mean(df, column)

    # Ranks are looked up from the local ranking table with a join instead of
    # a per-row helper call that depended on a module-level `ranking_df`.
    df = _attach_rank(df, ranking_df, 'opposing_team', 'opposing_team_rank')
    df = _attach_rank(df, ranking_df, 'team', 'team_rank')

    # stats for each player per game; a player with zero deaths is divided by
    # 1 instead (the original tested the whole Series in an `if`, which raises)
    df['kdr'] = df['kills'] / df['deaths'].where(df['deaths'] != 0, 1)
    df['kpr'] = df['kills'] / (df['team_rounds'] + df['opposing_team_rounds'])

    df['n_impact'] = 0.5 * ((df['avg_impact']) ** 2 - (df['avg_impact'].mean()) ** 2)
    df['performance'] = df['rating'] + df['n_impact']

    # 31 marks a pairing where either side has no rank on that date.
    unranked = (df['team_rank'] == -1) | (df['opposing_team_rank'] == -1)
    df['rank_differential'] = np.where(
        unranked, 31, df['opposing_team_rank'] - df['team_rank'])

    df['rank_weight'] = df.apply(rank_weight, axis=1)

    # weighted performance
    df['w_performance'] = df['performance'] * df['rank_weight']

    # performance residual - accounted for impact and ranking of teams
    df['perf_resid'] = df['w_performance'] - df['rating']

    # Previous row's residual for the same player; a player's first row takes
    # the next available value.
    df['perf_resid_lag'] = df.groupby('player')['perf_resid'].shift(1)
    df['perf_resid_lag'] = df.groupby('player')['perf_resid_lag'].bfill()

    if data_filepath is None:
        data_filepath = globals().get('data_filepath')
    if data_filepath is not None:
        df.to_csv(data_filepath + 'final_df.csv', index=False)
    return df

In [None]:
def clean_data(data):
    """Turn the prepared match table into the frame used for modelling.

    Rows containing NaN or infinite values and rows for team 'Sprout' are
    dropped, each row gets its team's mean 'kdr' and 'kpr', and the model
    features are packed into one list per row.

    Args:
        data: DataFrame with at least 'date', 'team', 'opposing_team', 'map',
            'target' and the feature columns listed below. It is not modified.

    Returns:
        pandas.DataFrame with columns ['team', 'opponent', 'map', 'target',
        'X'], in shuffled row order. 'X' is a 14-item list ordered as: kdr,
        kpr, opening_duels, multi_kills, team_5v4, team_4v5, team_traded,
        utility_adr, utility_flash, performance, rank_differential,
        perf_resid, mean_team_kpr, mean_team_kdr.
    """
    feature_columns = [
        'kdr', 'kpr', 'opening_duels', 'multi_kills', 'team_5v4', 'team_4v5',
        'team_traded', 'utility_adr', 'utility_flash', 'performance',
        'rank_differential', 'perf_resid', 'mean_team_kpr', 'mean_team_kdr',
    ]

    data = data.copy()  # keep the caller's frame untouched
    data['date'] = pd.to_datetime(data['date'])
    data = data.replace([np.inf, -np.inf], np.nan).dropna()
    data = data[data['team'] != 'Sprout']

    # Select the columns before averaging so text columns are never averaged.
    team_means = (
        data.groupby('team')[['kdr', 'kpr']].mean()
        .rename(columns={'kdr': 'mean_team_kdr', 'kpr': 'mean_team_kpr'})
        .reset_index()
    )
    data = data.merge(team_means, on='team', how='left')

    # Features are taken by column name, so the last two entries of X really
    # are mean_team_kpr and mean_team_kdr; the earlier row-by-row rebuild
    # labelled those two columns the wrong way round.
    df = pd.DataFrame({
        'team': data['team'],
        'opponent': data['opposing_team'],
        'map': data['map'],
        'target': data['target'],
        'X': data[feature_columns].values.tolist(),
    })

    return df.sample(frac=1)

In [None]:
from itertools import chain

In [None]:
def flatten(listOfLists):
    """Lazily chain the items of each inner iterable, removing one nesting level."""
    flattened = chain.from_iterable(listOfLists)
    return flattened

def get_model(model, df, train_size):
    """Fit `model` on a train split of `df` and print its cross-validation score.

    The 'X' lists in `df` are expanded into one feature column each, and the
    data is split with `train_size` and a fixed random_state of 123. The
    printed score is the mean and standard deviation of a 10-fold
    cross-validation on the training part.

    Returns:
        (model, x_train, x_test, y_train, y_test)
    """
    features = pd.DataFrame(df['X'].tolist())
    labels = df['target']
    split = train_test_split(features, labels, train_size=train_size, random_state=123)
    x_train, x_test, y_train, y_test = split

    # specify model in final
    model.fit(x_train, y_train)

    train_score = cross_val_score(model, x_train, y_train, cv=10)
    score_mean = train_score.mean()
    score_std = train_score.std()
    print(f'Train Score:\t{score_mean}\nTScore STD:\t {score_std}')

    return model, x_train, x_test, y_train, y_test

def eval_model(model,x_test,y_test):
    """Predict on the test split and print accuracy and a classification report.

    Returns:
        (predictions, accuracy, classification_report_text)
    """
    predictions = model.predict(x_test)

    accuracy = metrics.accuracy_score(y_test, predictions)
    print("Accuracy: %f" % (accuracy))

    report = metrics.classification_report(y_test, predictions)
    print(f"Classification Report:\n{report}")

    return predictions, accuracy, report

def pickle_model(filepath='C:/Users/Tim/Desktop/lighthouse/w11,12 - final project/'):
    """Train a model and pickle it unless a stored model scores at least as well.

    A fresh model is trained with final_deploy() and scored on the hold-out
    split. If `<filepath>model.pickle` does not exist the new model is saved;
    otherwise the stored model is scored on the same split and only replaced
    when the new accuracy is strictly higher.

    Args:
        filepath: directory prefix (must end with a path separator) that holds
            'model.pickle'. Defaults to the path previously hard-coded here.
    """
    model, df = final_deploy()

    # Rebuild the hold-out split the same way get_model does (same frame,
    # train_size=0.7 as used by final_deploy, random_state=123) so both models
    # are scored on rows the new model was not fitted on.
    features = pd.DataFrame(df['X'].tolist())
    labels = df['target']
    _, x_test, _, y_test = train_test_split(
        features, labels, train_size=0.7, random_state=123)
    _, acc, _ = eval_model(model, x_test, y_test)

    model_filepath = filepath + 'model.pickle'

    if os.path.exists(model_filepath):
        # NOTE: unpickling runs arbitrary code; only load files this project
        # wrote itself.
        with open(model_filepath, 'rb') as stored:
            old_model = pickle.load(stored)
        _, old_acc, _ = eval_model(old_model, x_test, y_test)
        if acc <= old_acc:
            print('Accuracy not better than old model, use old model.')
            return

    with open(model_filepath, 'wb') as target:
        pickle.dump(model, target)

In [None]:
def final_deploy(filepath='C:/Users/Tim/Desktop/lighthouse/w11,12 - final project/',
                 file_name='final_df.csv'):
    """Load the prepared data, train the XGBoost classifier and evaluate it.

    Args:
        filepath: project directory prefix (must end with a path separator);
            the data file is read from its 'data/' sub-directory. Defaults to
            the path previously hard-coded here.
        file_name: name of the prepared CSV inside `<filepath>data/`.

    Returns:
        (model, df): the fitted classifier and the cleaned frame it was
        trained and evaluated on (70% train / 30% test).
    """
    # get data
    # data = prep_scraped_data()
    data_filepath = filepath + 'data/'
    data = get_data(data_filepath, file_name)

    # prep data
    df = clean_data(data)

    # model
    model = XGBClassifier(objective='binary:logistic',
                          eval_metric='logloss',
                          use_label_encoder=False,
                          booster='gbtree',
                          learning_rate=0.3,
                          colsample_bytree=1,
                          max_depth=10,
                          alpha=0,
                          n_estimators=100)

    model_, x_train, x_test, y_train, y_test = get_model(model, df, train_size=0.7)

    # eval model
    eval_model(model_, x_test, y_test)

    # don't use this if model from xgboost
    # pickle_model()

    return model_, df

In [None]:
# Example feature vector intended for get_prediction.
# NOTE(review): this list holds 13 values, while the comment inside
# get_prediction lists 14 features - confirm which value is missing.
sample = [0.515000, 0.870000, 0.757000, 0.313000, 0.206000, 18.500000, 0.230000, 1.261408, 3, 1.261408, 0.071408, 1.136444, 0.689899]

def get_prediction(team, map_, opposing_team, sample, df, model=None):
    """Predict the probability of class 1 (a win) for `team` in one matchup.

    The model input is `sample` minus the mean feature vector of the rows in
    `df` where `team` played `opposing_team` on `map_`.

    Args:
        team: value matched against df['team'].
        map_: value matched against df['map'].
        opposing_team: value matched against df['opponent'].
        sample: list of feature values in this order: kdr, kpr, opening_duels,
            multi_kills, team_5v4, team_4v5, team_traded, utility_adr,
            utility_flash, performance, rank_differential, perf_resid,
            mean_team_kpr, mean_team_kdr.
        df: frame with 'team', 'map', 'opponent' and 'X' columns, as returned
            by clean_data.
        model: optional fitted classifier exposing predict_proba. When omitted
            a new model is trained with final_deploy(), as before.

    Returns:
        numpy array with one element: the predicted probability of class 1.

    Raises:
        ValueError: if `df` has no row for this matchup, or if `sample` does
            not have one value per feature.
    """
    # One combined mask instead of three chained filters built from `df`.
    matchup = (df['team'] == team) & (df['map'] == map_) & (df['opponent'] == opposing_team)
    history = df.loc[matchup]
    if history.empty:
        # The mean of no rows is NaN, which would turn every input feature
        # into NaN and still return a number.
        raise ValueError(
            f'no rows for {team} vs {opposing_team} on {map_}; cannot build a baseline')
    baseline = np.array(history['X'].tolist()).mean(axis=0)

    features = np.asarray(sample, dtype=float)
    if features.shape != baseline.shape:
        raise ValueError(
            f'sample has {features.size} values but the model uses {baseline.size} features')

    # Inputs are checked first so a bad request does not trigger a retrain.
    if model is None:
        model, _ = final_deploy()

    X = np.subtract(features, baseline).reshape(1, -1)

    print(f'{team} vs {opposing_team} on {map_} prediction:')
    # predict_proba columns: [:, 0] = probability of class 0 (loss),
    # [:, 1] = probability of class 1
    return model.predict_proba(X)[:, 1]

In [None]:
# Train and evaluate once; the returned (model, df) tuple is not bound to any
# name in this cell.
final_deploy()

In [None]:
# Example prediction for one matchup.
# NOTE(review): this list holds 15 values, while the comment inside
# get_prediction lists 14 features - confirm the intended vector.
sample = [0.515000, 0.870000, 0.757000, 0.313000, 0.206000, 18.500000, 0.230000, 1.261408, 3, 1.261408, 0.071408, 1.136444, 0.689899, 2, 3]

# NOTE(review): `df` is not assigned at module scope in the cells visible
# here; it presumably has to come from the second value returned by
# final_deploy() - confirm.
get_prediction("Liquid", "inf", "Astralis", sample, df)

In [None]:
# Retrain, then score the raw `sample` vector directly (no baseline is
# subtracted here, unlike in get_prediction).
model,_ = final_deploy()

# Single-row 2-D array, the shape predict_proba is called with.
sample_array = np.array(sample).reshape(1,-1)

# NOTE(review): the model is fitted on the 14-value X lists built in
# clean_data, so `sample` needs exactly 14 values for this call - confirm.
model.predict_proba(sample_array)