In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
matches = pd.read_csv('../data/train.csv')

In [3]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60242 entries, 0 to 60241
Data columns (total 91 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   match_id                            60242 non-null  int64  
 1   color                               60242 non-null  object 
 2   rank                                60242 non-null  object 
 3   map_code                            60242 non-null  object 
 4   duration                            60242 non-null  int64  
 5   car_name                            60242 non-null  object 
 6   possession_time                     60198 non-null  float64
 7   time_in_side                        60228 non-null  float64
 8   shots                               60242 non-null  int64  
 9   shots_against                       60242 non-null  int64  
 10  goals                               60242 non-null  int64  
 11  goals_against                       60242

**Goal:** Predict the rank based on the total shots.

In [4]:
matches_prepped = matches.groupby(['match_id', 'rank'])[['avg_speed', 'percent_supersonic_speed']].mean().reset_index()
matches_prepped.head()

Unnamed: 0,match_id,rank,avg_speed,percent_supersonic_speed
0,0,silver,1413.0,9.547984
1,1,gold,1346.0,8.922114
2,2,silver,1319.5,10.972618
3,3,platinum,1391.5,8.797042
4,4,platinum,1521.0,12.298239


In [5]:
matches.groupby(['rank'])['time_full_boost'].sum().reset_index()

Unnamed: 0,rank,time_full_boost
0,bronze,57108.73
1,champion,411851.57
2,diamond,517657.02
3,gold,520698.79
4,platinum,591736.32
5,silver,240704.46


In [6]:
matches['rank'].value_counts()

rank
platinum    14996
diamond     13832
gold        12504
champion    11756
silver       5680
bronze       1474
Name: count, dtype: int64

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import cross_val_score

class MultiColumnLabelEncoder(LabelEncoder):
    """
    Wraps sklearn LabelEncoder functionality for use on multiple columns of a
    pandas dataframe.

    """
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, dframe):
        """
        Fit label encoder to pandas columns.

        Access individual column classes via indexig `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`
        """
        # if columns are provided, iterate through and get `classes_`
        if self.columns is not None:
            # ndarray to hold LabelEncoder().classes_ for each
            # column; should match the shape of specified `columns`
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                            dtype=object)
            for idx, column in enumerate(self.columns):
                # fit LabelEncoder to get `classes_` for the column
                le = LabelEncoder()
                le.fit(dframe.loc[:, column].values)
                # append the `classes_` to our ndarray container
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                # append this column's encoder
                self.all_encoders_[idx] = le
        else:
            # no columns specified; assume all are to be encoded
            self.columns = dframe.iloc[:, :].columns
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                le.fit(dframe.loc[:, column].values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
        return self

    def fit_transform(self, dframe):
        """
        Fit label encoder and return encoded labels.

        Access individual column classes via indexing
        `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`

        Access individual column encoded labels via indexing
        `self.all_labels_`
        """
        # if columns are provided, iterate through and get `classes_`
        if self.columns is not None:
            # ndarray to hold LabelEncoder().classes_ for each
            # column; should match the shape of specified `columns`
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                            dtype=object)
            self.all_labels_ = np.ndarray(shape=self.columns.shape,
                                          dtype=object)
            for idx, column in enumerate(self.columns):
                # instantiate LabelEncoder
                le = LabelEncoder()
                # fit and transform labels in the column
                dframe.loc[:, column] =\
                    le.fit_transform(dframe.loc[:, column].values)
                # append the `classes_` to our ndarray container
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
                self.all_labels_[idx] = le
        else:
            # no columns specified; assume all are to be encoded
            self.columns = dframe.iloc[:, :].columns
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                dframe.loc[:, column] = le.fit_transform(
                        dframe.loc[:, column].values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
        return dframe.loc[:, self.columns].values

    def transform(self, dframe):
        """
        Transform labels to normalized encoding.
        """
        if self.columns is not None:
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[
                    idx].transform(dframe.loc[:, column].values)
        else:
            self.columns = dframe.iloc[:, :].columns
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[idx]\
                    .transform(dframe.loc[:, column].values)
        return dframe.loc[:, self.columns].values

    def inverse_transform(self, dframe):
        """
        Transform labels back to original encoding.
        """
        if self.columns is not None:
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[idx]\
                    .inverse_transform(dframe.loc[:, column].values)
        else:
            self.columns = dframe.iloc[:, :].columns
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[idx]\
                    .inverse_transform(dframe.loc[:, column].values)
        return dframe.loc[:, self.columns].values




# get `object` columns
X_object_columns = X.iloc[:, :].select_dtypes(include=['object']).columns
# df_copy_object_columns = X_copy.iloc[:, :].select_dtypes(include=['object']).columns

# instantiate `MultiColumnLabelEncoder`
mcle = MultiColumnLabelEncoder(columns=X_object_columns)

# fit to `df` data
mcle.fit(X)

# transform the `df` data
X_NumericOnly = mcle.transform(X)

X_NumericOnly

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = OneHotEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = OneHotEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [9]:
NumericColumns = matches.select_dtypes(include=[np.number]).columns.tolist()
NumericColumns_Without_MatchID = NumericColumns.pop(0)
NumericColumns

['duration',
 'possession_time',
 'time_in_side',
 'shots',
 'shots_against',
 'goals',
 'goals_against',
 'saves',
 'assists',
 'score',
 'shooting_percentage',
 'bpm',
 'bcpm',
 'avg_amount',
 'amount_collected',
 'amount_stolen',
 'amount_collected_big',
 'amount_stolen_big',
 'amount_collected_small',
 'amount_stolen_small',
 'count_collected_big',
 'count_stolen_big',
 'count_collected_small',
 'count_stolen_small',
 'amount_overfill',
 'amount_overfill_stolen',
 'amount_used_while_supersonic',
 'time_zero_boost',
 'percent_zero_boost',
 'time_full_boost',
 'percent_full_boost',
 'time_boost_0_25',
 'time_boost_25_50',
 'time_boost_50_75',
 'time_boost_75_100',
 'percent_boost_0_25',
 'percent_boost_25_50',
 'percent_boost_50_75',
 'percent_boost_75_100',
 'avg_speed',
 'total_distance',
 'time_supersonic_speed',
 'time_boost_speed',
 'time_slow_speed',
 'time_ground',
 'time_low_air',
 'time_high_air',
 'time_powerslide',
 'count_powerslide',
 'avg_powerslide_duration',
 'avg_spe

In [10]:
MatchAggregated_Matches = matches.groupby(['match_id', 'rank'])[NumericColumns].sum().reset_index()
MatchAggregated_Matches

Unnamed: 0,match_id,rank,duration,possession_time,time_in_side,shots,shots_against,goals,goals_against,saves,...,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,demos_inflicted,demos_taken
0,0,silver,326,82.46,139.64,6,6,5,5,1,...,126.313240,73.686768,144.689560,55.310446,193.838250,193.838250,193.838250,193.838250,1,1
1,1,gold,920,235.53,393.56,17,17,13,13,3,...,131.031125,68.968869,151.785120,48.214880,193.860950,193.860950,193.860950,193.860950,0,0
2,2,silver,188,43.25,81.11,3,3,3,3,0,...,131.124745,68.875258,144.769240,55.230766,187.390130,187.390130,187.390130,187.390130,0,0
3,3,platinum,686,229.37,301.88,12,12,8,8,3,...,125.121784,74.878216,152.124215,47.875778,195.873688,195.873688,195.873688,195.873688,2,2
4,4,platinum,732,260.79,330.97,16,16,7,7,5,...,120.835720,79.164277,151.986890,48.013109,196.647236,196.647236,196.647236,196.647236,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30116,30116,platinum,656,206.86,284.36,13,13,9,9,5,...,120.551075,79.448913,144.377590,55.622410,193.610440,193.610440,193.610440,193.610440,0,0
30117,30117,bronze,852,211.08,365.22,20,20,12,12,8,...,128.013235,71.986763,155.357760,44.642239,195.385700,195.385700,195.385700,195.385700,1,1
30118,30118,silver,780,249.92,344.39,14,14,9,9,5,...,119.381703,80.618304,142.307998,57.692005,196.069200,196.069200,196.069200,196.069200,1,1
30119,30119,silver,744,264.72,336.73,11,11,7,7,4,...,128.526040,71.473962,148.898055,51.101952,198.715385,198.715385,198.715385,198.715385,2,2


In [11]:
X = MatchAggregated_Matches.drop(columns = ['match_id', 'assists', 'shots_against']) #, 'map_code', 'car_name'])
y = MatchAggregated_Matches['rank']

In [12]:
X

Unnamed: 0,rank,duration,possession_time,time_in_side,shots,goals,goals_against,saves,score,shooting_percentage,...,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,demos_inflicted,demos_taken
0,silver,326,82.46,139.64,6,5,5,1,759,175.000000,...,126.313240,73.686768,144.689560,55.310446,193.838250,193.838250,193.838250,193.838250,1,1
1,gold,920,235.53,393.56,17,13,13,3,2355,155.714290,...,131.031125,68.968869,151.785120,48.214880,193.860950,193.860950,193.860950,193.860950,0,0
2,silver,188,43.25,81.11,3,3,3,0,469,100.000000,...,131.124745,68.875258,144.769240,55.230766,187.390130,187.390130,187.390130,187.390130,0,0
3,platinum,686,229.37,301.88,12,8,8,3,1599,111.111112,...,125.121784,74.878216,152.124215,47.875778,195.873688,195.873688,195.873688,195.873688,2,2
4,platinum,732,260.79,330.97,16,7,7,5,1909,87.500000,...,120.835720,79.164277,151.986890,48.013109,196.647236,196.647236,196.647236,196.647236,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30116,platinum,656,206.86,284.36,13,9,9,5,1794,120.000000,...,120.551075,79.448913,144.377590,55.622410,193.610440,193.610440,193.610440,193.610440,0,0
30117,bronze,852,211.08,365.22,20,12,12,8,2393,123.232324,...,128.013235,71.986763,155.357760,44.642239,195.385700,195.385700,195.385700,195.385700,1,1
30118,silver,780,249.92,344.39,14,9,9,5,1786,129.166664,...,119.381703,80.618304,142.307998,57.692005,196.069200,196.069200,196.069200,196.069200,1,1
30119,silver,744,264.72,336.73,11,7,7,4,1734,129.166664,...,128.526040,71.473962,148.898055,51.101952,198.715385,198.715385,198.715385,198.715385,2,2


In [13]:
y

0          silver
1            gold
2          silver
3        platinum
4        platinum
           ...   
30116    platinum
30117      bronze
30118      silver
30119      silver
30120    platinum
Name: rank, Length: 30121, dtype: object

In [14]:
X.describe()

Unnamed: 0,duration,possession_time,time_in_side,shots,goals,goals_against,saves,score,shooting_percentage,bpm,...,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,demos_inflicted,demos_taken
count,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,...,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0,30121.0
mean,675.070549,206.912483,295.18374,14.219349,8.560108,8.560108,4.330899,1769.317055,118.735062,803.716543,...,123.47235,76.52765,149.60888,50.39112,195.01099,195.01099,195.01099,195.01099,1.239003,1.239003
std,191.509433,63.849973,82.298856,5.107314,3.511116,3.511116,2.618905,598.571317,33.969061,101.07118,...,4.918667,4.918667,5.754766,5.754766,2.864088,2.864088,2.864088,2.864088,1.343723,1.343723
min,120.0,0.0,19.89,0.0,0.0,0.0,0.0,20.0,0.0,28.0,...,84.657612,18.4514,94.16526,18.388399,94.780597,94.780597,94.780597,94.780597,0.0,0.0
25%,588.0,177.72,258.24,11.0,6.0,6.0,2.0,1405.0,98.571428,749.0,...,120.328049,73.687727,146.128726,46.769396,194.0816,194.0816,194.0816,194.0816,0.0,0.0
50%,742.0,229.49,332.96,15.0,9.0,9.0,4.0,1828.0,118.181814,814.0,...,123.271144,76.728854,149.73615,50.263841,195.337731,195.337731,195.337731,195.337731,1.0,1.0
75%,804.0,251.34,349.78,18.0,11.0,11.0,6.0,2184.0,140.0,871.0,...,126.312286,79.671956,153.2306,53.871281,196.36016,196.36016,196.36016,196.36016,2.0,2.0
max,1786.0,514.46,724.29,64.0,60.0,60.0,20.0,7512.0,700.0,1186.0,...,181.5486,115.34239,181.61161,105.834743,225.567464,225.567464,225.567464,225.567464,33.0,33.0


In [None]:
scaler = StandardScaler()


le_y = LabelEncoder()

y_fitted = le_y.fit(y)
y_encoded = le_y.fit_transform(y)


X_NumericOnly = X.drop(columns = ['rank'])
X_NumericOnly_Fitted = scaler.fit(X_NumericOnly)
X_NumericOnly_Transformed = scaler.fit_transform(X_NumericOnly)

X_train, X_test, y_train, y_test = train_test_split(X_NumericOnly_Transformed, y_encoded, test_size = 0.003, random_state = 321, shuffle=True, stratify = y_encoded)


In [None]:

#RandomizedSearchCV

param_dist = {
    'max_depth': range(3, 10, 2),
    'min_child_weight':range(1, 6, 2),
    'learning_rate': stats.uniform(0.01, 0.1),
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'n_estimators':stats.randint(50, 200),
    'gamma':[i/10.0 for i in range(0, 5)],
   # 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
    
}



xgb_model = XGBClassifier()

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=20, cv=10, scoring='accuracy')

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)






KeyboardInterrupt: 

In [16]:
xgb = XGBClassifier(learning_rate = 0.10506592104909242,
                    colsample_bytree = 0.8,
                    gamma = 0.2,
                    max_depth = 5,
                    min_child_weight = 3,
                    reg_alpha = 0.01,
                    n_estimators = 179,
                    subsample = 0.9,
                    eval_metric='mlogloss'
                                        
                    ).fit(X_train, y_train)

In [17]:
y_pred = xgb.predict(X_test)

In [18]:
le_y

In [19]:
y_fitted

In [20]:
y_encoded

array([5, 3, 5, ..., 5, 5, 4], shape=(30121,))

In [21]:
y

0          silver
1            gold
2          silver
3        platinum
4        platinum
           ...   
30116    platinum
30117      bronze
30118      silver
30119      silver
30120    platinum
Name: rank, Length: 30121, dtype: object

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [23]:
accuracy_score(y_test, y_pred)

0.5495435684647303

In [24]:
confusion_matrix(y_test, y_pred)

array([[ 56,   0,   0,  17,   6,  68],
       [  0, 791, 343,   2,  39,   1],
       [  1, 311, 710,  32, 328,   1],
       [ 10,   5,  49, 699, 360, 128],
       [  4,  44, 358, 305, 772,  17],
       [ 38,   0,   1, 218,  28, 283]])

In [25]:
print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.51      0.38      0.44       147
           1       0.69      0.67      0.68      1176
           2       0.49      0.51      0.50      1383
           3       0.55      0.56      0.55      1251
           4       0.50      0.51      0.51      1500
           5       0.57      0.50      0.53       568

    accuracy                           0.55      6025
   macro avg       0.55      0.52      0.54      6025
weighted avg       0.55      0.55      0.55      6025



In [26]:
scores = cross_val_score(xgb, X_NumericOnly, y_encoded, cv=10, scoring='accuracy')

In [27]:
print('Cross Validation Accuracy Scores:', scores)

Cross Validation Accuracy Scores: [0.56588118 0.54913679 0.55179283 0.56075697 0.53585657 0.56440903
 0.5498008  0.53950863 0.53353254 0.54216467]


Now that I'm happy with the results, I want to refit the model on all of the available training data.

In [28]:
test = pd.read_csv('../data/test.csv')

In [29]:
test

Unnamed: 0,match_id,color,map_code,duration,car_name,possession_time,time_in_side,shots,shots_against,goals,...,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,demos_inflicted,demos_taken
0,30121,blue,utopiastadium_p,400,Octane,105.06,194.75,8,9,6,...,66.643970,33.356037,75.814224,24.185778,97.731575,97.731575,97.731575,97.731575,0,1
1,30121,orange,utopiastadium_p,400,Fennec,137.27,149.39,9,8,5,...,60.714287,39.285713,75.197850,24.802143,97.676710,97.676710,97.676710,97.676710,1,0
2,30122,blue,cs_day_p,355,Octane,109.79,169.13,3,5,3,...,62.784500,37.215504,74.585790,25.414206,98.727340,98.727340,98.727340,98.727340,1,2
3,30122,orange,cs_day_p,355,Octane,142.90,160.31,5,3,2,...,55.250595,44.749400,71.672920,28.327084,98.757010,98.757010,98.757010,98.757010,2,1
4,30123,blue,wasteland_s_p,206,Fennec,68.06,79.30,2,7,1,...,63.022804,36.977196,67.308720,32.691280,96.396400,96.396400,96.396400,96.396400,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,32618,orange,utopiastadium_p,309,Octane,104.96,171.93,4,7,2,...,68.542650,31.457352,72.559784,27.440212,97.214035,97.214035,97.214035,97.214035,2,0
4996,32619,blue,cs_day_p,395,Octane,129.85,188.64,4,10,3,...,68.351326,31.648680,68.960846,31.039158,98.099884,98.099884,98.099884,98.099884,1,1
4997,32619,orange,cs_day_p,395,Fennec,118.85,160.41,10,4,6,...,55.080130,44.919870,78.544044,21.455957,98.045120,98.045120,98.045120,98.045120,1,1
4998,32620,blue,chn_stadium_p,361,Dominus,165.17,150.74,2,7,2,...,60.910637,39.089360,76.314390,23.685608,98.478615,98.478615,98.478615,98.478615,1,1


In [30]:
FreshTest_MatchAggregated_Matches = test.groupby(['match_id'])[NumericColumns].sum().reset_index()
FreshTest_MatchAggregated_Matches

Unnamed: 0,match_id,duration,possession_time,time_in_side,shots,shots_against,goals,goals_against,saves,assists,...,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,demos_inflicted,demos_taken
0,30121,800,242.33,344.14,17,17,11,11,4,0,...,127.358257,72.641750,151.012074,48.987921,195.408285,195.408285,195.408285,195.408285,1,1
1,30122,710,252.69,329.44,8,8,5,5,3,0,...,118.035095,81.964904,146.258710,53.741290,197.484350,197.484350,197.484350,197.484350,3,3
2,30123,412,133.79,178.87,9,9,6,6,2,0,...,119.691410,80.308586,147.550420,52.449579,192.739810,192.739810,192.739810,192.739810,0,0
3,30124,184,49.29,81.69,5,5,3,3,2,0,...,121.093815,78.906189,160.284490,39.715519,192.569590,192.569590,192.569590,192.569590,0,0
4,30125,654,202.73,281.47,10,10,9,9,2,0,...,122.944644,77.055346,148.812370,51.187633,195.210420,195.210420,195.210420,195.210420,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,32616,686,269.93,318.31,11,11,5,5,5,0,...,121.332752,78.667233,151.086150,48.913858,197.398700,197.398700,197.398700,197.398700,1,1
2496,32617,242,49.58,92.88,6,6,6,6,0,0,...,130.336984,69.663008,151.244784,48.755222,186.102780,186.102780,186.102780,186.102780,0,0
2497,32618,618,211.11,279.42,11,11,7,7,4,0,...,121.769520,78.230477,150.480094,49.519893,194.428070,194.428070,194.428070,194.428070,2,2
2498,32619,790,248.70,349.05,14,14,9,9,5,0,...,123.431456,76.568550,147.504890,52.495115,196.145004,196.145004,196.145004,196.145004,2,2


In [31]:
X_fresh_test = FreshTest_MatchAggregated_Matches.drop(columns = ['match_id', 'assists', 'shots_against'])
X_fresh_test


Unnamed: 0,duration,possession_time,time_in_side,shots,goals,goals_against,saves,score,shooting_percentage,bpm,...,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,demos_inflicted,demos_taken
0,800,242.33,344.14,17,11,11,4,2096,130.555557,860,...,127.358257,72.641750,151.012074,48.987921,195.408285,195.408285,195.408285,195.408285,1,1
1,710,252.69,329.44,8,5,5,3,1282,140.000000,726,...,118.035095,81.964904,146.258710,53.741290,197.484350,197.484350,197.484350,197.484350,3,3
2,412,133.79,178.87,9,6,6,2,1180,121.428570,809,...,119.691410,80.308586,147.550420,52.449579,192.739810,192.739810,192.739810,192.739810,0,0
3,184,49.29,81.69,5,3,3,2,652,100.000000,806,...,121.093815,78.906189,160.284490,39.715519,192.569590,192.569590,192.569590,192.569590,0,0
4,654,202.73,281.47,10,9,9,2,1880,166.666664,854,...,122.944644,77.055346,148.812370,51.187633,195.210420,195.210420,195.210420,195.210420,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,686,269.93,318.31,11,5,5,5,1458,83.333332,746,...,121.332752,78.667233,151.086150,48.913858,197.398700,197.398700,197.398700,197.398700,1,1
2496,242,49.58,92.88,6,6,6,0,804,100.000000,800,...,130.336984,69.663008,151.244784,48.755222,186.102780,186.102780,186.102780,186.102780,0,0
2497,618,211.11,279.42,11,7,7,4,1594,121.428570,981,...,121.769520,78.230477,150.480094,49.519893,194.428070,194.428070,194.428070,194.428070,2,2
2498,790,248.70,349.05,14,9,9,5,1938,135.000000,797,...,123.431456,76.568550,147.504890,52.495115,196.145004,196.145004,196.145004,196.145004,2,2


In [32]:
y

0          silver
1            gold
2          silver
3        platinum
4        platinum
           ...   
30116    platinum
30117      bronze
30118      silver
30119      silver
30120    platinum
Name: rank, Length: 30121, dtype: object

In [33]:
y_encoded.shape

(30121,)

In [34]:
X_fresh_test.shape

(2500, 83)

In [35]:
X_FreshTest_Transformed = X_NumericOnly_Fitted.transform(X_fresh_test)
X_FreshTest_Transformed

array([[ 0.65235181,  0.55470822,  0.59486942, ...,  0.13871854,
        -0.17786899, -0.17786899],
       [ 0.18239327,  0.71696626,  0.41624915, ...,  0.86359141,
         1.31055767,  1.31055767],
       [-1.37369169, -1.14524241, -1.41333278, ..., -0.79299851,
        -0.92208232, -0.92208232],
       ...,
       [-0.2980088 ,  0.0657414 , -0.19154582, ..., -0.20353051,
         0.56634434,  0.56634434],
       [ 0.60013419,  0.65447499,  0.65453102, ...,  0.39594922,
         0.56634434,  0.56634434],
       [ 0.24505441,  1.1465742 ,  0.42888623, ...,  0.6898002 ,
         0.56634434,  0.56634434]], shape=(2500, 83))

In [36]:
y_pred_fresh_test = xgb.predict(X_FreshTest_Transformed)

In [37]:
y_pred_fresh_test

array([2, 3, 4, ..., 1, 3, 4], shape=(2500,))

In [38]:
# remapping/reassigning numbers to ranks, in order to be consistent with the way that Michael assigned/mapped the ranks

converter = { 0:1, 5:2, 3:3, 4:4, 2:5, 1:6 }

y_pred_fresh_test = pd.Series(y_pred_fresh_test).map(converter)

In [39]:
y_pred_fresh_test

0       5
1       3
2       4
3       5
4       5
       ..
2495    4
2496    6
2497    6
2498    3
2499    4
Length: 2500, dtype: int64

In [40]:
# converter = { 'bronze': 1, 'silver': 2, 'gold': 3, 'platinum': 4, 'diamond': 5, 'champion': 6 }

# y_pred = pd.Series(y_pred).map(converter)

In [41]:
y_pred_fresh_test_df = pd.DataFrame(y_pred_fresh_test, columns = ['rank'])
y_pred_fresh_test_df

Unnamed: 0,rank
0,5
1,3
2,4
3,5
4,5
...,...
2495,4
2496,6
2497,6
2498,3


In [42]:
submission_AlmostReady = pd.concat([FreshTest_MatchAggregated_Matches['match_id'], y_pred_fresh_test_df], axis = 1).rename(columns = {0: 'rank'})
submission_AlmostReady

Unnamed: 0,match_id,rank
0,30121,5
1,30122,3
2,30123,4
3,30124,5
4,30125,5
...,...,...
2495,32616,4
2496,32617,6
2497,32618,6
2498,32619,3


In [43]:
NaN_count = submission_AlmostReady.isna().sum()
NaN_count

match_id    0
rank        0
dtype: int64

In [44]:
mode_value = submission_AlmostReady['rank'].mode()[0]
submission = submission_AlmostReady.fillna(mode_value).astype(int)
submission

Unnamed: 0,match_id,rank
0,30121,5
1,30122,3
2,30123,4
3,30124,5
4,30125,5
...,...,...
2495,32616,4
2496,32617,6
2497,32618,6
2498,32619,3


In [45]:
submission.to_csv('../data/submission_RandSearchCV.csv', index = False)

Now that I'm happy with the results, I want to refit the model on all of the available training data.

In [46]:
logreg = LogisticRegression().fit(X, y)

ValueError: could not convert string to float: 'silver'

In [None]:
test = pd.read_csv('../data/test.csv')

In [None]:
test = test.groupby('match_id')['shots'].sum().reset_index()

In [None]:
y_pred = logreg.predict(test[variables])

KeyError: "None of [Index(['avg_speed', 'percent_supersonic_speed'], dtype='object')] are in the [columns]"