In [2]:
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedKFold, cross_val_predict
import numpy as np

from xgboost import XGBRegressor

In [9]:
# Read the training features, the training targets and the IMDb metadata
# table. Paths are relative to the notebook's working directory.
train_x, train_y, imdb_info = map(
    pd.read_csv,
    (
        "../results/train_x.csv",
        "../results/train_y.csv",
        "../results/imdb_results-with-directors.csv",
    ),
)

In [10]:
# Attach the IMDb metadata for player 1's title. The left join keeps every
# training row even when its title has no match in imdb_info.
all_info = pd.merge(
    train_x,
    imdb_info,
    how="left",
    left_on="player_1_name",
    right_on="title",
)

In [11]:
# Join the IMDb metadata a second time, now keyed on player 2's title.
# Columns that exist on both sides of this join are disambiguated with
# "_player1" (left side) and "_player2" (right side).
all_info = pd.merge(
    all_info,
    imdb_info,
    how="left",
    left_on="player_2_name",
    right_on="title",
    suffixes=("_player1", "_player2"),
)

In [12]:
# Remove the name, title and id columns before the frame is handed to the
# model as its feature matrix.
all_info = all_info.drop(
    [
        "player_1_name",
        "player_2_name",
        "title_player1",
        "title_player2",
        "imdb_id_player1",
        "imdb_id_player2",
    ],
    axis="columns",
)

In [13]:
xgb_model = XGBRegressor(n_estimators=500)

# Seed the splitter: without random_state the repeated K-fold partitions are
# redrawn on every run, so the RMSE shown below would not be reproducible.
# n_splits=5 / n_repeats=10 are RepeatedKFold's defaults, spelled out so the
# evaluation scheme is visible here.
cv_splitter = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

# The "neg_mean_squared_error" scorer returns the MSE negated, one value per
# fold.
neg_mse_per_fold = cross_val_score(xgb_model,
                 X = all_info,
                 y = train_y["plus_minus"],
                 scoring="neg_mean_squared_error",
                 cv=cv_splitter
               )

# Flip the sign back, take the root of each fold's MSE, and average the
# per-fold RMSEs; as the last expression this is what the cell displays.
np.sqrt(-neg_mse_per_fold).mean()

19.01855675799774

In [14]:
# Out-of-fold predictions from 5-fold cross-validation: every training row
# gets a prediction from a model fitted on folds that exclude that row.
results = cross_val_predict(
    estimator=xgb_model,
    X=all_info,
    y=train_y["plus_minus"],
    cv=5,
)

In [15]:
# Keep the cross-validated predictions next to the true targets.
train_y = train_y.assign(predicted=results)

In [16]:
# Product of the two signs: +1 when prediction and target fall on the same
# side of zero, -1 when they fall on opposite sides, 0 when either is zero.
train_y = train_y.assign(
    correct=np.sign(train_y["plus_minus"]).mul(np.sign(train_y["predicted"]))
)

In [17]:
# Write out only the rows flagged -1, i.e. those where the predicted and
# actual plus_minus have opposite signs.
train_y[train_y["correct"].eq(-1)].to_csv("../results/mistakes1.3.csv", index=False)

In [18]:
# Save train_y in its current state (including any columns added to it by
# earlier cells) without writing the row index.
train_y.to_csv(
    path_or_buf="../results/gen_1.3_model.csv",
    index=False,
)

In [19]:
from eli5 import show_weights

In [21]:
# Fit on the complete training frame, then display the ten highest-weighted
# features of the fitted model.
xgb_model.fit(X=all_info, y=train_y["plus_minus"])

show_weights(xgb_model, top=10)

Weight,Feature
0.1157,cc.us_player1
0.0986,actor.Crispin Glover_player1
0.0862,director.Quentin Tarantino_player1
0.0827,player_2_seed
0.0574,genre.Thriller_player2
0.0557,lc.other_player1
0.0539,lc.es_player2
0.0483,votes_player1
0.0301,director.David Fincher_player2
0.0276,director.Ethan Coen_player2


In [25]:
# Read the test features and test targets from the results directory.
test_x, test_y = map(
    pd.read_csv,
    ("../results/test_x.csv", "../results/test_y.csv"),
)

In [26]:
# Build the test feature matrix in one chain: left-join the IMDb metadata for
# player 1, left-join it again for player 2 (colliding columns get the
# "_player1" / "_player2" suffixes), then drop the name, title and id columns.
test_info = (
    test_x
    .merge(imdb_info, how="left", left_on="player_1_name", right_on="title")
    .merge(
        imdb_info,
        how="left",
        left_on="player_2_name",
        right_on="title",
        suffixes=("_player1", "_player2"),
    )
    .drop(
        columns=[
            "player_1_name",
            "player_2_name",
            "title_player1",
            "title_player2",
            "imdb_id_player1",
            "imdb_id_player2",
        ]
    )
)

In [27]:
# Fit on the complete training frame; the fitted estimator is returned, so
# its repr is what this cell displays.
xgb_model.fit(X=all_info, y=train_y["plus_minus"])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [31]:
# Predict the target for every row of the test feature matrix using the
# current state of xgb_model.
predictions = xgb_model.predict(test_info)

In [32]:
# Put the predictions beside the true test targets for comparison.
test_y = test_y.assign(predictions=predictions)

In [33]:
# Show the test targets together with the predictions column for inspection.
test_y

Unnamed: 0,player_1_name,player_2_name,plus_minus,predictions
0,Toy Story,The Sixth Sense,50,42.460114
1,Fargo,Rushmore,44,39.029827
2,Eternal Sunshine of the Spotless Mind,The Umbrellas of Cherbourg,62,55.659367
3,Terminator 2: Judgment Day,The Lion King,22,28.387209
4,Magnolia,All About My Mother,58,53.537502
5,The Godfather: Part II,In America,82,67.279869
6,Goodfellas,Pulp Fiction,-20,11.777266
7,Fargo,Dead Man,82,73.167229
8,Goodfellas,Fargo,-10,-1.4199
9,The Godfather,Pulp Fiction,24,19.813684
