In [1]:
from tpot import TPOTRegressor
import pandas as pd
from sklearn.model_selection import RepeatedKFold
import pickle

In [2]:
matches = pd.read_csv("../results/all-matches-with-seeds-final.csv")

In [5]:
film2019_x = matches.loc[matches["match.scores_csv"].isna(),
            ["player_1_name", "player_1_seed", "player_2_name", "player_2_seed"]]

In [3]:
train_x = pd.read_csv("../results/train_x.csv")
train_y = pd.read_csv("../results/train_y.csv")
imdb_info = pd.read_csv("../results/imdb_results-with-directors.csv")
test_x = pd.read_csv("../results/test_x.csv")
test_y = pd.read_csv("../results/test_y.csv")

In [18]:
all_x = pd.concat([train_x, test_x])
all_y = pd.concat([train_y, test_y])

In [19]:
all_info = all_x.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
all_info = all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
all_info = all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

In [26]:
tpot_model = TPOTRegressor(scoring="neg_mean_squared_error",
              cv=RepeatedKFold(n_splits=5, n_repeats=5),
              random_state=0,
              n_jobs=1,
#               memory = "cache",
              periodic_checkpoint_folder = "checkpoints",
              early_stop = 10,
              verbosity=1
        )

In [27]:
tpot_model.fit(all_info, all_y["plus_minus"])


The optimized pipeline was not improved after evaluating 10 more generations. Will end the optimization process.

TPOT closed prematurely. Will use the current best pipeline.
Best pipeline: RidgeCV(OneHotEncoder(ZeroCount(SelectPercentile(input_matrix, percentile=45)), minimum_fraction=0.15, sparse=False, threshold=10))


TPOTRegressor(config_dict=None, crossover_rate=0.1,
       cv=<sklearn.model_selection._split.RepeatedKFold object at 0x7fe862193b00>,
       disable_update_check=False, early_stop=10, generations=100,
       max_eval_time_mins=5, max_time_mins=None, memory=None,
       mutation_rate=0.9, n_jobs=1, offspring_size=None,
       periodic_checkpoint_folder='checkpoints', population_size=100,
       random_state=0, scoring='neg_mean_squared_error', subsample=1.0,
       use_dask=False, verbosity=1, warm_start=False)

In [44]:
with open("../results/tpot.pkl", "wb+") as tpot_file:
    pickle.dump(tpot_file, tpot_model)

TypeError: file must have a 'write' attribute

In [54]:
pareto_models = list(tpot_model.pareto_front_fitted_pipelines_.values())

In [56]:
with open("../results/pareto.pkl", "wb+") as tpot_file:
    pickle.dump(pareto_models, tpot_file)

In [6]:
film2019_all_info = film2019_x.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
film2019_all_info = film2019_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
film2019_all_info = film2019_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

In [None]:
with open("../results/pareto.pkl", "rb+") as tpot_file:
    pareto_models = pickle.dump(pareto_models, tpot_file)

In [39]:
film2019_x["predictions"] = tpot_model.predict(film2019_all_info)

In [42]:
film2019_x["Toss-Up"] = abs(film2019_x["predictions"]) < 14.7

In [43]:
film2019_x["Toss-Up"]

127    False
128     True
129    False
130    False
131    False
132    False
133    False
134    False
135    False
136     True
137    False
138    False
139    False
140    False
141    False
142    False
143    False
144     True
145    False
146    False
147    False
148    False
149    False
150    False
151    False
152     True
153    False
154    False
155    False
156    False
157    False
158    False
159    False
160    False
161    False
162    False
163    False
164    False
165    False
166    False
167    False
168    False
169    False
170    False
171    False
Name: Toss-Up, dtype: bool

In [59]:
film2019_x["pareto0"] = pareto_models[0].predict(film2019_all_info)
film2019_x["pareto1"] = pareto_models[1].predict(film2019_all_info)
film2019_x["pareto2"] = pareto_models[2].predict(film2019_all_info)
film2019_x["pareto3"] = pareto_models[3].predict(film2019_all_info)

In [61]:
film2019_x["ensembled_predictions"] = (film2019_x["pareto0"] + film2019_x["pareto1"] + film2019_x["pareto2"] + film2019_x["pareto3"])/4

In [62]:
import numpy as np
film2019_x["agreement"] = np.sign(film2019_x["predictions"]) * np.sign(film2019_x["ensembled_predictions"])

In [66]:
film2019_x.to_csv("../results/2019_predictions.csv", index=False, float_format='%.0f')

In [68]:
all_y["pareto0"] = pareto_models[0].predict(all_info)
all_y["pareto1"] = pareto_models[1].predict(all_info)
all_y["pareto2"] = pareto_models[2].predict(all_info)
all_y["pareto3"] = pareto_models[3].predict(all_info)

In [70]:
all_y["ensembled_predictions"] = (all_y["pareto0"] + all_y["pareto1"] + all_y["pareto2"] + all_y["pareto3"])/4

In [72]:
all_y["best_model_correct"] = np.sign(all_y["plus_minus"]) * np.sign(all_y["pareto3"])
all_y["ensemble_correct"] = np.sign(all_y["plus_minus"]) * np.sign(all_y["ensembled_predictions"])

In [74]:
all_y.to_csv("../results/previous_year_predictions.csv", index=False, float_format='%.0f')

In [75]:
all_y[all_y["ensemble_correct"] == -1]

Unnamed: 0,player_1_name,player_2_name,plus_minus,pareto0,pareto1,pareto2,pareto3,ensembled_predictions,best_model_correct,ensemble_correct
5,Taxi Driver,Casablanca,-6,-1.026298,0.712597,1.014892,0.234874,0.234016,-1.0,-1.0
12,Pulp Fiction,Fargo,-8,10.213324,9.438451,9.112665,9.666807,9.607812,-1.0,-1.0
15,Apocalypse Now,The Empire Strikes Back,-6,-0.662841,0.663629,0.304169,2.109313,0.603568,-1.0,-1.0
65,The Shining,Eternal Sunshine of the Spotless Mind,-2,8.391208,8.977938,8.881725,15.99788,10.562188,-1.0,-1.0
75,Magnolia,Being John Malkovich,-8,1.723542,4.133521,4.570119,11.037917,5.366275,-1.0,-1.0
85,Grave of the Fireflies,Rio Bravo,-4,9.180879,8.944466,9.07981,12.063495,9.817163,-1.0,-1.0
91,Heat,Dazed and Confused,-6,4.112505,2.786394,3.099251,6.057177,4.013832,-1.0,-1.0


In [97]:
from eli5.sklearn import PermutationImportance
from eli5 import show_weights

In [93]:
select_percentile_supports = pareto_models[3].named_steps['selectpercentile'].get_support()

In [99]:
perm_imp = PermutationImportance(pareto_models[3]).fit(all_info, all_y["plus_minus"])

In [105]:
show_weights(perm_imp, feature_names = all_info.columns.tolist())

Weight,Feature
0.6241  ± 0.2297,player_2_seed
0.2035  ± 0.0923,player_1_seed
0.0649  ± 0.0088,votes_player1
0.0606  ± 0.0290,votes_player2
0.0291  ± 0.0141,director.other_player2
0.0254  ± 0.0121,mpaa.PG_player2
0.0209  ± 0.0102,genre.Drama_player2
0.0180  ± 0.0112,genre.Action_player2
0.0126  ± 0.0061,mpaa.PG_player1
0.0122  ± 0.0048,lc.fr_player1


In [106]:
## Round 2
round_2 = pd.read_csv("../results/2019_predictions_round_2.csv")

round_2_all_info = round_2.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
round_2_all_info = round_2_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
round_2_all_info = round_2_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

In [107]:
round_2["pareto0"] = pareto_models[0].predict(round_2_all_info)
round_2["pareto1"] = pareto_models[1].predict(round_2_all_info)
round_2["pareto2"] = pareto_models[2].predict(round_2_all_info)
round_2["pareto3"] = pareto_models[3].predict(round_2_all_info)
round_2["ensembled_predictions"] = (round_2["pareto0"] + round_2["pareto1"] + round_2["pareto2"] + round_2["pareto3"])/4

In [110]:
round_2["Toss-Up"] = abs(round_2["ensembled_predictions"]) < 14.7

In [111]:
round_2.to_csv("../results/2019_predictions_round_2.csv", index=False, float_format='%.0f')

In [112]:
round_3 = pd.read_csv("../results/2019_round_3.csv")

round_3_all_info = round_3.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
round_3_all_info = round_3_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
round_3_all_info = round_3_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

round_3["pareto0"] = pareto_models[0].predict(round_3_all_info)
round_3["pareto1"] = pareto_models[1].predict(round_3_all_info)
round_3["pareto2"] = pareto_models[2].predict(round_3_all_info)
round_3["pareto3"] = pareto_models[3].predict(round_3_all_info)
round_3["ensembled_predictions"] = (round_3["pareto0"] + round_3["pareto1"] + round_3["pareto2"] + round_3["pareto3"])/4

round_3["Toss-Up"] = abs(round_3["ensembled_predictions"]) < 14.7
round_3.to_csv("../results/2019_predictions_round_3.csv", index=False, float_format='%.0f')

In [113]:
round_3 = pd.read_csv("../results/2019_round_3.csv")

round_3_all_info = round_3.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
round_3_all_info = round_3_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
round_3_all_info = round_3_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

round_3["pareto0"] = pareto_models[0].predict(round_3_all_info)
round_3["pareto1"] = pareto_models[1].predict(round_3_all_info)
round_3["pareto2"] = pareto_models[2].predict(round_3_all_info)
round_3["pareto3"] = pareto_models[3].predict(round_3_all_info)
round_3["ensembled_predictions"] = (round_3["pareto0"] + round_3["pareto1"] + round_3["pareto2"] + round_3["pareto3"])/4

round_3["Toss-Up"] = abs(round_3["ensembled_predictions"]) < 14.7
round_3.to_csv("../results/2019_predictions_round_3.csv", index=False, float_format='%.0f')

Unnamed: 0,player_1_name,player_1_seed,player_2_name,player_2_seed,pareto0,pareto1,pareto2,pareto3,ensembled_predictions,Toss-Up
0,No Country For Old Men,1,Kill Bill Vol. 1,17,18.949595,17.638617,17.524843,11.220349,16.333351,False
1,The Royal Tenenbaums,8,Spirited Away,9,9.295562,2.436586,2.388878,-2.157629,2.990849,True
2,Eternal Sunshine of the Spotless Mind,4,Lost In Translation,13,26.664353,29.988992,29.136771,29.116208,28.726581,False
3,Mulholland Dr.,5,Before Sunset,12,8.966503,10.672506,10.577727,11.733164,10.487475,True
4,There Will Be Blood,2,Memento,18,-1.750202,0.197424,0.499612,3.849585,0.699105,True
5,Zodiac,7,Children of Men,10,-13.630091,-11.278088,-11.782376,3.470799,-8.304939,True
6,The Dark Knight,3,WALL-E,14,25.299139,25.988017,25.989103,23.961065,25.309331,False
7,Inglourious Basterds,6,Fellowship of the Ring,11,6.828371,8.198995,11.515569,14.95082,10.373439,True
8,Kill Bill Vol. 1,17,Mulholland Dr.,5,-4.084697,-4.679351,-4.626143,-1.302773,-3.673241,True
9,No Country For Old Men,1,Children of Men,10,11.172905,12.761969,12.164131,15.960578,13.014896,True


In [114]:
round_4 = pd.read_csv("../results/2019_round_4.csv")

round_4_all_info = round_4.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
round_4_all_info = round_4_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
round_4_all_info = round_4_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

round_4["pareto0"] = pareto_models[0].predict(round_4_all_info)
round_4["pareto1"] = pareto_models[1].predict(round_4_all_info)
round_4["pareto2"] = pareto_models[2].predict(round_4_all_info)
round_4["pareto3"] = pareto_models[3].predict(round_4_all_info)
round_4["ensembled_predictions"] = (round_4["pareto0"] + round_4["pareto1"] + round_4["pareto2"] + round_4["pareto3"])/4

round_4["Toss-Up"] = abs(round_4["ensembled_predictions"]) < 14.7
round_4.to_csv("../results/2019_predictions_round_4.csv", index=False, float_format='%.0f')

In [115]:
round_4

Unnamed: 0,player_1_name,player_1_seed,player_2_name,player_2_seed,pareto0,pareto1,pareto2,pareto3,ensembled_predictions,Toss-Up
0,No Country For Old Men,1,The Royal Tenenbaums,8,21.49973,19.36032,18.982001,9.598237,17.360072,False
1,Eternal Sunshine of the Spotless Mind,4,Mulholland Dr.,5,12.747414,12.260875,11.411822,13.325238,12.436337,True
2,There Will Be Blood,2,Children of Men,10,-9.370852,-5.491537,-5.763911,7.621564,-3.251184,True
3,The Dark Knight,3,Inglourious Basterds,6,29.505562,31.280863,30.706717,27.950144,29.860821,False
4,Mulholland Dr.,5,No Country For Old Men,1,-25.905748,-25.690023,-26.118878,-19.652311,-24.34174,False


In [116]:
round_5 = pd.read_csv("../results/2019_round_5.csv")

round_5_all_info = round_5.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
round_5_all_info = round_5_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
round_5_all_info = round_5_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

round_5["pareto0"] = pareto_models[0].predict(round_5_all_info)
round_5["pareto1"] = pareto_models[1].predict(round_5_all_info)
round_5["pareto2"] = pareto_models[2].predict(round_5_all_info)
round_5["pareto3"] = pareto_models[3].predict(round_5_all_info)
round_5["ensembled_predictions"] = (round_5["pareto0"] + round_5["pareto1"] + round_5["pareto2"] + round_5["pareto3"])/4

round_5["Toss-Up"] = abs(round_5["ensembled_predictions"]) < 14.7
round_5.to_csv("../results/2019_predictions_round_5.csv", index=False, float_format='%.0f')

In [117]:
round_5 = pd.read_csv("../results/2019_round_5.csv")

Unnamed: 0,player_1_name,player_1_seed,player_2_name,player_2_seed,pareto0,pareto1,pareto2,pareto3,ensembled_predictions,Toss-Up
0,No Country For Old Men,1,Eternal Sunshine of the Spotless Mind,4,8.992919,10.047622,9.841951,6.49499,8.84437,True
1,Children of Men,10,The Dark Knight,3,-20.762115,-16.703875,-16.817584,-17.074621,-17.839549,False


In [118]:
round_6 = pd.read_csv("../results/2019_round_6.csv")

round_6_all_info = round_6.merge(imdb_info, left_on="player_1_name", right_on="title", how = "left")
round_6_all_info = round_6_all_info.merge(imdb_info, left_on="player_2_name", right_on="title", suffixes = ("_player1", "_player2"), how="left")
round_6_all_info = round_6_all_info.drop(columns=["player_1_name", "player_2_name", "title_player1", "title_player2", "imdb_id_player1", "imdb_id_player2"])

round_6["pareto0"] = pareto_models[0].predict(round_6_all_info)
round_6["pareto1"] = pareto_models[1].predict(round_6_all_info)
round_6["pareto2"] = pareto_models[2].predict(round_6_all_info)
round_6["pareto3"] = pareto_models[3].predict(round_6_all_info)
round_6["ensembled_predictions"] = (round_6["pareto0"] + round_6["pareto1"] + round_6["pareto2"] + round_6["pareto3"])/4

round_6["Toss-Up"] = abs(round_6["ensembled_predictions"]) < 14.7
round_6.to_csv("../results/2019_predictions_round_6.csv", index=False, float_format='%.0f')

In [119]:
round_6

Unnamed: 0,player_1_name,player_1_seed,player_2_name,player_2_seed,pareto0,pareto1,pareto2,pareto3,ensembled_predictions,Toss-Up
0,No Country For Old Men,1,The Dark Knight,3,0.14392,-2.775646,-2.856139,-0.866456,-1.58858,True
1,Eternal Sunshine of the Spotless Mind,4,Children of Men,10,2.878282,5.26355,4.251537,14.424698,6.704517,True


In [120]:
mistakes = all_y[all_y["ensemble_correct"] == -1]

In [123]:
mistakes["most_surprising"] = mistakes["ensembled_predictions"] - mistakes["plus_minus"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [126]:
mistakes.sort_values("most_surprising")

Unnamed: 0,player_1_name,player_2_name,plus_minus,pareto0,pareto1,pareto2,pareto3,ensembled_predictions,best_model_correct,ensemble_correct,most_surprising
5,Taxi Driver,Casablanca,-6,-1.026298,0.712597,1.014892,0.234874,0.234016,-1.0,-1.0,6.234016
15,Apocalypse Now,The Empire Strikes Back,-6,-0.662841,0.663629,0.304169,2.109313,0.603568,-1.0,-1.0,6.603568
91,Heat,Dazed and Confused,-6,4.112505,2.786394,3.099251,6.057177,4.013832,-1.0,-1.0,10.013832
65,The Shining,Eternal Sunshine of the Spotless Mind,-2,8.391208,8.977938,8.881725,15.99788,10.562188,-1.0,-1.0,12.562188
75,Magnolia,Being John Malkovich,-8,1.723542,4.133521,4.570119,11.037917,5.366275,-1.0,-1.0,13.366275
85,Grave of the Fireflies,Rio Bravo,-4,9.180879,8.944466,9.07981,12.063495,9.817163,-1.0,-1.0,13.817163
12,Pulp Fiction,Fargo,-8,10.213324,9.438451,9.112665,9.666807,9.607812,-1.0,-1.0,17.607812
