In [17]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import davemodel
from davemodel import MyLogisticRegression, DaveModelBase
from importlib import reload


In [2]:
men_train = pd.read_csv("_RawData/mens_train_file.csv")
men_test = pd.read_csv("_RawData/mens_test_file.csv")
women_train = pd.read_csv("_RawData/womens_train_file.csv")
women_test = pd.read_csv("_RawData/womens_test_file.csv")


In [3]:
print(men_train.shape)
print(men_test.shape)
print(women_train.shape)
print(women_test.shape)


(5000, 28)
(2000, 28)
(5000, 28)
(1000, 28)


In [4]:
# Analyse data

men_analyse = pd.DataFrame(columns=["Name", "Type", "Unique", "NA"])

for col in men_train.columns:
    men_data = men_train[col]
    men_analyse = men_analyse.append({
        "Name": col, 
        "Type": men_data.dtype, 
        "Unique": men_data.nunique(), 
        "NA": men_data.isnull().sum()
    }, ignore_index=True)

men_analyse.sort_values(by = "Type")

Unnamed: 0,Name,Type,Unique,NA
23,server.is.impact.player,bool,2,0
20,same.side,bool,2,0
7,outside.sideline,bool,2,0
8,outside.baseline,bool,2,0
0,rally,int64,30,0
1,serve,int64,2,0
25,train,int64,1,0
24,id,int64,5000,0
22,previous.time.to.net,float64,5000,0
19,opponent.distance.from.center,float64,4505,0


In [5]:
# Remove unneeded columns
def process_data(data, gender):
    ids = data["id"].apply(lambda x: str(x) + "_" + gender)

    cat_cols = ["server.is.impact.player", "same.side", "outside.baseline", "outside.sideline", "serve", "hitpoint", "previous.hitpoint"]
    drop_cols = ["id", "train", "gender", "outcome"]
    drop_cols += cat_cols
    num_cols = [x for x in data.columns.tolist() if x not in drop_cols]
    
    print(len(cat_cols) + len(drop_cols) + len(num_cols))
    
    new_data = data.copy()
    new_data = pd.get_dummies(new_data, columns = cat_cols)
    
    for col in drop_cols:
        if new_data.columns.contains(col):
            new_data = new_data.drop([col], axis = 1)
            
    new_data[num_cols] = (new_data[num_cols] - new_data[num_cols].mean()) / new_data[num_cols].std()
            
    return ids, new_data

    
def get_train_data(data, gender):
    new_label = data["outcome"]
    ids, new_data = process_data(data, gender)

    return ids, new_data, new_label

men_ids, X_men, y_men = get_train_data(men_train, "mens")
women_ids, X_women, y_women = get_train_data(women_train, "womens")

men_test_ids, X_men_test = process_data(men_test, "mens")
women_test_ids, X_women_test = process_data(women_test, "womens")

print(X_men.shape)
print(y_men.shape)
print(X_women.shape)
print(y_women.shape)
print(X_men_test.shape)
print(X_women_test.shape)


35
35
35
35
(5000, 35)
(5000,)
(5000, 35)
(5000,)
(2000, 35)
(1000, 35)


In [6]:
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
X_men.head()

Unnamed: 0,rally,speed,net.clearance,distance.from.sideline,depth,player.distance.travelled,player.impact.depth,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net,server.is.impact.player_False,server.is.impact.player_True,same.side_False,same.side_True,outside.baseline_False,outside.baseline_True,outside.sideline_False,outside.sideline_True,serve_1,serve_2,hitpoint_B,hitpoint_F,hitpoint_U,hitpoint_V,previous.hitpoint_B,previous.hitpoint_F,previous.hitpoint_U,previous.hitpoint_V
0,-0.55607,0.533482,-0.632118,1.830812,0.693324,-0.808756,0.762886,-0.738909,0.612054,-1.042314,0.454678,0.852426,0.330016,-1.699856,-0.018598,-0.292499,-0.516968,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,-0.55607,0.272181,0.468333,0.987769,-0.617805,-0.345224,0.300325,2.056659,0.16663,0.948868,0.704066,-0.068022,-1.458892,-0.217178,-0.107019,1.965721,-0.584499,1,0,1,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0
2,4.5386,-1.08382,-0.857184,1.883527,1.519057,0.52863,1.150147,-0.506976,0.624626,0.733424,1.524208,-0.665135,-0.948881,-0.663091,0.532636,-0.601518,-0.767404,0,1,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0
3,0.784633,0.69551,0.131678,-0.775933,-0.388228,-1.293952,0.619564,-1.633234,0.387663,-1.247052,-0.586553,0.678488,1.104211,-1.769436,0.701332,-1.712651,0.671082,0,1,0,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0
4,-0.55607,0.537056,-0.498538,-0.476397,0.286893,-0.333407,1.092496,-0.245167,0.810355,-0.963936,1.093764,-0.995855,-0.646018,-0.175435,-0.526469,-1.00874,-1.066828,1,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0


In [89]:
reload(davemodel)
from davemodel import MyLogisticRegression, MyRandomForest, MyXGBoost, MyBagging, MyGradientBoosting


In [86]:
blah.model_men

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [90]:
blah = MyLogisticRegression(men_test_ids, X_men, y_men, X_men_test, women_test_ids, X_women, y_women, X_women_test)
blah.run()


Training men...
{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
Log loss: 0.505158601069

Training women...
Log loss: 0.495263499068

Predicting men...


NotFittedError: Call fit before prediction

In [63]:

log = MyLogisticRegression(men_test_ids, X_men, y_men, X_men_test, women_test_ids, X_women, y_women, X_women_test)
forest = MyRandomForest(men_test_ids, X_men, y_men, X_men_test, women_test_ids, X_women, y_women, X_women_test)
xgb = MyXGBoost(men_test_ids, X_men, y_men, X_men_test, women_test_ids, X_women, y_women, X_women_test)
bag = MyBagging(men_test_ids, X_men, y_men, X_men_test, women_test_ids, X_women, y_women, X_women_test)
grad = MyGradientBoosting(men_test_ids, X_men, y_men, X_men_test, women_test_ids, X_women, y_women, X_women_test)

In [64]:
log.run()
forest.run()
xgb.run()
bag.run()
grad.run()


Training men...
Log loss: 0.50757452405

Training women...
Log loss: 0.497407190386

Predicting men...
Prediction count: (2000, 3)

Predicting women...
Prediction count: (1000, 3)
Creating submission...
Done!
Saved

Training men...
Log loss: 0.0925285707373

Training women...
Log loss: 0.0932425495307

Predicting men...
Prediction count: (2000, 3)

Predicting women...
Prediction count: (1000, 3)
Creating submission...
Done!
Saved

Training men...
Log loss: 0.0550823955867

Training women...
Log loss: 0.0630389453839

Predicting men...
Prediction count: (2000, 3)

Predicting women...
Prediction count: (1000, 3)
Creating submission...
Done!
Saved

Training men...
Log loss: 0.0883779935142

Training women...
Log loss: 0.0879621731554

Predicting men...
Prediction count: (2000, 3)

Predicting women...
Prediction count: (1000, 3)
Creating submission...
Done!
Saved

Training men...
Log loss: 0.244927089419

Training women...
Log loss: 0.259421664974

Predicting men...
Prediction count: (200

In [65]:
test_log = DaveModelBase.load_model(log.get_name() + ".pickle")
test_forest = DaveModelBase.load_model(forest.get_name() + ".pickle")
test_xgb = DaveModelBase.load_model(xgb.get_name() + ".pickle")
test_bag = DaveModelBase.load_model(bag.get_name() + ".pickle")
test_grad = DaveModelBase.load_model(grad.get_name() + ".pickle")


Loaded
Loaded
Loaded
Loaded
Loaded


In [66]:
test_log.evaluate_cv()
test_forest.evaluate_cv()
test_xgb.evaluate_cv()
test_bag.evaluate_cv()
test_grad.evaluate_cv()



Log Loss: (-0.522) +/- (0.023)

Log Loss: (-0.512) +/- (0.026)

Log Loss: (-0.381) +/- (0.039)

Log Loss: (-0.447) +/- (0.031)

Log Loss: (-0.322) +/- (0.039)

Log Loss: (-0.341) +/- (0.024)

Log Loss: (-0.472) +/- (0.058)

Log Loss: (-0.501) +/- (0.082)

Log Loss: (-0.338) +/- (0.026)

Log Loss: (-0.358) +/- (0.025)


In [67]:
test_forest.evaluate()

Log loss: 0.366949887435
Log loss: 0.352060539347


In [68]:
def train_me(model, X_train, X_test, y_train, y_test):
    DaveModelBase.train_model(model, X_train, y_train)
    preds = DaveModelBase.predict_model(model, X_test)
    print(log_loss(y_test, preds))
    return preds

X_train, X_test, y_train, y_test = train_test_split(X_men, y_men, test_size = 0.25, random_state = 42)

model_log = test_log.get_model()
log_preds = train_me(model_log, X_train, X_test, y_train, y_test)

model_forest = test_forest.get_model()
forest_preds = train_me(model_forest, X_train, X_test, y_train, y_test)

model_xgb = test_xgb.get_model()
xgb_preds = train_me(model_xgb, X_train, X_test, y_train, y_test)

model_bag = test_bag.get_model()
bag_preds = train_me(model_bag, X_train, X_test, y_train, y_test)

model_grad = test_grad.get_model()
grad_preds = train_me(model_grad, X_train, X_test, y_train, y_test)




Prediction count: (1250, 3)
0.536163023819
Prediction count: (1250, 3)
0.366922208483
Prediction count: (1250, 3)
0.337911808214
Prediction count: (1250, 3)
0.420468145236
Prediction count: (1250, 3)
0.346595907286


In [123]:
from sklearn.svm import SVC
model_svc = SVC(probability=True)
model_svc.fit(X_train, y_train)
svc_preds = model_svc.predict_proba(X_test)


In [287]:
# preds = (log_preds + forest_preds + xgb_preds + bag_preds + grad_preds) / 5
# print(log_loss(y_test, preds))

preds = xgb_preds
print(log_loss(y_test, preds))

preds = (log_preds + forest_preds + xgb_preds + bag_preds + grad_preds + svc_preds) / 6
print(log_loss(y_test, preds))

weight_preds = (0.8 * xgb_preds) + (0.2 * forest_preds)
print(log_loss(y_test, weight_preds))

pred_arr = np.array([log_preds, forest_preds, xgb_preds, bag_preds, grad_preds, svc_preds, weight_preds])
preds = np.median(pred_arr, axis=0)
print(log_loss(y_test, preds))


# pd.DataFrame(pred_arr).shape
pred_arr.shape

0.337911808214
0.3647058751
0.332720460214
0.329638545346


(7, 1250, 3)

In [291]:
# for pred_value in pred_arr:
#     print(pred_value.shape)
 
# print(len(pred_arr))
# FE, UE, W

pred_all = []

for j in range(pred_arr.shape[1]):

    fe_count = 0
    ue_count = 0
    w_count = 0
    max_value = len(pred_arr)

    for i in range(len(pred_arr)):
        fe = pred_arr[i, j, 0]
        ue = pred_arr[i, j, 1]
        w = pred_arr[i, j, 2]

        value = max([fe, ue, w])

        if value == fe and value > 0.8:
            fe_count += 1
        elif value == ue and value > 0.8:
            ue_count += 1
        elif value == w and value > 0.8:
            w_count += 1

    found = True
    
    if fe_count == max_value:
        fe_pred = np.clip(max(pred_arr[:, j, 0]), 0.01, 0.99)
        leftovers = 1 - fe_pred
        ue_pred = leftovers / 2
        w_pred = ue_pred
    elif ue_count == max_value:
        ue_pred = np.clip(max(pred_arr[:, j, 1]), 0.01, 0.99)
        leftovers = 1 - ue_pred
        fe_pred = leftovers / 2
        w_pred = fe_pred
    elif w_count == max_value:
        w_pred = np.clip(max(pred_arr[:, j, 2]), 0.01, 0.99)
        leftovers = 1 - w_pred
        ue_pred = leftovers / 2
        fe_pred = ue_pred
    else:
#         fe_pred = pred_arr[6, j, 0]
#         ue_pred = pred_arr[6, j, 1]
#         w_pred = pred_arr[6, j, 2]
        
        fe_pred = np.median(pred_arr[:, j, 0])
        ue_pred = np.median(pred_arr[:, j, 1])
        w_pred = np.median(pred_arr[:, j, 2])
        
        found = False


#     if found:
#         print("Found!")
#     else:
#         print("Median!")
        
    pred_value = [fe_pred, ue_pred, w_pred]
    pred_all.append(pred_value)
#     print(pred_value)
    # print(fe_count)
    # print(ue_count)
    # print(w_count)


In [292]:
pred_all = np.array(pred_all)
pred_all.shape

(1250, 3)

In [293]:
print(log_loss(y_test, pred_all))

0.331103808663


In [249]:
(pred_all[:, 0] + pred_all[:, 1] + pred_all[:, 2])[0:100]

array([ 0.99999998,  1.00000003,  1.00000001,  0.99999998,  1.        ,
        0.99999999,  0.99999996,  1.00000002,  0.99999997,  1.00000001,
        1.        ,  1.00000001,  1.        ,  1.        ,  0.99999995,
        1.00000004,  1.00000007,  1.00000002,  1.00000008,  1.        ,
        1.        ,  0.99999998,  1.        ,  1.        ,  1.00000006,
        0.99999999,  1.00000004,  0.99999996,  0.99999998,  0.99999997,
        0.99999996,  1.        ,  0.99999997,  1.        ,  0.99999999,
        1.00000001,  1.00000006,  1.00000002,  1.00000001,  1.00000003,
        1.00000006,  1.        ,  1.        ,  1.        ,  1.00000002,
        1.        ,  1.        ,  0.99999999,  0.99999997,  0.99999997,
        0.99999999,  1.00000001,  1.00000003,  0.99999998,  1.        ,
        1.00000004,  1.        ,  0.99999998,  0.99999997,  1.00000001,
        0.99999996,  1.00000002,  0.99999998,  0.99999994,  0.99999997,
        0.99999996,  1.00000005,  0.99999999,  1.00000002,  1.00

In [140]:
(preds[:, 0] + preds[:, 1] + preds[:, 2])[0:100]

array([ 0.99999999,  1.00000001,  1.00000001,  0.99999999,  0.99999999,
        0.99999999,  0.99999998,  1.00000001,  0.99999998,  1.00000001,
        1.00000003,  1.00000001,  1.        ,  0.99999997,  0.99999998,
        1.00000002,  1.00000003,  1.00000001,  1.00000004,  1.        ,
        1.        ,  0.99999999,  1.        ,  0.99999999,  1.00000003,
        0.99999999,  1.00000002,  0.99999998,  0.99999999,  0.99999999,
        0.99999998,  1.        ,  0.99999999,  1.        ,  0.99999999,
        1.        ,  1.00000003,  1.00000001,  1.00000001,  1.00000001,
        1.00000003,  0.99999998,  0.99999997,  1.00000001,  1.00000001,
        1.00000002,  1.        ,  0.99999999,  0.99999998,  0.99999999,
        0.99999999,  1.00000001,  1.00000002,  0.99999999,  1.        ,
        1.00000002,  1.        ,  0.99999999,  0.99999998,  1.00000001,
        0.99999998,  1.00000001,  0.99999999,  0.99999997,  0.99999999,
        0.99999998,  1.00000002,  1.        ,  1.00000001,  1.00

In [102]:
from sklearn.model_selection import GridSearchCV
blah = LogisticRegression()
params = {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'solver': ['newton-cg', 'sag', 'saga', 'lbfgs', 'liblinear']
         }

gscv = GridSearchCV(blah, params, cv=5)
gscv.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'solver': ['newton-cg', 'sag', 'saga', 'lbfgs', 'liblinear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [103]:
gscv.best_estimator_.predict_proba(X_test)
print(log_loss(y_test, pred1))
print(gscv.best_params_)

0.533708225897
{'solver': 'newton-cg', 'C': 10}


In [104]:
blah2 = LogisticRegression()
blah2.fit(X_train, y_train)
pred2 = blah2.predict_proba(X_test)
print(log_loss(y_test, pred2))
blah2.C

0.536163023819


1.0

In [114]:
from sklearn.svm import SVC

svc = SVC(probability=True)
svc.fit(X_train, y_train)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [115]:
svc_pred = svc.predict_proba(X_test)

print(log_loss(y_test, svc_pred))


0.434240701383


In [116]:
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier

ridge = RidgeClassifier(solver = '')
ridge.fit(X_train, y_train)

sgd = SGDClassifier()
sgd.fit(X_train, y_train)

passive = PassiveAggressiveClassifier()
passive.fit(X_train, y_train)



PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=5, n_iter=None,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False)

In [121]:
ridge_pred = ridge.predict(X_test)
sgd_pred = sgd.predict(X_test)
passive_pred = passive.predict(X_test)

ridge_pred
# print(log_loss(y_test, ridge_pred))
# print(log_loss(y_test, sgd_pred))
# print(log_loss(y_test, passive_pred))


array(['UE', 'W', 'UE', ..., 'UE', 'FE', 'UE'],
      dtype='<U2')