In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss


In [2]:
men_train = pd.read_csv("_RawData/mens_train_file.csv")
men_test = pd.read_csv("_RawData/mens_test_file.csv")
women_train = pd.read_csv("_RawData/womens_train_file.csv")
women_test = pd.read_csv("_RawData/womens_test_file.csv")


In [3]:
print(men_train.shape)
print(men_test.shape)
print(women_train.shape)
print(women_test.shape)


(5000, 28)
(2000, 28)
(5000, 28)
(1000, 28)


In [4]:
# Analyse data

men_analyse = pd.DataFrame(columns=["Name", "Type", "Unique", "NA"])

for col in men_train.columns:
    men_data = men_train[col]
    men_analyse = men_analyse.append({
        "Name": col, 
        "Type": men_data.dtype, 
        "Unique": men_data.nunique(), 
        "NA": men_data.isnull().sum()
    }, ignore_index=True)

men_analyse.sort_values(by = "Type")

Unnamed: 0,Name,Type,Unique,NA
23,server.is.impact.player,bool,2,0
20,same.side,bool,2,0
7,outside.sideline,bool,2,0
8,outside.baseline,bool,2,0
0,rally,int64,30,0
1,serve,int64,2,0
25,train,int64,1,0
24,id,int64,5000,0
22,previous.time.to.net,float64,5000,0
19,opponent.distance.from.center,float64,4505,0


In [5]:
# Remove unneeded columns
def process_data(data, gender):
    ids = data["id"].apply(lambda x: str(x) + "_" + gender)

    cat_cols = ["server.is.impact.player", "same.side", "outside.baseline", "outside.sideline", "serve", "hitpoint", "previous.hitpoint"]
    drop_cols = ["id", "train", "gender", "outcome"]
    drop_cols += cat_cols
    num_cols = [x for x in data.columns.tolist() if x not in drop_cols]
    
    print(len(cat_cols) + len(drop_cols) + len(num_cols))
    
    new_data = data.copy()
    new_data = pd.get_dummies(new_data, columns = cat_cols)
    
    for col in drop_cols:
        if new_data.columns.contains(col):
            new_data = new_data.drop([col], axis = 1)
            
    new_data[num_cols] = (new_data[num_cols] - X_men[num_cols].mean()) / X_men[num_cols].std()
            
    return ids, new_data

    
def get_train_data(data, gender):
    new_label = data["outcome"]
    ids, new_data = process_data(data, gender)

    return ids, new_data, new_label

men_ids, X_men, y_men = get_train_data(men_train, "mens")
women_ids, X_women, y_women = get_train_data(women_train, "womens")

men_test_ids, X_men_test = process_data(men_test, "mens")
women_test_ids, X_women_test = process_data(women_test, "womens")

print(X_men.shape)
print(y_men.shape)
print(X_women.shape)
print(y_women.shape)
print(X_men_test.shape)
print(X_women_test.shape)


35


NameError: name 'X_men' is not defined

In [207]:
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
X_men.head()

Unnamed: 0,rally,speed,net.clearance,distance.from.sideline,depth,player.distance.travelled,player.impact.depth,player.impact.distance.from.center,player.depth,player.distance.from.center,previous.speed,previous.net.clearance,previous.distance.from.sideline,previous.depth,opponent.depth,opponent.distance.from.center,previous.time.to.net,server.is.impact.player_False,server.is.impact.player_True,same.side_False,same.side_True,outside.baseline_False,outside.baseline_True,outside.sideline_False,outside.sideline_True,serve_1,serve_2,hitpoint_B,hitpoint_F,hitpoint_U,hitpoint_V,previous.hitpoint_B,previous.hitpoint_F,previous.hitpoint_U,previous.hitpoint_V
0,-0.55607,0.533482,-0.632118,1.830812,0.693324,-0.808756,0.762886,-0.738909,0.612054,-1.042314,0.454678,0.852426,0.330016,-1.699856,-0.018598,-0.292499,-0.516968,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,0
1,-0.55607,0.272181,0.468333,0.987769,-0.617805,-0.345224,0.300325,2.056659,0.16663,0.948868,0.704066,-0.068022,-1.458892,-0.217178,-0.107019,1.965721,-0.584499,1,0,1,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0
2,4.5386,-1.08382,-0.857184,1.883527,1.519057,0.52863,1.150147,-0.506976,0.624626,0.733424,1.524208,-0.665135,-0.948881,-0.663091,0.532636,-0.601518,-0.767404,0,1,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0
3,0.784633,0.69551,0.131678,-0.775933,-0.388228,-1.293952,0.619564,-1.633234,0.387663,-1.247052,-0.586553,0.678488,1.104211,-1.769436,0.701332,-1.712651,0.671082,0,1,0,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0
4,-0.55607,0.537056,-0.498538,-0.476397,0.286893,-0.333407,1.092496,-0.245167,0.810355,-0.963936,1.093764,-0.995855,-0.646018,-0.175435,-0.526469,-1.00874,-1.066828,1,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0


In [208]:
def predict(X, y, X_test):
    log_reg = LogisticRegression()
    log_reg.fit(X, y)
    
    preds = log_reg.predict_proba(X_test)
    
    return log_reg, preds


In [209]:
log_reg_men, pred_men = predict(X_men, y_men, X_men_test)

In [210]:
log_reg_women, pred_women = predict(X_women, y_women, X_women_test)

In [211]:
def analyse(model, data, truth):
    pred_train = model.predict_proba(data)
    pred_train = np.clip(pred_train, 0.0001, 0.9999)
    print(log_loss(truth, pred_train))
    
analyse(log_reg_men, X_men, y_men)
analyse(log_reg_women, X_women, y_women)


0.507574523984
0.842760307841


In [264]:
submission = pd.DataFrame()
submission["submission_id"] = men_test_ids.append(women_test_ids)
submission["train"] = 0
submission["UE"] = np.append(pred_men[:, 0], pred_women[:, 0], axis = 0)
submission["FE"] = np.append(pred_men[:, 1], pred_women[:, 1], axis = 0)
submission["W"] = np.append(pred_men[:, 2], pred_women[:, 2], axis = 0)

submission_test = pd.read_csv("_RawData/AUS_SubmissionFormat.csv")
sorter = submission_test["submission_id"]
submission["submission_id"] = submission["submission_id"].astype("category")
submission["submission_id"].cat.set_categories(sorter, inplace = True)
submission = submission.sort_values(["submission_id"])

submission.to_csv("AUS_SubmissionFormat.csv", index=False)

In [263]:
submission_test.head()

Unnamed: 0,submission_id,train,UE,FE,W
0,4314_mens,0,,,
1,3559_mens,0,,,
2,768_mens,0,,,
3,426_mens,0,,,
4,807_mens,0,,,
