In [27]:
import pandas as pd
import numpy as np
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [28]:
data = pd.read_csv("data/train.csv")
data["is_test"] = 0
test = pd.read_csv("data/test.csv")
test["is_test"] = 1
data = data.append(test, sort=False)

In [29]:
data.sort_values(by=["year", "month", "day", "sched_dep_time"], inplace=True)
data["date"] = data.apply(lambda x: str(x["year"]) + "_" + str(x["month"]) + "_" + str(x["day"]), axis=1)

In [30]:
data["sched_arr_hr"] = data["sched_arr_time"].apply(lambda x: int(x / 100))
weather = pd.read_csv("data/weather.csv")
weather.drop_duplicates(subset=["year", "month", "day"], keep="first", inplace=True)
weather["date"] = weather.apply(lambda x: str(x["year"]) + "_" + str(x["month"]) + "_" + str(x["day"]), axis=1)
weather.drop(["year", "month", "day", "hour"], axis=1, inplace=True)

In [31]:
data = data.merge(weather, on=["origin", "date"], how="left")

In [32]:
data["precip"].fillna("NA", inplace=True)
data["precip"] = data["precip"].apply(lambda x: -999 if str(x) == "NA" else x)
data["temp"].fillna("NA", inplace=True)
data["temp"] = data["temp"].apply(lambda x: -999 if str(x) == "NA" else x)
data["humid"].fillna("NA", inplace=True)
data["humid"] = data["humid"].apply(lambda x: -999 if str(x) == "NA" else x)
data["wind_speed"].fillna("NA", inplace=True)
data["wind_speed"] = data["wind_speed"].apply(lambda x: -999 if str(x) == "NA" else x)
data["wind_gust"].fillna("NA", inplace=True)
data["wind_gust"] = data["wind_gust"].apply(lambda x: -999 if str(x) == "NA" else x)

In [33]:
delay_dict = {}
origin_vals = data["origin"].values
date_vals = data["date"].values



delay_vals = data["is_delayed"].values
is_test = data["is_test"].values
for n in range(0, len(data)):
    if origin_vals[n] not in delay_dict:
        delay_dict[origin_vals[n]] = {}
    if date_vals[n] not in delay_dict[origin_vals[n]]:
        delay_dict[origin_vals[n]][date_vals[n]] = {"total":0, "train":[]}
    if is_test[n] == 0:
        delay_dict[origin_vals[n]][date_vals[n]]["train"].append(delay_vals[n])
    delay_dict[origin_vals[n]][date_vals[n]]["total"] += 1

In [34]:
count_day = np.zeros(len(data))
delay_day = np.zeros(len(data))
for n in range(0, len(data)):
    count_day[n] = delay_dict[origin_vals[n]][date_vals[n]]["total"]
    delay_day[n] = np.average(delay_dict[origin_vals[n]][date_vals[n]]["train"])
data["count_day"] = count_day
data["delay_day"] = delay_day

In [35]:
data["origin"] = pd.Categorical(data["origin"])
data["dest"] = pd.Categorical(data["dest"])
data["carrier"] = pd.Categorical(data["carrier"])

In [36]:
train = data[data["is_test"] == 0]
test = data[data["is_test"] == 1]

In [37]:
print(len(train), len(test))

168573 168203


In [89]:
use_features = ["distance", 
                "origin", 
                "dest", 
                "carrier", 
                "sched_arr_hr", 
                "precip", 
                "temp", 
                "humid", 
                "wind_speed", 
                "count_day", 
                "delay_day",
                "visib"]

In [82]:
data

Unnamed: 0,id,is_delayed,year,month,day,sched_dep_time,sched_arr_time,carrier,origin,dest,...,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib,time_hour,count_day,delay_day
0,280700,,2013,1,1,515,819,UA,EWR,IAH,...,59.37,270.0,10.35702,-999.0,0.0,1012.0,10.0,2013-01-01 01:00:00,305.0,0.575163
1,193431,1.0,2013,1,1,529,830,UA,LGA,IAH,...,-999.00,,-999.00000,-999.0,-999.0,,,,240.0,0.587302
2,131924,,2013,1,1,540,850,AA,JFK,MIA,...,-999.00,,-999.00000,-999.0,-999.0,,,,297.0,0.500000
3,85357,0.0,2013,1,1,545,1022,B6,JFK,BQN,...,-999.00,,-999.00000,-999.0,-999.0,,,,297.0,0.500000
4,3538,,2013,1,1,558,728,UA,EWR,ORD,...,59.37,270.0,10.35702,-999.0,0.0,1012.0,10.0,2013-01-01 01:00:00,305.0,0.575163
5,220397,,2013,1,1,559,706,B6,JFK,BOS,...,-999.00,,-999.00000,-999.0,-999.0,,,,297.0,0.500000
6,77755,0.0,2013,1,1,600,850,B6,EWR,PBI,...,59.37,270.0,10.35702,-999.0,0.0,1012.0,10.0,2013-01-01 01:00:00,305.0,0.575163
7,84227,0.0,2013,1,1,600,937,UA,EWR,SFO,...,59.37,270.0,10.35702,-999.0,0.0,1012.0,10.0,2013-01-01 01:00:00,305.0,0.575163
8,87885,1.0,2013,1,1,600,735,MQ,EWR,ORD,...,59.37,270.0,10.35702,-999.0,0.0,1012.0,10.0,2013-01-01 01:00:00,305.0,0.575163
9,94244,0.0,2013,1,1,600,856,B6,JFK,TPA,...,-999.00,,-999.00000,-999.0,-999.0,,,,297.0,0.500000


In [90]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'num_leaves': 200,
    'max_depth': 10,
    'learning_rate': 0.05,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'verbose': -1,
}

In [91]:
train["is_val"] = np.random.randint(low=0, high=9, size=len(train))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [92]:
lgb_train = lgb.Dataset(
    train[use_features].loc[train["is_val"]>1], 
    train["is_delayed"].loc[train["is_val"]>1])
lgb_eval = lgb.Dataset(
    train[use_features].loc[train["is_val"]<=1], 
    train["is_delayed"].loc[train["is_val"]<=1])
gbm = lgb.train(
    params, 
    lgb_train, 
    num_boost_round=5000, 
    valid_sets=lgb_eval, early_stopping_rounds=100, verbose_eval=50)


Training until validation scores don't improve for 500 rounds.
[50]	valid_0's auc: 0.765535	valid_0's binary_logloss: 0.569979
[100]	valid_0's auc: 0.770037	valid_0's binary_logloss: 0.558331
[150]	valid_0's auc: 0.772741	valid_0's binary_logloss: 0.55486
[200]	valid_0's auc: 0.77409	valid_0's binary_logloss: 0.553305
[250]	valid_0's auc: 0.775027	valid_0's binary_logloss: 0.552445
[300]	valid_0's auc: 0.775399	valid_0's binary_logloss: 0.552153
[350]	valid_0's auc: 0.776083	valid_0's binary_logloss: 0.551565
[400]	valid_0's auc: 0.776564	valid_0's binary_logloss: 0.551181
[450]	valid_0's auc: 0.77677	valid_0's binary_logloss: 0.551172
[500]	valid_0's auc: 0.776864	valid_0's binary_logloss: 0.551394
[550]	valid_0's auc: 0.777133	valid_0's binary_logloss: 0.551431
[600]	valid_0's auc: 0.777344	valid_0's binary_logloss: 0.551591
[650]	valid_0's auc: 0.777492	valid_0's binary_logloss: 0.551743
[700]	valid_0's auc: 0.777615	valid_0's binary_logloss: 0.551992
[750]	valid_0's auc: 0.777567	v

In [87]:
pred = gbm.predict(test[use_features])

In [88]:
test["is_delayed"] = pred
test[["id" ,"is_delayed"]].to_csv("sub6.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
