In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from lightgbm import LGBMClassifier

In [2]:
# Load the training and test sets from CSV files under ../input (relative to
# the working directory) and preview the first rows of the training frame.
train_df = pd.read_csv('../input/flight_delays_train.csv')
test_df = pd.read_csv('../input/flight_delays_test.csv')
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [3]:
# (rows, columns) of the training frame.
train_df.shape

(100000, 9)

In [4]:
def transformtoint(df):
    """Convert 'c-<number>' labels into an integer column vector.

    The Month, DayofMonth and DayOfWeek columns hold strings such as
    ``'c-8'``; each value is split on ``'-'`` and the second token is
    parsed as an integer.

    Parameters
    ----------
    df : iterable of str
        Despite the name, this is a single column of labels (for example a
        pandas Series), not a DataFrame. Any iterable of strings works.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n, 1)``, ready to be concatenated
        column-wise with other feature arrays.

    Raises
    ------
    IndexError
        If a value contains no ``'-'``.
    ValueError
        If the token after the dash is not an integer.
    """
    # The explicit integer dtype keeps empty input from silently becoming a
    # float64 array, which would change the dtype of the stacked features.
    numbers = np.array([int(label.split('-')[1]) for label in df], dtype=int)
    return numbers.reshape(len(numbers), 1)
        

In [5]:
# Numeric month, day-of-month and day-of-week columns for the training set.
trainmonths, traindays, trainweeks = (
    transformtoint(train_df[column])
    for column in ('Month', 'DayofMonth', 'DayOfWeek')
)

In [6]:
# Numeric month, day-of-month and day-of-week columns for the test set.
testmonths, testdays, testweeks = (
    transformtoint(test_df[column])
    for column in ('Month', 'DayofMonth', 'DayOfWeek')
)

In [7]:
# "Is Saturday" / "is Sunday" indicator features: 1 where the day-of-week
# number equals 6 or 7 respectively, 0 otherwise.
issat_train, issun_train = (
    (trainweeks == day).astype('int') for day in (6, 7)
)
issat_test, issun_test = (
    (testweeks == day).astype('int') for day in (6, 7)
)

In [8]:
# Departure hour: integer-divide DepTime by 100 (the preview rows look like
# HHMM values — TODO confirm). Selecting the column with a list keeps the
# result 2-D, shape (n, 1), so it can be concatenated with the other features.
hour_train = train_df[['DepTime']].values // 100
hour_test = test_df[['DepTime']].values // 100

In [9]:
# Sanity check: the hour feature should be a single column, one row per flight.
hour_train.shape

(100000, 1)

In [10]:
# Start both feature matrices from the two raw numeric columns.
X_train, X_test = (
    frame[['Distance', 'DepTime']].values for frame in (train_df, test_df)
)
X_train.shape
    

(100000, 2)

In [11]:
# Assemble the numeric feature matrix, one column each for: Distance, DepTime,
# month, day of month, day of week, Saturday flag, Sunday flag, departure hour.
# The two base columns are read from the DataFrames again instead of from the
# current X_train / X_test, so re-running this cell always produces the same
# 8-column result rather than appending the engineered columns a second time.
X_train = np.concatenate(
    [train_df[['Distance', 'DepTime']].values,
     trainmonths, traindays, trainweeks, issat_train, issun_train, hour_train],
    axis=1,
)
X_test = np.concatenate(
    [test_df[['Distance', 'DepTime']].values,
     testmonths, testdays, testweeks, issat_test, issun_test, hour_test],
    axis=1,
)

In [12]:
# Binary target: 'Y' -> 1, 'N' -> 0. Series.map produces NaN for any label that
# is not a key of the dict.
# NOTE(review): assumes the column only ever contains 'Y' or 'N' — confirm.
y_train = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

In [13]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# frame only; handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_df[['UniqueCarrier', 'Origin', 'Dest']])

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [14]:
# Apply the fitted encoder to both sets. toarray() converts the encoder's
# sparse output into dense arrays so they can be concatenated with the
# numeric feature matrices.
other_feature, other_feature_test = (
    enc.transform(frame[['UniqueCarrier', 'Origin', 'Dest']]).toarray()
    for frame in (train_df, test_df)
)

In [15]:
# Show both shapes at once. A notebook cell only displays its final expression,
# so putting them on separate lines silently dropped the training shape.
other_feature.shape, other_feature_test.shape

(100000, 600)

In [16]:
# Shape of the dense one-hot matrix for the test set.
other_feature_test.shape

(100000, 600)

In [17]:
# Shape of the numeric test features before the one-hot columns are appended.
X_test.shape

(100000, 8)

In [18]:
# Append the one-hot columns to the right of the numeric features.
# NOTE: X_train / X_test are overwritten from their own previous values, so
# running this cell a second time appends the one-hot block again.
X_train = np.hstack([X_train, other_feature])
X_test = np.hstack([X_test, other_feature_test])

In [19]:
# Candidate num_leaves / max_depth values for a hyper-parameter grid search.
param_grid = dict(
    num_leaves=[7, 15, 31, 63],
    max_depth=[3, 4, 5, 6, -1],
)

In [None]:
#lgb_clf = LGBMClassifier(random_state=17)
#grid_searcher = GridSearchCV(estimator=lgb_clf, param_grid=param_grid, cv=5, verbose=1)
#grid_searcher.fit(X_train, y_train)
#grid_searcher.best_params_, grid_searcher.best_score_

In [20]:
# Model used for the submission. num_leaves=63 and max_depth=-1 both appear in
# param_grid; presumably they were picked by the disabled grid search — TODO confirm.
lgb_clf = LGBMClassifier(
    n_estimators=200,
    num_leaves=63,
    max_depth=-1,
    random_state=17,
)

In [21]:
# Fit on the whole training matrix; no validation split is held out in this cell.
lgb_clf.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=200, n_jobs=-1, num_leaves=63, objective=None,
        random_state=17, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [22]:
# Keep column 1 of predict_proba: the predicted probability of class 1,
# which is the 'Y' (delayed) label in the y_train encoding.
predict = lgb_clf.predict_proba(X_test)[:,1]

In [23]:
# Write the predicted probabilities to lgboost.csv: the row index goes into an
# 'id' column and the values into a 'dep_delayed_15min' column.
pd.Series(predict, name='dep_delayed_15min').to_csv(
    'lgboost.csv',
    header=True,
    index_label='id',
)

In [24]:
# Display the predicted probabilities.
predict

array([0.06730835, 0.03850231, 0.05779686, ..., 0.20980315, 0.07636789,
       0.05813223])