In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack,vstack,csr_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [3]:
# Load the train and test tables from the local `flight` directory.
# Forward slashes are used in the relative paths because Python resolves them on
# Windows, Linux and macOS alike; a literal backslash is a path separator only on
# Windows and would make the load fail everywhere else.
train_df = pd.read_csv('flight/flight_delays_train.csv')
test_df = pd.read_csv('flight/flight_delays_test.csv')

In [4]:
# Map the 'Y'/'N' labels of the target column to 1/0 before the column is dropped.
y_train = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

# For each frame: add a combined "Origin-Dest" route feature, then drop (in place)
# the columns that are not used as features. Only the train frame carries the
# target column, so it has one extra column to remove.
for frame, unused_cols in (
    (train_df, ["dep_delayed_15min", "DayofMonth"]),
    (test_df, ["DayofMonth"]),
):
    frame['Route'] = frame['Origin'] + "-" + frame['Dest']
    frame.drop(columns=unused_cols, inplace=True)

In [5]:
# Pool the train and test values of each categorical column so that an encoder
# fitted on the pooled series has a column for every category seen in either split.
routes = pd.concat([train_df['Route'], test_df['Route']], axis=0)
carriers = pd.concat([train_df['UniqueCarrier'], test_df['UniqueCarrier']], axis=0)

# Sparse output is OneHotEncoder's default. The explicit `sparse=True` keyword was
# renamed to `sparse_output` and removed in scikit-learn 1.4, so relying on the
# default keeps this cell working on both older and newer releases.
ohe = OneHotEncoder()

In [6]:
# One-hot encode the weekday and month columns.
# Each encoder is fitted on the training split only and then reused to transform
# the test split, which guarantees that train and test share the same columns in
# the same order. Fitting a fresh encoder on the test split could yield a
# different number or order of columns whenever a category is absent from it.
# handle_unknown='ignore' encodes a category unseen during fitting as all zeros
# instead of raising.
weekday_encoder = OneHotEncoder(handle_unknown='ignore')
weekday_ohe_train = weekday_encoder.fit_transform(train_df['DayOfWeek'].values.reshape(-1, 1))
weekday_ohe_test = weekday_encoder.transform(test_df['DayOfWeek'].values.reshape(-1, 1))

month_encoder = OneHotEncoder(handle_unknown='ignore')
month_ohe_train = month_encoder.fit_transform(train_df['Month'].values.reshape(-1, 1))
month_ohe_test = month_encoder.transform(test_df['Month'].values.reshape(-1, 1))

In [7]:
# One-hot encode the carrier column with its own encoder, fitted on the pooled
# train+test carriers so both splits get identical columns.
# A dedicated OneHotEncoder is used because `ohe.fit(...)` returns the shared `ohe`
# object itself: binding that to `carr_ohe` would leave it at the mercy of any later
# refit of `ohe`, after which `carr_ohe.transform` would no longer encode carriers.
carr_ohe = OneHotEncoder().fit(carriers.values.reshape(-1, 1))
carr_ohe_train = carr_ohe.transform(train_df['UniqueCarrier'].values.reshape(-1, 1))
carr_ohe_test = carr_ohe.transform(test_df['UniqueCarrier'].values.reshape(-1, 1))
carr_ohe_train

<100000x23 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [8]:
# One-hot encode the route column with its own encoder, fitted on the pooled
# train+test routes so both splits get identical columns.
# A dedicated OneHotEncoder is used instead of refitting the shared `ohe`: refitting
# it here would also change every other name bound to that same object, since
# `ohe.fit(...)` returns `ohe` itself rather than a copy.
route_ohe = OneHotEncoder().fit(routes.values.reshape(-1, 1))
route_ohe_train = route_ohe.transform(train_df['Route'].values.reshape(-1, 1))
route_ohe_test = route_ohe.transform(test_df['Route'].values.reshape(-1, 1))
route_ohe_train

<100000x5048 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [9]:
# Standardise the two numeric features (DepTime, Distance).
# Each scaler learns its mean/std from the training split only and the same
# statistics are then applied to the test split, so both splits live on the same
# scale. Fitting a separate scaler on the test split would shift and stretch the
# test features differently from what the model saw during training.
# Each feature is cast to float64 and reshaped to an (n_samples, 1) column, the
# 2-D layout StandardScaler expects.
dep_time_scaler = StandardScaler()
dep_time_train = dep_time_scaler.fit_transform(
    train_df['DepTime'].astype('float64').values.reshape(-1, 1))
dep_time_test = dep_time_scaler.transform(
    test_df['DepTime'].astype('float64').values.reshape(-1, 1))

distance_scaler = StandardScaler()
distance_train = distance_scaler.fit_transform(
    train_df['Distance'].astype('float64').values.reshape(-1, 1))
distance_test = distance_scaler.transform(
    test_df['Distance'].astype('float64').values.reshape(-1, 1))

In [10]:
# Concatenate the per-feature blocks column-wise into one sparse design matrix per
# split. Both lists use the same block order so the columns of train and test line up.
train_blocks = [
    weekday_ohe_train,
    month_ohe_train,
    carr_ohe_train,
    route_ohe_train,
    dep_time_train,
    distance_train,
]
test_blocks = [
    weekday_ohe_test,
    month_ohe_test,
    carr_ohe_test,
    route_ohe_test,
    dep_time_test,
    distance_test,
]
result_train = hstack(train_blocks)
result_test = hstack(test_blocks)

In [12]:
# Hold out 30% of the training rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    result_train,
    y_train,
    test_size=0.3,
    random_state=17,
)

In [13]:
# Fit a random forest on the training part of the split.
# n_jobs=-1 builds the trees on all available cores (a single-threaded fit of 500
# trees is slow on this data), and random_state pins the bootstrap/feature sampling
# so the validation score is reproducible. The seed matches the one used for the split.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=30,
    n_jobs=-1,
    random_state=17,
)
rf.fit(X_train_part, y_train_part)

KeyboardInterrupt: 

In [None]:
# Score the hold-out set: take the second column of predict_proba (probability of
# class 1) and report the ROC AUC against the validation labels.
rf_valid_proba = rf.predict_proba(X_valid)
rf_valid_pred = rf_valid_proba[:, 1]
roc_auc_score(y_true=y_valid, y_score=rf_valid_pred)

In [None]:
# Set up a grid search over random-forest hyper-parameters.
# The constructor values for n_estimators / max_depth are only starting defaults;
# the grid below overrides them for every candidate.
rf = RandomForestClassifier(n_estimators=500, max_depth=30)
test_params = {
    "n_estimators": [100, 300, 500],
    "max_depth": [6, 8, 16, 30],
    "min_samples_split": [2, 4],
}
# The estimator must be the forest defined above: the grid contains RandomForest
# parameters (e.g. min_samples_split), and the previously referenced `xgb_model`
# is not defined anywhere in this notebook, so the cell raised NameError.
# scoring="roc_auc" makes the search optimise the same metric that is used for the
# hold-out validation, rather than the classifier's default accuracy.
model = GridSearchCV(
    estimator=rf,
    param_grid=test_params,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=10,
)