In [45]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import xgboost

In [159]:
train_df=pd.read_csv('C:\\Users\\Алматы\\Desktop\\flight_delays_train.csv')
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [160]:
test_df=pd.read_csv('C:\\Users\\Алматы\\Desktop\\flight_delays_test.csv')
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [161]:
train_df['flight'] = train_df['Origin'] + '-->' + train_df['Dest']
test_df['flight'] = test_df['Origin'] + '-->' + test_df['Dest']

In [162]:
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,flight
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,ATL-->DFW
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,PIT-->MCO
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,RDU-->CLE
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,DEN-->MEM
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,MDW-->OMA


In [163]:
categ_feat_idx = np.where(train_df.drop('dep_delayed_15min', axis=1).dtypes == 'object')[0]
categ_feat_idx

array([0, 1, 2, 4, 5, 6, 8], dtype=int64)

In [164]:
X_train = train_df.drop('dep_delayed_15min', axis=1).values
y_train = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = test_df.values

In [165]:
X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, 
                                                                test_size=0.3, 
                                                                random_state=17)

In [166]:
ctb = CatBoostClassifier(random_seed=2019, silent=True,iterations=1200,learning_rate=0.05,eval_metric='AUC',max_depth=None)

In [None]:
ctb.fit(X_train_part, y_train_part,
        cat_features=categ_feat_idx);

In [None]:
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

In [None]:
roc_auc_score(y_valid, ctb_valid_pred)

In [None]:
ctb.fit(X_train, y_train,
        cat_features=categ_feat_idx);

In [None]:
ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

In [None]:
roc_auc_score(y_valid, ctb_valid_pred)

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv('C:\\Users\\Алматы\\Desktop\\sample_submission.csv')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred.csv')
    

In [None]:
!head ctb_pred.csv

### Training part

In [None]:
train_df.head()

In [32]:
#train_df['dep_delayed_15min']=train_df['dep_delayed_15min'].map({'Y':1,'N':0})

In [146]:
Months1=pd.get_dummies(train_df['Month'])

In [147]:
train_df['HighPicMonths']=Months1.iloc[:,5]+Months1.iloc[:,6]+Months1.iloc[:,11]

In [148]:
Week1=pd.get_dummies(train_df['DayOfWeek'])
train_df['HighPicWeekDays']=Week1.iloc[:,0]+Week1.iloc[:,3]+Week1.iloc[:,4]+Week1.iloc[:,6]




train_df['hour'] = train_df['DepTime'] // 100
train_df.loc[train_df['hour'] == 24, 'hour'] = 0
train_df.loc[train_df['hour'] == 25, 'hour'] = 1


train_df['summer'] = (train_df['Month'].isin(['c-6', 'c-7', 'c-8'])).astype(np.int32)
train_df['autumn'] = (train_df['Month'].isin(['c-9', 'c-10', 'c-11'])).astype(np.int32)
train_df['winter'] = (train_df['Month'].isin(['c-12', 'c-1', 'c-2'])).astype(np.int32)
train_df['spring'] = (train_df['Month'].isin(['c-3', 'c-4', 'c-5'])).astype(np.int32)

In [167]:


train_df['hour'] = train_df['DepTime'] // 100
train_df.loc[train_df['hour'] == 24, 'hour'] = 0
train_df.loc[train_df['hour'] == 25, 'hour'] = 1

In [168]:
for col in ['Origin', 'Dest', 'UniqueCarrier', 'flight']:
    train_df[col] = pd.factorize(train_df[col])[0]

In [169]:
train_df['dep_delayed_15min']=train_df['dep_delayed_15min'].map({'Y':1,'N':0})

In [151]:
train_df['Month'] = train_df['Month'].str[2:].astype('int')
train_df['DayofMonth'] = train_df['DayofMonth'].str[2:].astype('int')
train_df['DayOfWeek'] = train_df['DayOfWeek'].str[2:].astype('int')

In [152]:
train_df['autumn']=train_df['autumn'].map({'1':2,'0':0})
train_df['winter']=train_df['winter'].map({'1':3,'0':0})
train_df['spring']=train_df['spring'].map({'1':4,'0':0})

In [133]:
train_df.drop(['summer','autumn','winter','spring',],axis=1,inplace=True)

In [172]:
train_df.head()

Unnamed: 0,Month,UniqueCarrier,Origin,Dest,dep_delayed_15min,flight,hour
0,c-8,0,0,0,0,0,19
1,c-4,1,1,1,0,1,15
2,c-9,2,2,2,0,2,14
3,c-11,3,3,3,0,3,10
4,c-10,4,4,4,1,4,18


In [154]:
train_df.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
       'Origin', 'Dest', 'Distance', 'dep_delayed_15min', 'flight',
       'HighPicMonths', 'HighPicWeekDays', 'hour', 'summer', 'autumn',
       'winter', 'spring'],
      dtype='object')

In [171]:
train_df.drop(['DayofMonth', 'DayOfWeek', 'DepTime','Distance' ],axis=1,inplace=True)

In [173]:
ctb.fit(X_train, y_train,
        cat_features=categ_feat_idx);

In [174]:
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

In [175]:
roc_auc_score(y_valid, ctb_valid_pred)

0.8757110906512164

In [176]:
ctb.fit(X_train_part, y_train_part,
        cat_features=categ_feat_idx);

In [177]:
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, ctb_valid_pred)

0.7565874938863623

In [179]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv('C:\\Users\\Алматы\\Desktop\\sample_submission.csv')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred_new.csv')

In [180]:
csv_df=pd.read_csv('C:\\Users\\Алматы\\Desktop\\ctb_pred_new.csv')

In [181]:
del csv_df['id1']

In [182]:
csv_df.head()

Unnamed: 0,id,dep_delayed_15min
0,0,0.031972
1,1,0.053566
2,2,0.047117
3,3,0.266878
4,4,0.263253


In [183]:
export_csv = csv_df.to_csv ('C:\\Users\\Алматы\\Desktop\\ctb_pred_new2.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path


In [184]:
csv_df=pd.read_csv('C:\\Users\\Алматы\\Desktop\\ctb_pred_new2.csv')

In [185]:
csv_df

Unnamed: 0,id,dep_delayed_15min
0,0,0.031972
1,1,0.053566
2,2,0.047117
3,3,0.266878
4,4,0.263253
5,5,0.088702
6,6,0.066598
7,7,0.213647
8,8,0.129245
9,9,0.295244
