In [12]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

from sklearn.ensemble import StackingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [13]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

print(train.shape, test.shape)

train.head()

(54931, 21) (11834, 20)


Unnamed: 0,id,Gender,Travel,Age,Mode of reservation,Class,Distance Travelled,Departure/Arrival time convenient,Ease of Online booking,Platform Number,...,Railway service,Seat comfort,Ticket-collector service,Washroom service,Baggage security score,Compartment safety score,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,93908,Male,frequent,44,online,first-class,3574.4,3,5,5,...,5,4,5,5,5,3,4,37,31.0,1
1,44467,Female,frequent,38,online,second-class,313.6,4,4,4,...,4,4,2,5,4,2,4,53,76.0,0
2,87068,Female,frequent,42,online,first-class,950.4,4,4,4,...,5,4,5,5,5,5,5,33,25.0,1
3,17230,Male,non-frequent,31,online,second-class,1395.2,3,3,5,...,3,4,4,2,2,1,4,32,28.0,0
4,20054,Male,non-frequent,25,online,second-class,540.8,5,5,3,...,5,1,3,5,2,2,4,0,0.0,1


In [37]:
def get_data():
    train = pd.read_csv('input/train.csv')
    test = pd.read_csv('input/test.csv')
    y_train = train['satisfaction']
    train.drop('satisfaction', axis = 1, inplace = True)
    
    data = pd.concat([train, test]).reset_index(drop = True)
    data = pd.get_dummies(data)
    n_train = train.shape[0]
    
    print('Missing values before: \n')
    print(data.isna().sum().sum())
    
    data['Arrival Delay in Minutes'] = data['Arrival Delay in Minutes'].fillna(0)
    
    print('Missing values after: \n')
    print(data.isna().sum().sum())
    
    X_train = data[:n_train]
    X_test = data[n_train:]
    return (X_train, y_train, X_test)
    
X_train, y_train, X_test = get_data()
print(X_train.shape, y_train.shape, X_test.shape)
X_train.head()

Missing values before: 

203
Missing values after: 

0
(54931, 25) (54931,) (11834, 25)


Unnamed: 0,id,Age,Distance Travelled,Departure/Arrival time convenient,Ease of Online booking,Platform Number,Food Service,Railway service,Seat comfort,Ticket-collector service,...,Arrival Delay in Minutes,Gender_Female,Gender_Male,Travel_frequent,Travel_non-frequent,Mode of reservation_offline,Mode of reservation_online,Class_first-class,Class_handicapped-class,Class_second-class
0,93908,44,3574.4,3,5,5,4,5,4,5,...,31.0,0,1,1,0,0,1,1,0,0
1,44467,38,313.6,4,4,4,4,4,4,2,...,76.0,1,0,1,0,0,1,0,0,1
2,87068,42,950.4,4,4,4,4,5,4,5,...,25.0,1,0,1,0,0,1,1,0,0
3,17230,31,1395.2,3,3,5,4,3,4,4,...,28.0,0,1,0,1,0,1,0,0,1
4,20054,25,540.8,5,5,3,4,5,1,3,...,0.0,0,1,0,1,0,1,0,0,1


In [5]:
def get_stacking():
    level0 = []
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', DecisionTreeClassifier()))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    
    level1 = RandomForestClassifier()
    
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv = 5)
    return model
get_stacking()

StackingClassifier(cv=5,
                   estimators=[('lr', LogisticRegression()),
                               ('knn', KNeighborsClassifier()),
                               ('cart', DecisionTreeClassifier()),
                               ('svm', SVC()), ('bayes', GaussianNB())],
                   final_estimator=RandomForestClassifier())

In [6]:
def sub(name, score, preds):
    out = pd.DataFrame(
        {
            'satisfaction' : preds
        }
    )
    name = str(score) + '__' + name + '__.csv'
    out.to_csv(name, index = False)

In [8]:
def get_models():
    models = {
        'lr' : LogisticRegression(),
        'knn' : KNeighborsClassifier(),
        'cart' : DecisionTreeClassifier(),
        'svm' : SVC(),
        'bayes' : GaussianNB(),
        'stacking' : get_stacking(),
        'rfc' : RandomForestClassifier()
    }
    return models

models = get_models()

In [9]:
def evaluate(model):
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)
    data = get_dataset()
    X_train, y_train, X_test = get_data()
    
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv = cv, n_jobs=-1, error_score='raise')
    model.fit(X_train, y_train)
    preds = model.predict()
    return (np.mean(scores), preds)
evaluate(models.items())

KeyError: 0

In [11]:
from tqdm import tqdm
for i in tqdm(range(1000000)):
    pass

100%|██████████| 1000000/1000000 [00:00<00:00, 1412385.73it/s]


In [15]:
X_train, y_train, X_test = get_data()

Missing values before: 

203
Missing values after: 

0


In [16]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [21]:
imp = list(model.feature_importances_)
cols = X_train.columns.values

In [35]:
imps = pd.DataFrame({
    'feature' : cols,
    'importance' : imp
})
imps.sort_values(by = 'importance', ascending = False)

Unnamed: 0,feature,importance
7,Railway service,0.181816
8,Seat comfort,0.070462
22,Class_first-class,0.058222
10,Washroom service,0.056984
20,Mode of reservation_offline,0.05299
9,Ticket-collector service,0.049257
2,Distance Travelled,0.046043
21,Mode of reservation_online,0.044373
1,Age,0.043491
11,Baggage security score,0.041443


In [40]:
imps.sort_values(by = 'importance', ascending = False)[:17]

Unnamed: 0,feature,importance
7,Railway service,0.181816
8,Seat comfort,0.070462
22,Class_first-class,0.058222
10,Washroom service,0.056984
20,Mode of reservation_offline,0.05299
9,Ticket-collector service,0.049257
2,Distance Travelled,0.046043
21,Mode of reservation_online,0.044373
1,Age,0.043491
11,Baggage security score,0.041443


In [41]:
feats = imps.sort_values(by = 'importance', ascending = False)[:17].feature.values
print(feats)

['Railway service' 'Seat comfort' 'Class_first-class' 'Washroom service'
 'Mode of reservation_offline' 'Ticket-collector service'
 'Distance Travelled' 'Mode of reservation_online' 'Age'
 'Baggage security score' 'id' 'Travel_frequent' 'Class_second-class'
 'Cleanliness' 'Compartment safety score' 'Ease of Online booking'
 'Travel_non-frequent']


In [42]:
list(feats)

['Railway service',
 'Seat comfort',
 'Class_first-class',
 'Washroom service',
 'Mode of reservation_offline',
 'Ticket-collector service',
 'Distance Travelled',
 'Mode of reservation_online',
 'Age',
 'Baggage security score',
 'id',
 'Travel_frequent',
 'Class_second-class',
 'Cleanliness',
 'Compartment safety score',
 'Ease of Online booking',
 'Travel_non-frequent']