In [1]:
from sklearn.linear_model import SGDClassifier
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Visualization Library
import matplotlib.pyplot as plt 
import seaborn as sns
from collections import Counter
import tensorflow as tf
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
import pdb
import tensorflow.contrib.keras as keras
from tensorflow.contrib.keras import losses,optimizers,metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,classification_report
from tensorboard.plugins.hparams import api as hp

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
train_df=pd.read_csv("original_data/train.csv")
test_df=pd.read_csv("original_data/test.csv")

In [3]:


train_df = train_df.drop(['Unnamed: 0', 'id'], axis=1)
test_df = test_df.drop(['Unnamed: 0', 'id'], axis=1)

train_df.columns=[each.replace(" ","_") for each in train_df.columns]
test_df.columns=[each.replace(" ","_") for each in train_df.columns]

##Feature engineering
train_df=pd.get_dummies(train_df,columns=["Customer_Type"])
train_df=pd.get_dummies(train_df,columns=["Type_of_Travel"])
train_df=pd.get_dummies(train_df,columns=["Class"])
train_df=pd.get_dummies(train_df,columns=["Gender"])

test_df=pd.get_dummies(test_df,columns=["Customer_Type"])
test_df=pd.get_dummies(test_df,columns=["Type_of_Travel"])
test_df=pd.get_dummies(test_df,columns=["Class"])
test_df=pd.get_dummies(test_df,columns=["Gender"])


training_data = np.asarray(train_df)
training_count = len(training_data[:,0])

testing_data = np.asarray(test_df)
testing_count = len(testing_data[:,0])

In [4]:
def outlierIdentifier(dataFrame,columns):
    outlier_indices=[]
    
    for f in columns:
        
        Quartile1 = np.percentile(dataFrame[f],25)
        Quartile3 = np.percentile(dataFrame[f],75)
        interquartile_range = Quartile3-Quartile1
        outlier_step= interquartile_range * 1.5
        outlier_list_col = dataFrame[(dataFrame[f]< Quartile1 - outlier_step)|( dataFrame[f] > Quartile3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    
    outliers_indices = Counter(outlier_indices)
    multiple_outliers = list(i for i , v in outliers_indices.items() if v>2 )
    return multiple_outliers

In [5]:
# remove outliers
train_df = train_df.drop(outlierIdentifier(train_df,[ 'Age', 'Flight_Distance', 'Inflight_wifi_service',
       'Departure/Arrival_time_convenient', 'Ease_of_Online_booking',
       'Gate_location', 'Food_and_drink', 'Online_boarding', 'Seat_comfort',
       'Inflight_entertainment', 'On-board_service', 'Leg_room_service',
       'Baggage_handling', 'Checkin_service', 'Inflight_service',
       'Cleanliness', 'Departure_Delay_in_Minutes',
       'Arrival_Delay_in_Minutes']),axis = 0).reset_index(drop = True)

In [6]:
train_df.isna().any()

Age                                  False
Flight_Distance                      False
Inflight_wifi_service                False
Departure/Arrival_time_convenient    False
Ease_of_Online_booking               False
Gate_location                        False
Food_and_drink                       False
Online_boarding                      False
Seat_comfort                         False
Inflight_entertainment               False
On-board_service                     False
Leg_room_service                     False
Baggage_handling                     False
Checkin_service                      False
Inflight_service                     False
Cleanliness                          False
Departure_Delay_in_Minutes           False
Arrival_Delay_in_Minutes              True
satisfaction                         False
Customer_Type_Loyal Customer         False
Customer_Type_disloyal Customer      False
Type_of_Travel_Business travel       False
Type_of_Travel_Personal Travel       False
Class_Busin

In [9]:
test_df.isna().any()

Age                                  False
Flight_Distance                      False
Inflight_wifi_service                False
Departure/Arrival_time_convenient    False
Ease_of_Online_booking               False
Gate_location                        False
Food_and_drink                       False
Online_boarding                      False
Seat_comfort                         False
Inflight_entertainment               False
On-board_service                     False
Leg_room_service                     False
Baggage_handling                     False
Checkin_service                      False
Inflight_service                     False
Cleanliness                          False
Departure_Delay_in_Minutes           False
Arrival_Delay_in_Minutes              True
satisfaction                         False
Customer_Type_Loyal Customer         False
Customer_Type_disloyal Customer      False
Type_of_Travel_Business travel       False
Type_of_Travel_Personal Travel       False
Class_Busin

In [6]:
train_df_len=len(train_df)
train_df= pd.concat([train_df,test_df],axis=0).reset_index(drop=True)

train_df["Arrival_Delay_in_Minutes"]=train_df["Arrival_Delay_in_Minutes"].fillna(np.mean(train_df["Arrival_Delay_in_Minutes"]))


In [10]:
train_df.isna().any()

Age                                  False
Flight_Distance                      False
Inflight_wifi_service                False
Departure/Arrival_time_convenient    False
Ease_of_Online_booking               False
Gate_location                        False
Food_and_drink                       False
Online_boarding                      False
Seat_comfort                         False
Inflight_entertainment               False
On-board_service                     False
Leg_room_service                     False
Baggage_handling                     False
Checkin_service                      False
Inflight_service                     False
Cleanliness                          False
Departure_Delay_in_Minutes           False
Arrival_Delay_in_Minutes              True
satisfaction                         False
Customer_Type_Loyal Customer         False
Customer_Type_disloyal Customer      False
Type_of_Travel_Business travel       False
Type_of_Travel_Personal Travel       False
Class_Busin

In [7]:
X_test=train_df[train_df_len:]
X_train=train_df[:train_df_len]

y_train = X_train.satisfaction
y_test = X_test.satisfaction

X_train.drop(labels=["satisfaction"],axis=1,inplace=True)
X_test.drop(labels=["satisfaction"],axis=1,inplace=True)

#Below causes problem when using pd.dummies later
#y_train = y_train.eq('satisfied').mul(1)
#y_test = y_test.eq('satisfied').mul(1)

try:
    os.stat('processed_data')
except:
    os.mkdir('processed_data')  
    

X_train.to_csv("processed_data/X_train.csv")
X_test.to_csv("processed_data/X_test.csv")
y_train.to_csv("processed_data/y_train.csv")
y_test.to_csv("processed_data/y_test.csv")


print("X_train",len(X_train))
print("X_test",len(X_test))
print("y_train",len(y_train))
print("y_test",len(y_test))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


X_train 103864
X_test 25976
y_train 103864
y_test 25976
