In [1]:
# Importing the basic libraries we will require for the project

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
import warnings;
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Dataset split to process: the value is interpolated into the input and output
# CSV file names, and "test" makes the export cell put the ID column back.
file_set = "test"

In [2]:
# Load the travel and the survey table that belong to the selected split.
travel_orig = pd.read_csv(f"data/Traveldata_{file_set}.csv")
survey_orig = pd.read_csv(f"data/Surveydata_{file_set}.csv")

In [3]:
# Join the travel and survey tables on their shared "ID" key rather than gluing
# them together by row position: a positional concat silently pairs the wrong
# rows if the two files are ever ordered differently.
# - Survey columns that already exist in the travel table (other than "ID")
#   are left out, so the travel copy of any shared column wins.
# - how="left" keeps every travel row, in travel order.
# - validate="one_to_one" makes pandas raise MergeError if an ID is repeated
#   in either file.
data = travel_orig.merge(
    survey_orig.loc[:, ~survey_orig.columns.isin(travel_orig.columns.drop("ID"))],
    on="ID",
    how="left",
    validate="one_to_one",
)

# Keep the identifiers aside (the export cell re-attaches them for the test
# split) and remove them from the feature frame.
IDs = data["ID"]
data = data.drop(columns=["ID"])

In [4]:
# Preview the first five identifiers that were set aside.
IDs.head(n=5)

0    99900001
1    99900002
2    99900003
3    99900004
4    99900005
Name: ID, dtype: int64

In [5]:
# Lookup table from survey answer to a signed power-of-ten score: the further
# an answer is from neutral, the larger its magnitude (1, 10, 100). The empty
# string is scored as 0.
survey_values = {
    answer: score
    for score, answers in (
        (-100, ("Extremely Poor", "Very Inconvenient")),
        (-10, ("Poor", "Inconvenient")),
        (-1, ("Needs Improvement",)),
        (0, ("",)),
        (1, ("Acceptable", "Manageable")),
        (10, ("Good", "Convenient")),
        (100, ("Excellent", "Very Convenient")),
    )
    for answer in answers
}

def numerize(pd, fld, d):
    """Replace the values of one column according to a lookup table.

    Args:
        pd: DataFrame to update. NOTE: inside this function the name hides the
            pandas module alias ``pd``; it is kept so existing callers keep
            working.
        fld: Label of the column to convert.
        d: Dict mapping an original value to its replacement.

    Returns:
        The same DataFrame object that was passed in, with column ``fld``
        overwritten. Values that are not keys of ``d`` are left untouched.
    """
    # One dict-based replace handles every key in a single call instead of
    # re-scanning the whole column once per dictionary entry. An empty table
    # leaves the frame exactly as it was.
    if d:
        pd[fld] = pd[fld].replace(d)
    return pd



In [6]:
# Preview the first five rows while the survey answers are still text.
data.head(n=5)


Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Seat_Class,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,Female,,36.0,Business Travel,Business,532,0.0,0.0,Acceptable,Green Car,...,Needs Improvement,Excellent,Good,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Poor
1,Female,Disloyal Customer,21.0,Business Travel,Business,1425,9.0,28.0,Extremely Poor,Ordinary,...,Acceptable,Poor,Acceptable,Acceptable,Excellent,Acceptable,Good,Acceptable,Excellent,Acceptable
2,Male,Loyal Customer,60.0,Business Travel,Business,2832,0.0,0.0,Excellent,Ordinary,...,Excellent,Excellent,Excellent,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Good,Needs Improvement,Excellent
3,Female,Loyal Customer,29.0,Personal Travel,Eco,1352,0.0,0.0,Acceptable,Green Car,...,Poor,Acceptable,Excellent,Poor,Acceptable,Needs Improvement,Excellent,Excellent,Excellent,Poor
4,Male,Disloyal Customer,18.0,Business Travel,Business,1610,17.0,0.0,Excellent,Ordinary,...,Excellent,Excellent,Excellent,Excellent,,Acceptable,Excellent,Excellent,Excellent,Excellent


In [7]:
# Score every survey-rating column with the shared answer -> number lookup.
for rating_column in (
    "Seat_Comfort",
    "Arrival_Time_Convenient",
    "Catering",
    "Platform_Location",
    "Onboard_Wifi_Service",
    "Onboard_Entertainment",
    "Online_Support",
    "Ease_of_Online_Booking",
    "Onboard_Service",
    "Legroom",
    "Baggage_Handling",
    "CheckIn_Service",
    "Cleanliness",
    "Online_Boarding",
):
    data = numerize(data, rating_column, survey_values)



In [8]:
# Preview the first five rows now that the survey answers are numeric.
data.head(n=5)

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Seat_Class,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,Female,,36.0,Business Travel,Business,532,0.0,0.0,1.0,Green Car,...,-1.0,100.0,10.0,100.0,100.0,100.0,100.0,10.0,100.0,-10.0
1,Female,Disloyal Customer,21.0,Business Travel,Business,1425,9.0,28.0,-100.0,Ordinary,...,1.0,-10.0,1.0,1.0,100.0,1.0,10.0,1.0,100.0,1.0
2,Male,Loyal Customer,60.0,Business Travel,Business,2832,0.0,0.0,100.0,Ordinary,...,100.0,100.0,100.0,-1.0,-1.0,-1.0,-1.0,10.0,-1.0,100.0
3,Female,Loyal Customer,29.0,Personal Travel,Eco,1352,0.0,0.0,1.0,Green Car,...,-10.0,1.0,100.0,-10.0,1.0,-1.0,100.0,100.0,100.0,-10.0
4,Male,Disloyal Customer,18.0,Business Travel,Business,1610,17.0,0.0,100.0,Ordinary,...,100.0,100.0,100.0,100.0,,1.0,100.0,100.0,100.0,100.0


In [9]:
# One-hot encode the categorical features.

# These four features get one indicator column per category
# (get_dummies keeps every category unless drop_first is set).
gender = pd.get_dummies(data["Gender"], prefix="Gender")
customer_type = pd.get_dummies(data["Customer_Type"], prefix="Customer_Type")
type_travel = pd.get_dummies(data["Type_Travel"], prefix="Type_Travel")
travel_class = pd.get_dummies(data["Travel_Class"], prefix="Travel_Class")
# Seat_Class is the exception: its first category is dropped.
seat_classes = pd.get_dummies(data["Seat_Class"], prefix="Seat_Class", drop_first=True)

# Append the indicator columns to the right of the existing columns.
data = pd.concat(
    [data, gender, customer_type, type_travel, travel_class, seat_classes],
    axis="columns",
)
data.head()

Unnamed: 0,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Seat_Class,...,Online_Boarding,Gender_Female,Gender_Male,Customer_Type_Disloyal Customer,Customer_Type_Loyal Customer,Type_Travel_Business Travel,Type_Travel_Personal Travel,Travel_Class_Business,Travel_Class_Eco,Seat_Class_Ordinary
0,Female,,36.0,Business Travel,Business,532,0.0,0.0,1.0,Green Car,...,-10.0,1,0,0,0,1,0,1,0,0
1,Female,Disloyal Customer,21.0,Business Travel,Business,1425,9.0,28.0,-100.0,Ordinary,...,1.0,1,0,1,0,1,0,1,0,1
2,Male,Loyal Customer,60.0,Business Travel,Business,2832,0.0,0.0,100.0,Ordinary,...,100.0,0,1,0,1,1,0,1,0,1
3,Female,Loyal Customer,29.0,Personal Travel,Eco,1352,0.0,0.0,1.0,Green Car,...,-10.0,1,0,0,1,0,1,0,1,0
4,Male,Disloyal Customer,18.0,Business Travel,Business,1610,17.0,0.0,100.0,Ordinary,...,100.0,0,1,1,0,1,0,1,0,1


In [10]:
# Remove the original text-valued categorical columns in a single call.
data = data.drop(
    columns=["Gender", "Customer_Type", "Type_Travel", "Travel_Class", "Seat_Class"]
)

In [11]:
# Summarise column dtypes and non-null counts to see which columns still have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35602 entries, 0 to 35601
Data columns (total 27 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              35591 non-null  float64
 1   Travel_Distance                  35602 non-null  int64  
 2   Departure_Delay_in_Mins          35573 non-null  float64
 3   Arrival_Delay_in_Mins            35479 non-null  float64
 4   Seat_Comfort                     35580 non-null  float64
 5   Arrival_Time_Convenient          32277 non-null  float64
 6   Catering                         32245 non-null  float64
 7   Platform_Location                35590 non-null  float64
 8   Onboard_Wifi_Service             35590 non-null  float64
 9   Onboard_Entertainment            35594 non-null  float64
 10  Online_Support                   35576 non-null  float64
 11  Ease_of_Online_Booking           35584 non-null  float64
 12  Onboard_Service   

In [12]:
# Preview the first five rows of the encoded feature frame.
data.head(n=5)

Unnamed: 0,Age,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,...,Online_Boarding,Gender_Female,Gender_Male,Customer_Type_Disloyal Customer,Customer_Type_Loyal Customer,Type_Travel_Business Travel,Type_Travel_Personal Travel,Travel_Class_Business,Travel_Class_Eco,Seat_Class_Ordinary
0,36.0,532,0.0,0.0,1.0,1.0,1.0,1.0,-1.0,100.0,...,-10.0,1,0,0,0,1,0,1,0,0
1,21.0,1425,9.0,28.0,-100.0,10.0,-10.0,1.0,1.0,-10.0,...,1.0,1,0,1,0,1,0,1,0,1
2,60.0,2832,0.0,0.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,0,1,0,1,1,0,1,0,1
3,29.0,1352,0.0,0.0,1.0,100.0,1.0,100.0,-10.0,1.0,...,-10.0,1,0,0,1,0,1,0,1,0
4,18.0,1610,17.0,0.0,100.0,-100.0,100.0,-1.0,100.0,100.0,...,100.0,0,1,1,0,1,0,1,0,1


In [13]:
# Treat every remaining missing value, in any column, as zero.
data = data.fillna(value=0)

In [14]:
# The test split needs its identifiers back as the leading column. Guarding on
# the column's absence keeps this cell safe to re-run: without the guard a
# second run would prepend a duplicate "ID" column.
if file_set == 'test' and "ID" not in data.columns:
    data = pd.concat([IDs, data], axis=1)
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default — confirm the consumer of this file expects that.
data.to_csv("out/out_{file_set}.csv".format(file_set=file_set))

In [15]:
# Confirm the saved identifiers are unchanged after the export.
IDs.head(n=5)

0    99900001
1    99900002
2    99900003
3    99900004
4    99900005
Name: ID, dtype: int64