# PREPROCESSING DATA PENERBANGAN
Margareta Valencia (A11.2022.14704)

### Import Library

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Import Dataset

In [2]:
dataset = pd.read_csv('flights.csv')
print(dataset.head())

   id  year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0   0  2013      1    1     517.0             515        2.0     830.0   
1   1  2013      1    1     533.0             529        4.0     850.0   
2   2  2013      1    1     542.0             540        2.0     923.0   
3   3  2013      1    1     544.0             545       -1.0    1004.0   
4   4  2013      1    1     554.0             600       -6.0     812.0   

   sched_arr_time  arr_delay  ... flight  tailnum origin dest air_time  \
0             819       11.0  ...   1545   N14228    EWR  IAH    227.0   
1             830       20.0  ...   1714   N24211    LGA  IAH    227.0   
2             850       33.0  ...   1141   N619AA    JFK  MIA    160.0   
3            1022      -18.0  ...    725   N804JB    JFK  BQN    183.0   
4             837      -25.0  ...    461   N668DN    LGA  ATL    116.0   

   distance  hour  minute            time_hour                    name  
0      1400     5      15  2013-01-01

In [3]:
# Hapus kolom yang tidak diinginkan
dataset.drop(['id', 'year', 'flight', 'tailnum', 'time_hour', 'minute', 'hour', 'carrier'], axis=1, inplace=True)

In [4]:
dataset.columns

Index(['month', 'day', 'dep_time', 'sched_dep_time', 'dep_delay', 'arr_time',
       'sched_arr_time', 'arr_delay', 'origin', 'dest', 'air_time', 'distance',
       'name'],
      dtype='object')

In [5]:
# Memisahkan fitur (X) dan target (y)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
print (x)

[[1 1 517.0 ... 'IAH' 227.0 1400]
 [1 1 533.0 ... 'IAH' 227.0 1416]
 [1 1 542.0 ... 'MIA' 160.0 1089]
 ...
 [9 30 nan ... 'BNA' nan 764]
 [9 30 nan ... 'CLE' nan 419]
 [9 30 nan ... 'RDU' nan 431]]


In [7]:
print (y)

['United Air Lines Inc.' 'United Air Lines Inc.' 'American Airlines Inc.'
 ... 'Envoy Air' 'Envoy Air' 'Envoy Air']


### Menghilangkan Missing Value

In [8]:
# periksa nilai yang hilang
print(dataset.isnull().sum().sort_values(ascending=False))

arr_delay         9430
air_time          9430
arr_time          8713
dep_time          8255
dep_delay         8255
month                0
day                  0
sched_dep_time       0
sched_arr_time       0
origin               0
dest                 0
distance             0
name                 0
dtype: int64


In [9]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

In [10]:
print (x)

[[1 1.0 517.0 ... 'IAH' 227.0 1400]
 [1 1.0 533.0 ... 'IAH' 227.0 1416]
 [1 1.0 542.0 ... 'MIA' 160.0 1089]
 ...
 [9 30.0 1349.1099473093045 ... 'BNA' nan 764]
 [9 30.0 1349.1099473093045 ... 'CLE' nan 419]
 [9 30.0 1349.1099473093045 ... 'RDU' nan 431]]


In [11]:
dataset.dropna(subset=['arr_delay'], inplace=True)
dataset.reset_index(drop=True, inplace=True)

### Encoding Data Kategori 

In [27]:
# Identify categorical columns
cat_columns = x.select_dtypes(include=['object']).columns

# Check the number of unique categories in each categorical feature
x[cat_columns].nunique()

AttributeError: 'numpy.ndarray' object has no attribute 'select_dtypes'

In [13]:
print (x)

[[1.0 0.0 0.0 ... 'IAH' 227.0 1400]
 [1.0 0.0 0.0 ... 'IAH' 227.0 1416]
 [1.0 0.0 0.0 ... 'MIA' 160.0 1089]
 ...
 [0.0 0.0 0.0 ... 'BNA' nan 764]
 [0.0 0.0 0.0 ... 'CLE' nan 419]
 [0.0 0.0 0.0 ... 'RDU' nan 431]]


In [14]:
from sklearn.preprocessing import LabelEncoder
# Encoding target "Purchased" menggunakan LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
print (y)

[14 14  2 ...  5  5  5]


### Membagi Dataset ke Dalam Training Set dan Test Set

In [17]:
from sklearn.model_selection import train_test_split

# test_size = 0.2 berarti 20% dari data akan digunakan untuk pengujian
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 1)

In [18]:
print (x_train)

[[0.0 0.0 0.0 ... 'BTV' 46.0 266]
 [0.0 0.0 0.0 ... 'MSP' 143.0 1020]
 [0.0 1.0 0.0 ... 'BUF' 53.0 301]
 ...
 [0.0 0.0 0.0 ... 'CLT' 86.0 529]
 [0.0 0.0 0.0 ... 'ORD' 102.0 719]
 [0.0 1.0 0.0 ... 'FLL' 154.0 1065]]


In [19]:
print (x_test)

[[0.0 0.0 0.0 ... 'LAS' 316.0 2248]
 [0.0 0.0 1.0 ... 'BOS' 36.0 187]
 [0.0 0.0 0.0 ... 'AUS' 220.0 1521]
 ...
 [1.0 0.0 0.0 ... 'SFO' 377.0 2565]
 [1.0 0.0 0.0 ... 'MIA' 153.0 1096]
 [1.0 0.0 0.0 ... 'JAX' 135.0 820]]


In [20]:
print (y_train)

[ 6  3  9 ...  6  5 14]


In [21]:
print (y_test)

[ 3  9  3 ... 14  2  6]


### Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler
# Menerapkan standarisasi pada fitur numerik (mulai dari kolom ke-4)
scaler = StandardScaler()
x_train[:, 3:] = scaler.fit_transform(x_train[:, 3:])
x_test[:, 3:] = scaler.transform(x_test[:,  3:])

ValueError: could not convert string to float: 'EWR'