# PREPROCESSING DATA PENERBANGAN
Margareta Valencia (A11.2022.14704)

### Import Library

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Import Dataset

In [2]:
df = pd.read_csv('flights.csv')
print(df.head())

   id  year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0   0  2013      1    1     517.0             515        2.0     830.0   
1   1  2013      1    1     533.0             529        4.0     850.0   
2   2  2013      1    1     542.0             540        2.0     923.0   
3   3  2013      1    1     544.0             545       -1.0    1004.0   
4   4  2013      1    1     554.0             600       -6.0     812.0   

   sched_arr_time  arr_delay  ... flight  tailnum origin dest air_time  \
0             819       11.0  ...   1545   N14228    EWR  IAH    227.0   
1             830       20.0  ...   1714   N24211    LGA  IAH    227.0   
2             850       33.0  ...   1141   N619AA    JFK  MIA    160.0   
3            1022      -18.0  ...    725   N804JB    JFK  BQN    183.0   
4             837      -25.0  ...    461   N668DN    LGA  ATL    116.0   

   distance  hour  minute            time_hour                    name  
0      1400     5      15  2013-01-01

In [3]:
# Memisahkan fitur (X) dan target (y)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [4]:
print(X)

[[0 2013 1 ... 5 15 '2013-01-01 05:00:00']
 [1 2013 1 ... 5 29 '2013-01-01 05:00:00']
 [2 2013 1 ... 5 40 '2013-01-01 05:00:00']
 ...
 [336773 2013 9 ... 12 10 '2013-09-30 12:00:00']
 [336774 2013 9 ... 11 59 '2013-09-30 11:00:00']
 [336775 2013 9 ... 8 40 '2013-09-30 08:00:00']]


### Menghilangkan Missing Value

In [5]:
# periksa nilai yang hilang
print(df.isnull().sum().sort_values(ascending=False))

arr_delay         9430
air_time          9430
arr_time          8713
dep_time          8255
dep_delay         8255
tailnum           2512
id                   0
origin               0
time_hour            0
minute               0
hour                 0
distance             0
dest                 0
carrier              0
flight               0
year                 0
sched_arr_time       0
sched_dep_time       0
day                  0
month                0
name                 0
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 3:6])
X[:, 3:6] = imputer.transform(X[:, 3:6])

In [7]:
print(X)

[[0 2013 1 ... 5 15 '2013-01-01 05:00:00']
 [1 2013 1 ... 5 29 '2013-01-01 05:00:00']
 [2 2013 1 ... 5 40 '2013-01-01 05:00:00']
 ...
 [336773 2013 9 ... 12 10 '2013-09-30 12:00:00']
 [336774 2013 9 ... 11 59 '2013-09-30 11:00:00']
 [336775 2013 9 ... 8 40 '2013-09-30 08:00:00']]


### Encoding Data Kategori 

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Ubah kolom 'Carrier' bertipe string ke numerik menggunakan OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [9]:
# Tampilkan hasil
print(X)

[[1.0 0.0 0.0 ... 5 15 '2013-01-01 05:00:00']
 [1.0 0.0 0.0 ... 5 29 '2013-01-01 05:00:00']
 [1.0 0.0 0.0 ... 5 40 '2013-01-01 05:00:00']
 ...
 [0.0 0.0 0.0 ... 12 10 '2013-09-30 12:00:00']
 [0.0 0.0 0.0 ... 11 59 '2013-09-30 11:00:00']
 [0.0 0.0 0.0 ... 8 40 '2013-09-30 08:00:00']]


In [10]:
from sklearn.preprocessing import LabelEncoder
# Encoding target "Anaemic" menggunakan LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [11]:
print(y)

[14 14  2 ...  5  5  5]


### Membagi Dataset ke Dalam Training Set dan Test Set

In [12]:
from sklearn.model_selection import train_test_split

# test_size = 0.2 berarti 20% dari data akan digunakan untuk pengujian
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 1)

In [13]:
print(X_train)

[[0.0 0.0 0.0 ... 13 40 '2013-06-17 13:00:00']
 [0.0 0.0 0.0 ... 6 10 '2013-06-11 06:00:00']
 [0.0 1.0 0.0 ... 14 59 '2013-02-07 14:00:00']
 ...
 [0.0 0.0 0.0 ... 12 26 '2013-09-04 12:00:00']
 [0.0 0.0 0.0 ... 21 5 '2013-07-18 21:00:00']
 [0.0 1.0 0.0 ... 6 52 '2013-02-20 06:00:00']]


In [14]:
print (X_test)

[[0.0 0.0 0.0 ... 14 59 '2013-12-13 14:00:00']
 [0.0 0.0 1.0 ... 22 55 '2013-03-18 22:00:00']
 [0.0 0.0 0.0 ... 17 0 '2013-06-07 17:00:00']
 ...
 [1.0 0.0 0.0 ... 19 5 '2013-01-09 19:00:00']
 [1.0 0.0 0.0 ... 9 40 '2013-01-25 09:00:00']
 [1.0 0.0 0.0 ... 8 43 '2013-01-18 08:00:00']]


In [15]:
print (y_train)

[ 6  3  9 ...  6  5 14]


In [16]:
print (y_test)

[ 3  9  3 ... 14  2  6]
