In [193]:
import pandas as pd
pd.set_option("display.max_columns", None)

In [206]:
df = pd.read_csv('flights.csv')
df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,0,363169,5290975,2015,11,28,6,DL,1590,N971AT,CAE,ATL,700,655.0,-5.0,17.0,712.0,68.0,61.0,39.0,192,751.0,5.0,808,756.0,-12.0,0,0,,,,,,
1,1,704061,4899614,2015,11,2,1,AA,2516,N3CVAA,ORD,DEN,1715,1743.0,28.0,40.0,1823.0,164.0,177.0,126.0,888,1929.0,11.0,1859,1940.0,41.0,0,0,,13.0,0.0,28.0,0.0,0.0
2,2,389056,3769408,2015,8,22,6,AS,93,N317AS,SEA,ANC,1355,1353.0,-2.0,20.0,1413.0,220.0,206.0,182.0,1448,1615.0,4.0,1635,1619.0,-16.0,0,0,,,,,,
3,3,132167,4999624,2015,11,9,1,AA,2383,N871AA,MCO,DFW,650,652.0,2.0,16.0,708.0,181.0,172.0,142.0,985,830.0,14.0,851,844.0,-7.0,0,0,,,,,,
4,4,304371,2572568,2015,6,12,5,US,1978,N833AW,CLE,CLT,705,700.0,-5.0,12.0,712.0,93.0,92.0,71.0,430,823.0,9.0,838,832.0,-6.0,0,0,,,,,,


In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274964 entries, 0 to 274963
Data columns (total 34 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0.2         274964 non-null  int64  
 1   Unnamed: 0           274964 non-null  int64  
 2   Unnamed: 0.1         274964 non-null  int64  
 3   YEAR                 274964 non-null  int64  
 4   MONTH                274964 non-null  int64  
 5   DAY                  274964 non-null  int64  
 6   DAY_OF_WEEK          274964 non-null  int64  
 7   AIRLINE              274964 non-null  object 
 8   FLIGHT_NUMBER        274964 non-null  int64  
 9   TAIL_NUMBER          274225 non-null  object 
 10  ORIGIN_AIRPORT       274964 non-null  object 
 11  DESTINATION_AIRPORT  274964 non-null  object 
 12  SCHEDULED_DEPARTURE  274964 non-null  int64  
 13  DEPARTURE_TIME       270719 non-null  float64
 14  DEPARTURE_DELAY      270719 non-null  float64
 15  TAXI_OUT         

In [207]:
df.drop(columns=['Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

> We notice that there are many columns we need to edit their data type first:
> * There are many Null values here.
>
>
> * `SCHEDULED_DEPARTURE`, `DEPARTURE_TIME`, `SCHEDULED_TIME`, `ELAPSED_TIME`, `AIR_TIME`, `SCHEDULED_ARRIVAL`, `ARRIVAL_TIME` **All needed to be converted to string to handle the missed zeros then to time.**
>
>
> * Most values of `CANCELLATION_REASON` is NULL so we will replace them by the reason "Security" 
>  * And replacing the values of this column by readable values.
> 

> 
>

### Operating with Null values from columns.

In [197]:
df.isnull().sum()

YEAR                        0
MONTH                       0
DAY                         0
DAY_OF_WEEK                 0
AIRLINE                     0
FLIGHT_NUMBER               0
TAIL_NUMBER               739
ORIGIN_AIRPORT              0
DESTINATION_AIRPORT         0
SCHEDULED_DEPARTURE         0
DEPARTURE_TIME           4245
DEPARTURE_DELAY          4245
TAXI_OUT                 4393
WHEELS_OFF               4393
SCHEDULED_TIME              0
ELAPSED_TIME             5191
AIR_TIME                 5191
DISTANCE                    0
WHEELS_ON                4587
TAXI_IN                  4587
SCHEDULED_ARRIVAL           0
ARRIVAL_TIME             4587
ARRIVAL_DELAY            5191
DIVERTED                    0
CANCELLED                   0
CANCELLATION_REASON    270531
AIR_SYSTEM_DELAY       223101
SECURITY_DELAY         223101
AIRLINE_DELAY          223101
LATE_AIRCRAFT_DELAY    223101
WEATHER_DELAY          223101
dtype: int64

> We will drop the columns that has more than 10,000 Null values. **They are the last 5 columns.**

In [208]:
needed_columns = df.columns[range(-5, 0, 1)]
df.drop(columns= needed_columns, inplace=True)
df.head(1)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON
0,2015,11,28,6,DL,1590,N971AT,CAE,ATL,700,655.0,-5.0,17.0,712.0,68.0,61.0,39.0,192,751.0,5.0,808,756.0,-12.0,0,0,


> Droping rows that has null values in `AIR_TIME` column as now it has the most null values but we need it.

In [209]:
df = df[df.AIR_TIME.isnull() == False]
print('Number of null values now is: ', df.isnull().sum().sum())

Number of null values now is:  269773


### Cleaning time columns
`SCHEDULED_DEPARTURE`, `DEPARTURE_TIME`, `SCHEDULED_TIME`, `ELAPSED_TIME`, `AIR_TIME`, `SCHEDULED_ARRIVAL`, `ARRIVAL_TIME` 

> We need to make them on the format **HH:MM** so:

In [210]:
# Getting the columns names in one place.
time_columns = ['SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME']

In [223]:
def adding_zeros(row_value):
    '''
    Input: the row value come from the getting_time_form function.
    Output: the new format after adding the needed number of zeos
    
    '''
    nedded_number_of_zeors = 4 - len(row_value)
    return  '0' *  nedded_number_of_zeors + row_value

def getting_time_form(row_val):
    '''
    
    Input: the current value in the column form convert_to_time function
    Output: returning it on the format HH : MM, but still string
    
    '''
    value = adding_zeros(row_val)
    return value[0:2] + ':' + value[2: 4]

def convert_to_time(row_value):
    '''
    Input: the row value.
    Output: converting the row value from string to time.
    
    '''
    formatted_string_value = getting_time_form(row_value)
    value_as_time = pd.to_datetime(formatted_string_value).time()
    
    return value_as_time
    
    

In [224]:
size = df.size

for column in time_columns:
    df[column] = df[column].astype('int') # to get rid of the decimal sign.
    df[column] = df[column].astype('str') # to handel its length and add easily zeros.
    
    # Converting the value from the shape '####' to the shape HH:MM
    df[column] = df[column].apply(lambda val: convert_to_time(val))
    
df[time_columns].head(2)

ParserError: hour must be in 0..23: 24:00

> **Now we converting them into time values.**

In [None]:
from datetime import datetime

### Cleaning the `CANCELLATION_REASON` columne

In [None]:
df.CANCELLATION_REASON.value_counts()

In [None]:
def get_reasons(reason):
    if reason == "A":
        return "Airline/Carrier"
    elif reason == "B":
        return "Weather"
    elif reason == "C":
        return "National Air System"
    else:
        return "Security"

In [None]:
df['CANCELLATION_REASON'] = df.CANCELLATION_REASON.apply(lambda reason: get_reasons(reason))
df.CANCELLATION_REASON.value_counts()

### Cleaning the `CANCELLATION_REASON` columne