In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
df = pd.read_csv("airline.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9882798 entries, 0 to 9882797
Data columns (total 29 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ActualElapsedTime  float64
 1   AirTime            float64
 2   ArrDelay           float64
 3   ArrTime            float64
 4   CRSArrTime         int64  
 5   CRSDepTime         int64  
 6   CRSElapsedTime     float64
 7   CancellationCode   object 
 8   Cancelled          int64  
 9   CarrierDelay       float64
 10  DayOfWeek          int64  
 11  DayofMonth         int64  
 12  DepDelay           float64
 13  DepTime            float64
 14  Dest               object 
 15  Distance           float64
 16  Diverted           int64  
 17  FlightNum          int64  
 18  LateAircraftDelay  float64
 19  Month              int64  
 20  NASDelay           float64
 21  Origin             object 
 22  SecurityDelay      float64
 23  TailNum            object 
 24  TaxiIn             float64
 25  TaxiOut           

### Exploring non numeric Columns

In [5]:
df_objs = df.select_dtypes(include=[object])

In [6]:
df_objs.head()

Unnamed: 0,CancellationCode,Dest,Origin,TailNum,UniqueCarrier
0,,DFW,ORD,N293AA,AA
1,,MSP,DTW,N8921E,NW
2,,GRR,CVG,N331DL,DL
3,,DTW,MDW,,ML (1)
4,,FLL,PHL,N512AU,US


In [7]:
# Finding unique values for each row with non-integer values

print(f"Unique values for Cancellation Code: {df['CancellationCode'].unique()}\n")

print(f"Number of unique values for Cancellation Code: {len(df['CancellationCode'].unique())}")
print(f"Number of unique values for Dest: {len(df['Dest'].unique())}")
print(f"Number of unique values for Origin: {len(df['Origin'].unique())}")
print(f"Number of unique values for TailNum: {len(df['TailNum'].unique())}")
print(f"Number of unique values for UniqueCarrier: {len(df['UniqueCarrier'].unique())}")

Unique values for Cancellation Code: [nan 'C' 'A' 'B' 'D']

Number of unique values for Cancellation Code: 5
Number of unique values for Dest: 339
Number of unique values for Origin: 339
Number of unique values for TailNum: 13421
Number of unique values for UniqueCarrier: 29


In [8]:
# Finding number of nan values present

df.isnull().sum()

ActualElapsedTime     206652
AirTime              3141004
ArrDelay              206652
ArrTime               206399
CRSArrTime                 0
CRSDepTime                 0
CRSElapsedTime          2074
CancellationCode     9824178
Cancelled                  0
CarrierDelay         7145066
DayOfWeek                  0
DayofMonth                 0
DepDelay              183923
DepTime               183923
Dest                       0
Distance               16155
Diverted                   0
FlightNum                  0
LateAircraftDelay    7145066
Month                      0
NASDelay             7145066
Origin                     0
SecurityDelay        7145066
TailNum              2990853
TaxiIn               2991662
TaxiOut              2990517
UniqueCarrier              0
WeatherDelay         7145066
Year                       0
dtype: int64

In [9]:
df["AirTime"].unique()

array([  122.,   125.,    71., ..., -1266.,  -419.,  1140.])

In [10]:
len(df["AirTime"].unique())

1048

In [11]:
df["CancellationCode"].unique()

array([nan, 'C', 'A', 'B', 'D'], dtype=object)

In [12]:
useful_cols = df.columns[df.isnull().sum() < 7145066].to_list()

useful_cols.remove("TailNum")

In [13]:
df[useful_cols].isnull().sum()

ActualElapsedTime     206652
AirTime              3141004
ArrDelay              206652
ArrTime               206399
CRSArrTime                 0
CRSDepTime                 0
CRSElapsedTime          2074
Cancelled                  0
DayOfWeek                  0
DayofMonth                 0
DepDelay              183923
DepTime               183923
Dest                       0
Distance               16155
Diverted                   0
FlightNum                  0
Month                      0
Origin                     0
TaxiIn               2991662
TaxiOut              2990517
UniqueCarrier              0
Year                       0
dtype: int64

In [14]:
df["AirTime"]

0          122.0
1          125.0
2           71.0
3            NaN
4          135.0
           ...  
9882793     93.0
9882794      NaN
9882795     61.0
9882796    124.0
9882797      NaN
Name: AirTime, Length: 9882798, dtype: float64

In [15]:
pd.set_option('display.max_columns', None)
df[useful_cols].head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,Cancelled,DayOfWeek,DayofMonth,DepDelay,DepTime,Dest,Distance,Diverted,FlightNum,Month,Origin,TaxiIn,TaxiOut,UniqueCarrier,Year
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,0,3,21,81.0,1616.0,DFW,802.0,0,2337,6,ORD,8.0,24.0,AA,2006
1,159.0,125.0,59.0,1703.0,1604,1510,114.0,0,2,16,14.0,1524.0,MSP,528.0,0,787,9,DTW,19.0,15.0,NW,1997
2,,71.0,,,1140,1037,63.0,1,7,5,,,GRR,268.0,0,1610,2,CVG,4.0,15.0,DL,1995
3,70.0,,80.0,20.0,2300,2100,60.0,0,2,5,70.0,2210.0,DTW,229.0,0,318,2,MDW,,,ML (1),1991
4,150.0,135.0,5.0,2030.0,2025,1740,165.0,0,6,1,20.0,1800.0,FLL,992.0,0,803,2,PHL,3.0,12.0,US,1997


In [16]:
len(df.loc[df["Diverted"] == 1])

22641

In [17]:
new_df = df[useful_cols]

In [18]:
new_df["ArrTime"].fillna(0)

0          1850.0
1          1703.0
2             0.0
3            20.0
4          2030.0
            ...  
9882793     906.0
9882794     708.0
9882795     736.0
9882796       3.0
9882797       0.0
Name: ArrTime, Length: 9882798, dtype: float64

In [19]:
new_df.head(20)

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,Cancelled,DayOfWeek,DayofMonth,DepDelay,DepTime,Dest,Distance,Diverted,FlightNum,Month,Origin,TaxiIn,TaxiOut,UniqueCarrier,Year
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,0,3,21,81.0,1616.0,DFW,802.0,0,2337,6,ORD,8.0,24.0,AA,2006
1,159.0,125.0,59.0,1703.0,1604,1510,114.0,0,2,16,14.0,1524.0,MSP,528.0,0,787,9,DTW,19.0,15.0,NW,1997
2,,71.0,,,1140,1037,63.0,1,7,5,,,GRR,268.0,0,1610,2,CVG,4.0,15.0,DL,1995
3,70.0,,80.0,20.0,2300,2100,60.0,0,2,5,70.0,2210.0,DTW,229.0,0,318,2,MDW,,,ML (1),1991
4,150.0,135.0,5.0,2030.0,2025,1740,165.0,0,6,1,20.0,1800.0,FLL,992.0,0,803,2,PHL,3.0,12.0,US,1997
5,110.0,,10.0,1450.0,1440,1300,100.0,0,2,21,0.0,1300.0,CLE,475.0,0,283,2,BDL,,,CO,1989
6,26.0,,-8.0,1254.0,1302,1235,27.0,0,7,14,-7.0,1228.0,BTM,65.0,0,1647,11,BZN,,,DL,1993
7,50.0,41.0,-1.0,1320.0,1321,1230,51.0,0,3,11,0.0,1230.0,LGA,214.0,0,1752,6,DCA,2.0,7.0,DL,1997
8,293.0,272.0,-1.0,1543.0,1544,750,294.0,0,2,8,0.0,750.0,IAD,2288.0,0,946,5,LAX,8.0,13.0,UA,2007
9,65.0,40.0,0.0,2005.0,2005,1900,65.0,0,3,28,0.0,1900.0,MDW,251.0,0,371,2,STL,3.0,22.0,WN,1996


In [20]:
new_df["ArrDelay"].unique()

array([  90.,   59.,   nan, ..., -134., 1195., 1071.])

In [21]:
new_df["Distance"] = new_df["Distance"].fillna(new_df["Distance"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["Distance"] = new_df["Distance"].fillna(new_df["Distance"].mean())


In [22]:
new_df["CRSElapsedTime"].fillna(0)

0          145.0
1          114.0
2           63.0
3           60.0
4          165.0
           ...  
9882793    119.0
9882794    148.0
9882795     80.0
9882796    150.0
9882797     40.0
Name: CRSElapsedTime, Length: 9882798, dtype: float64

In [23]:
new_df.isnull().sum()

ActualElapsedTime     206652
AirTime              3141004
ArrDelay              206652
ArrTime               206399
CRSArrTime                 0
CRSDepTime                 0
CRSElapsedTime          2074
Cancelled                  0
DayOfWeek                  0
DayofMonth                 0
DepDelay              183923
DepTime               183923
Dest                       0
Distance                   0
Diverted                   0
FlightNum                  0
Month                      0
Origin                     0
TaxiIn               2991662
TaxiOut              2990517
UniqueCarrier              0
Year                       0
dtype: int64