In [2]:
# Library Imports
%matplotlib inline
import pandas as pd
from pandas import Grouper
import dateutil.parser
import dateutil.parser
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Initialise Plotly's offline notebook mode so the py.iplot() calls further
# down can render figures inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook — confirm against the Plotly docs.
py.init_notebook_mode(connected=True)

In [27]:
# Load the raw flight records, decoding the file as Latin-1.
flights = pd.read_csv(
    'BrFlights2.csv',
    encoding='latin1',
)

In [29]:
# Display the (rows, columns) dimensions of the freshly loaded frame.
flights.shape

(2542519, 21)

# Questions to be answered

1. How does air traffic activity change during this period?
2. Are there any outliers?
3. Are there any patterns?

In [41]:
# Reduce the DataFrame to what the analysis needs:
#   * columns: only 'Partida.Real' and 'Situacao.Voo'
#   * rows:    only flights whose status is 'Realizado', i.e. the flights
#              that weren't canceled
# The trailing .copy() makes the result an independent DataFrame, so the
# column assignment performed in a later cell does not write into a slice
# of the original frame (which triggers pandas' SettingWithCopyWarning).
flights = flights.loc[
    flights['Situacao.Voo'] == 'Realizado',
    ['Partida.Real', 'Situacao.Voo'],
].copy()

In [42]:
# Preview the first rows after dropping columns and filtering on status.
flights.head()

Unnamed: 0,Partida.Real,Situacao.Voo
0,2016-01-30T08:58:00Z,Realizado
1,2016-01-13T12:13:00Z,Realizado
2,2016-01-29T12:13:00Z,Realizado
3,2016-01-18T12:03:00Z,Realizado
4,2016-01-30T12:13:00Z,Realizado


In [43]:
def convert_time(row):
    """Return the date part of the row's 'Partida.Real' timestamp.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'Partida.Real' (for example a
        DataFrame row) whose value is a timestamp string.

    Returns
    -------
    datetime.date
        The calendar date of the parsed timestamp; the time of day is
        discarded.
    """
    parsed = dateutil.parser.parse(row['Partida.Real'])
    return parsed.date()

In [44]:
# Convert the departure timestamps to date objects.
# This is a vectorised replacement for the row-wise
# flights.apply(convert_time, axis=1), which invokes a Python-level parser
# once per row and is very slow on a frame of this size.
# utc=True keeps the 'Z'-suffixed timestamps in UTC, so .dt.date yields the
# same calendar date the per-row parser produced.
# NOTE(review): assumes every timestamp is UTC ('Z') like the previewed
# rows — confirm if other offsets can occur.
flights['Partida.Real'] = pd.to_datetime(flights['Partida.Real'], utc=True).dt.date

In [45]:
# Preview the frame again: 'Partida.Real' should now hold plain dates.
flights.head()

Unnamed: 0,Partida.Real,Situacao.Voo
0,2016-01-30,Realizado
1,2016-01-13,Realizado
2,2016-01-29,Realizado
3,2016-01-18,Realizado
4,2016-01-30,Realizado


In [None]:
# Cache the converted dates to CSV: the date conversion took a really long
# time, so the next cell reloads this file instead of redoing it.
flights.to_csv('final_dates.csv', index=False)

In [12]:
# Reload the cached dates, parsing 'Partida.Real' as datetimes while reading.
flight_dates = pd.read_csv(
    'final_dates.csv',
    parse_dates=['Partida.Real'],
)

In [4]:
# Preview the reloaded frame to confirm the cached file was read back.
flight_dates.head()

Unnamed: 0,Partida.Real,Situacao.Voo
0,2016-01-30,Realizado
1,2016-01-13,Realizado
2,2016-01-29,Realizado
3,2016-01-18,Realizado
4,2016-01-30,Realizado


In [22]:
# Makes a Panda series with the dates and the number of flights on that day
flights_in_day = flight_dates['Partida.Real'].value_counts()

In [23]:
# Show the first entries of the per-day counts.
flights_in_day.head()

2016-03-11    3252
2016-03-16    3018
2015-02-20    2974
2015-01-30    2970
2015-12-17    2965
Name: Partida.Real, dtype: int64

In [24]:
# Put the daily counts in chronological (index) order.
flights_in_day = flights_in_day.sort_index()

In [25]:
# Remove anomalies to clear data
flights_in_day = flights_in_day[flights_in_day > 100]

In [30]:
# Convert this to a Panda Dataframe so we can plot it with Seaborn
flights_df = pd.DataFrame(data={'Flights': flights_in_day.values, 'Date': flights_in_day.index})

In [22]:
# Single Plotly trace: number of flights (y) against date (x).
data = [
    go.Scatter(
        x=flights_df['Date'],
        y=flights_df['Flights'],
    ),
]

In [25]:
# Figure layout: chart title, x-axis window fixed to calendar year 2016,
# and a labelled y-axis.
layout = {
    'title': 'Flight activity during 2016',
    'xaxis': {'range': ['2016-01-01', '2016-12-31']},
    'yaxis': {'title': 'Flights'},
}
# Bundle the trace list and the layout into the figure passed to py.iplot().
fig = {'data': data, 'layout': layout}

In [26]:
# Render the daily flight-activity figure inline.
py.iplot(fig)

In [45]:
# Label every date with its English weekday name (e.g. 'Monday').
# Series.dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# dt.day_name() is its replacement and returns the same names.
flights_df['DayOfWeek'] = flights_df['Date'].dt.day_name()

In [56]:
# Weekday order used for the rows of the result (and therefore the bars of
# the chart). Defined here because no other cell in the notebook defines
# `days`, so a fresh kernel would otherwise fail with a NameError.
# NOTE(review): Monday-first order is assumed — confirm it matches the
# ordering originally intended.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Mean number of flights per weekday. Only 'Flights' is aggregated so the
# datetime 'Date' column is never averaged; the double brackets keep the
# result a DataFrame, so `.Flights` still works downstream.
flights_by_day_of_week = (
    flights_df
    .groupby('DayOfWeek')[['Flights']]
    .mean()
    .reindex(days)
)

In [66]:
# Bar chart of the mean number of flights for each day of the week.
data2 = [go.Bar(x=flights_by_day_of_week.index, y=flights_by_day_of_week.Flights)]
layout2 = dict(title='Mean value of flights per day', yaxis=dict(title='Flights'))
# Bind only fig2: the original chained assignment `fig2 = fig = ...` also
# overwrote `fig`, the figure of the daily-activity chart built earlier.
fig2 = dict(data=data2, layout=layout2)

In [67]:
# Render the per-weekday bar chart inline.
py.iplot(fig2)