# Data Visualization 3 - Lecture 3

---



In [None]:
import plotly.express as px
import pandas as pd

# Warmup

Warmup exercises for the 5 core chart types on a new dataset.

## The dataset

The dataset has 20k rows and 18 columns and consists of airline ratings from various customers.

### Dataset discovery

In [None]:
# Read the airline ratings CSV straight from the course web server.
df_scores = pd.read_csv(
    'https://biconsulting.hu/download/ceu2022/airline_scores.csv'
)

# Last expression of the cell: shows the (rows, columns) tuple.
df_scores.shape

(20000, 18)

In [None]:
# Alternative source: the same CSV served from the public GitHub repo.
df_scores = pd.read_csv(
    'https://raw.githubusercontent.com/bencearato/data/master/airline_scores.csv'
)

# Last expression of the cell: shows the (rows, columns) tuple.
df_scores.shape



(20000, 18)

### Adding the Cnt column

In [None]:
# Reload the data and append a constant 'Cnt' column (1 per row), so that
# summing 'Cnt' within a group yields the number of records in that group.
df_scores = pd.read_csv(
    'https://biconsulting.hu/download/ceu2022/airline_scores.csv'
).assign(Cnt=1)
df_scores

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Cleanliness,Food and Drink,Baggage Handling,Satisfaction,Cnt
0,1,Male,48,First-time,Business,Business,821,2,5.0,4,3,3,3,5,5,5,5,Neutral or Dissatisfied,1
1,2,Female,35,Returning,Business,Business,821,26,39.0,3,5,2,5,4,5,3,5,Satisfied,1
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,5,4,3,5,5,5,3,Satisfied,1
3,4,Male,50,Returning,Business,Business,1905,0,0.0,3,4,2,5,5,4,4,5,Satisfied,1
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,5,3,3,4,5,4,3,Satisfied,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,Female,45,Returning,Business,Economy,599,0,0.0,2,4,5,4,3,2,5,4,Satisfied,1
19996,19997,Female,37,Returning,Business,Economy,589,29,30.0,3,5,2,3,5,5,5,3,Satisfied,1
19997,19998,Male,25,First-time,Business,Economy,589,2,13.0,5,3,2,3,2,2,2,2,Neutral or Dissatisfied,1
19998,19999,Male,53,Returning,Business,Business,2517,1,0.0,3,1,1,4,1,5,5,4,Satisfied,1


## Bar chart examples


### Record count by Class
How many records do we have for each class type?


In [None]:
# First attempt: a plain bar chart drawn directly from the raw rows
# (each row carries Cnt == 1).
px.bar(
    df_scores,
    x='Class',
    y='Cnt',
    color='Class',
)

In [None]:
# Second attempt: a histogram over the same columns.
px.histogram(
    df_scores,
    x='Class',
    y='Cnt',
    color='Class',
)

In [None]:
# Same histogram, now with automatic text labels and a template.
# Available plotly templates: https://plotly.com/python/templates

px.histogram(
    df_scores,
    x='Class',
    y='Cnt',
    color='Class',
    text_auto=True,
    template='simple_white',
)

In [None]:
# A bar chart works as well once the data is summarized first:
# one row per Class holding the summed Cnt.
df_sum = (
    df_scores
    .groupby('Class')['Cnt']
    .sum()
    .reset_index()
)
px.bar(
    df_sum,
    x='Class',
    y='Cnt',
    color='Class',
    text_auto=True,
    template='ggplot2',
)

### Record Count by Seat Comfort

In [None]:
# Plain bar chart of the raw rows, one bar position per Seat Comfort score.
px.bar(
    df_scores,
    x='Seat Comfort',
    y='Cnt',
)

In [None]:
# Histogram variant.
# The result is not very readable because Seat Comfort is a numerical variable.
px.histogram(
    df_scores,
    x='Seat Comfort',
    y='Cnt',
    text_auto=True,
)

In [None]:
# Summarizing first (one row per Seat Comfort score) and then drawing a
# bar chart works well here.
df_sum = (
    df_scores
    .groupby('Seat Comfort')['Cnt']
    .sum()
    .reset_index()
)
px.bar(
    df_sum,
    x='Seat Comfort',
    y='Cnt',
    text_auto=True,
)


### Seat Comfort by Class
What's the average Seat Comfort score for each class?

In [None]:
# Average the Seat Comfort variable within each Class;
# mean() is used to get the average.
df_sum = (
    df_scores
    .groupby('Class')['Seat Comfort']
    .mean()
    .reset_index()
)
df_sum


px.bar(
    df_sum,
    x='Class',
    y='Seat Comfort',
    color='Class',
    text_auto=True,
    template='simple_white',
)

### Using placeholder variables


In [None]:
# A placeholder variable for the grouping column makes summarized charts
# easy to repeat: the aggregation and the chart both read the column name
# from _dim, so only the value of _dim changes between charts.
# First pass aggregates by Seat Comfort, second pass by Cleanliness.
for _dim in ('Seat Comfort', 'Cleanliness'):
    df_sum = df_scores.groupby(_dim)['Cnt'].sum().reset_index()
    px.bar(df_sum, x=_dim, y='Cnt', text_auto=True).show()


## Bar chart Practice

### Count by Customer type

Let's visualize the distribution of different customer types.

#### Code

In [None]:
# Build (but do not yet display) the record count per Customer Type.
barCustomer = px.histogram(
    df_scores,
    x='Customer Type',
    y='Cnt',
    text_auto=True,
    template='simple_white',
)

#### Chart

In [None]:
# Render the Customer Type figure built in the Code cell above.
barCustomer.show()

### Count by On-board Service

Let's visualize the ratings of the on-board service on a bar chart.

#### Code

In [None]:
# Title reused by the On-board Service bar chart.
title = 'Ratings of the On-board Service'

# Number of records per On-board Service score.
df_obs = (
    df_scores
    .groupby('On-board Service')['Cnt']
    .sum()
    .reset_index()
)

df_obs

Unnamed: 0,On-board Service,Cnt
0,0,1
1,1,2410
2,2,2873
3,3,4333
4,4,5780
5,5,4603


In [None]:
# Build the On-board Service bar chart from the summarized frame.
# The figure is only constructed here; it is rendered once, by the
# barOBS.show() call in the Chart cell, instead of being drawn twice.
barOBS = px.bar(
    df_obs,
    x='On-board Service',
    y='Cnt',
    text_auto=True,
    title=title,
    template='simple_white',
)

#### Chart

In [None]:
# Render the On-board Service figure.
barOBS.show()

### Count by Class and Gender

Let's visualize how many male and female customers use the different class types on a grouped bar chart.

#### Code

In [None]:
# Record count for every (Class, Gender) combination.
df_grouped = df_scores.groupby(['Class', 'Gender'])['Cnt'].sum().reset_index()

# Grouped bar chart: one cluster per Class, one bar per Gender.
# The figure is only constructed here; it is rendered once, by the
# groupedBar.show() call in the Chart cell, instead of being drawn twice.
groupedBar = px.bar(
    df_grouped,
    x='Class',
    y='Cnt',
    color='Gender',
    barmode='group',
    text_auto=True,
)

#### Chart

In [None]:
# Render the Class/Gender grouped bar figure.
groupedBar.show()

## Line chart - Distance

Let's visualize the flight distance values on a line chart.


#### Code

In [None]:
# Grouping column for the line chart section.
_dim = 'Flight Distance'

# Number of records for each distinct Flight Distance value.
df_sum = (
    df_scores
    .groupby(_dim)['Cnt']
    .sum()
    .reset_index()
)


#### Chart

In [None]:
# Line chart of record count against the column named by _dim.
px.line(
    df_sum,
    x=_dim,
    y='Cnt',
)

In [None]:
# Recompute the per-_dim record counts before filtering them.
df_sum = (
    df_scores
    .groupby(_dim)['Cnt']
    .sum()
    .reset_index()
)

#### Filtering for distances smaller than 1500 

In [None]:
# Column names containing spaces must be escaped with backticks in query().
df_sum2 = df_sum.query(
    ' `Flight Distance` < 1500'
)
df_sum2

Unnamed: 0,Flight Distance,Cnt
0,67,18
1,74,36
2,84,10
3,89,68
4,95,25
...,...,...
533,1250,10
534,1290,3
535,1325,7
536,1371,16


In [None]:
# Same line chart, restricted to the filtered frame.
px.line(
    df_sum2,
    x=_dim,
    y='Cnt',
)

## Scatter plot - Distance and Delay

Let's compare the Flight Distance and Arrival Delay values for male customers who are older than 20.

#### Code

In [None]:
# Keep only male customers older than 20 for the scatter plot.
# (The filtered frame must not be reassigned to df_scores afterwards,
# otherwise the filter is discarded and every customer is plotted.)
df_scatter = df_scores.query('Gender == "Male" and Age > 20')

df_scatter

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Cleanliness,Food and Drink,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,4,3,3,3,5,5,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,3,5,2,5,4,5,3,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,5,4,3,5,5,5,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,3,4,2,5,5,4,4,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,5,3,3,4,5,4,3,Satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,Female,45,Returning,Business,Economy,599,0,0.0,2,4,5,4,3,2,5,4,Satisfied
19996,19997,Female,37,Returning,Business,Economy,589,29,30.0,3,5,2,3,5,5,5,3,Satisfied
19997,19998,Male,25,First-time,Business,Economy,589,2,13.0,5,3,2,3,2,2,2,2,Neutral or Dissatisfied
19998,19999,Male,53,Returning,Business,Business,2517,1,0.0,3,1,1,4,1,5,5,4,Satisfied


#### Chart

In [None]:
# Scatter plot of Arrival Delay against Flight Distance, colored by Class,
# at a fixed 900x600 size.
_title = 'Comparing Distance and Arrival delays'
scatter = px.scatter(
    df_scatter,
    x='Flight Distance',
    y='Arrival Delay',
    color='Class',
    width=900,
    height=600,
    title=_title,
)

scatter.show()

## Pie chart - Satisfaction

Let's visualize the satisfaction of all customers on a pie chart. Each slice should show its label, percentage, and value.

#### Code

In [None]:
# Chart title, passed to px.pie in the Chart cell below.
title = 'How satisfied are our customers?'

#### Chart

In [None]:
# Pie chart of record counts per Satisfaction category.
pie = px.pie(
    df_scores,
    names="Satisfaction",
    values="Cnt",
    title=title,
)
# Put the text inside the slices and show label, percent and value on each.
pie.update_traces(
    textposition='inside',
    textinfo='label+percent+value',
)

## Pie chart - Satisfaction of returning customers

Let's create the same chart, but this time showing the satisfaction stats for returning customers only. We also want to use the `simple_white` template this time.

### Code

In [None]:
# Title for the returning-customers pie chart.
title = 'Satisfaction of returning customers'

# Keep only the rows whose Customer Type is "Returning"; the column name
# contains a space, so it is escaped with backticks inside query().
df_returning = df_scores.query(
    ' `Customer Type` == "Returning" '
)

df_returning

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Cleanliness,Food and Drink,Baggage Handling,Satisfaction,Cnt
1,2,Female,35,Returning,Business,Business,821,26,39.0,3,5,2,5,4,5,3,5,Satisfied,1
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,5,4,3,5,5,5,3,Satisfied,1
3,4,Male,50,Returning,Business,Business,1905,0,0.0,3,4,2,5,5,4,4,5,Satisfied,1
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,5,3,3,4,5,4,3,Satisfied,1
5,6,Male,43,Returning,Business,Business,3788,0,0.0,3,5,4,4,4,3,3,4,Satisfied,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19993,19994,Female,48,Returning,Business,Economy,589,121,124.0,1,3,3,2,4,1,1,2,Neutral or Dissatisfied,1
19995,19996,Female,45,Returning,Business,Economy,599,0,0.0,2,4,5,4,3,2,5,4,Satisfied,1
19996,19997,Female,37,Returning,Business,Economy,589,29,30.0,3,5,2,3,5,5,5,3,Satisfied,1
19998,19999,Male,53,Returning,Business,Business,2517,1,0.0,3,1,1,4,1,5,5,4,Satisfied,1


### Chart

In [None]:
# Pie chart of Satisfaction for returning customers, using simple_white.
pie2 = px.pie(
    df_returning,
    names="Satisfaction",
    values="Cnt",
    title=title,
    template='simple_white',
)
# Put the text inside the slices and show label, percent and value on each.
pie2.update_traces(
    textposition='inside',
    textinfo='label+percent+value',
)