In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

# Bar Plot

In [3]:
df = px.data.gapminder().query("country == 'Ireland'")
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
744,Ireland,Europe,1952,66.91,2952156,5210.280328,IRL,372
745,Ireland,Europe,1957,68.9,2878220,5599.077872,IRL,372
746,Ireland,Europe,1962,70.29,2830000,6631.597314,IRL,372
747,Ireland,Europe,1967,71.08,2900100,7655.568963,IRL,372
748,Ireland,Europe,1972,71.28,3024400,9530.772896,IRL,372


In [None]:
# x = groups that you are interested in; y = counts you want to display
fig = px.bar(df, x='year', y='pop') 
fig.show()

In [5]:
df = px.data.tips()
df = df.sample(n=50, random_state=1)
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
67,3.07,1.0,Female,Yes,Sat,Dinner,1
243,18.78,3.0,Female,No,Thur,Dinner,2
206,26.59,3.41,Male,Yes,Sat,Dinner,3


In [6]:
fig = px.bar(df, x="sex")
fig.show()

##### Note:
 Several rows share the same value of x (in this case, `sex` variable: female or male). The rectangles are stacked on top of one another by default.

You can prepare your data before plotting by using .value_counts() to count the occurrences of values in one variable, and then convert the result into a DataFrame with .to_frame(). You can then plot the data using `px.bar()`. This will eliminate the stacked rectangles.

In [21]:
df_processed = (df['sex']
                .value_counts()
                .to_frame()
                .reset_index()
                .set_index(['sex'])
)
df_processed

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
Male,28
Female,22


In [22]:
fig = px.bar(data_frame=df_processed, y="count")
fig.show()

In [23]:
fig = px.bar(df, x="sex", y='tip')
fig.show()

In [24]:
df_processed = df[['sex','tip']].groupby('sex').sum().reset_index()
df_processed

Unnamed: 0,sex,tip
0,Female,64.62
1,Male,95.66


In [25]:
fig = px.bar(data_frame=df_processed, x="sex", y='tip')
fig.show()

You may be interested in colouring your bar plot with another categorical variable. You can do that with the `color` argument

* The barmode allows you to stack them (`stack`) or leave them side by side (`group`)

In [26]:
fig = px.bar(df, x="sex",  color="smoker", barmode="stack")
fig.show()

In [27]:
fig = px.bar(df, x="sex", color="smoker", barmode="group")
fig.show()

In [28]:
# Using facet_row and facet_col
fig = px.bar(data_frame=df, x="sex", color="smoker", barmode="group",
             facet_row="time", facet_col="day")
fig.show()

You will notice the days of the week are not ordered conventionally.

* You can manually arrange the order for your categorical variables in your plot with `category_orders`; just parse a dictionary, where the `key` is the variable name, and the `values` are in the order you want.

In [29]:
fig = px.bar(data_frame=df, x="sex", color="smoker", barmode="group",
             facet_row="time", facet_col="day",
             category_orders={"day": ["Thur", "Fri", "Sat", "Sun"], "time": ["Lunch", "Dinner"]})

fig.show()

# Scatter Plot

In [30]:
df = pd.read_csv("insurance.csv")
df = df.sample(n=100, random_state=1)
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,charges
559,559,19,male,35.53,0,no,northwest,1646.4297
1087,1087,57,male,31.54,0,no,northwest,11353.2276
1020,1020,51,male,37.0,0,no,southwest,8798.593
460,460,49,female,36.63,3,no,southeast,10381.4787
802,802,21,male,22.3,1,no,southwest,2103.08


In [31]:
fig = px.scatter(df, x="age", y="charges", color="smoker")
fig.show()

In [32]:
fig = px.scatter(df, x="age", y="charges", color="smoker",
                 marginal_y="box", marginal_x="box") # Add marginal plots
fig.show()

In [34]:
# Use trenline to show overall bhavior e.g. variance, range, min and max, and change rate.
fig = px.scatter(df, x="age", y="charges", color="smoker",
                 trendline="ols") # Ordinary Least Squares regression
fig.show()

In [35]:
df = px.data.gapminder().head(500)
print(df.shape)
df.head()

(500, 8)


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4


You can transform your scatter plot to a bubble chart and add Animation

* When you set the argument `size`, you create a bubble chart, where each data point has its size related to a variable. In this case, we will use population
* Use the argument `animation_frame` to assign marks to animation frames. In this case, we animate over the years
* X and Y axis will GDP per capita and life expectancy
* The data points will be coloured by continent, and we will see the related country when we hover over them
* We set `size_max=55`, so we get bigger circles (there is no rule for 55, it is trial and error)
* The x-axis has a wide range; we can compare levels better over a large range of values when it is in log scale, so we add `log_x=True`
* We set in a list [min, max] the x and y range limits with range_x and range_y. You will find a suitable number for min and max using trial and error

In [36]:
fig = px.scatter(df, x="gdpPercap", y="lifeExp", 
                 animation_frame="year", 
                 size="pop",size_max=55, 
                 color="continent", hover_name="country",
                 log_x=True, range_x=[100,100000], range_y=[25,90]
            )

fig.show()

# Scatter Plot 3D

In [37]:
df = pd.read_csv("insurance.csv")
df = df.sample(n=100, random_state=1)
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,charges
559,559,19,male,35.53,0,no,northwest,1646.4297
1087,1087,57,male,31.54,0,no,northwest,11353.2276
1020,1020,51,male,37.0,0,no,southwest,8798.593
460,460,49,female,36.63,3,no,southeast,10381.4787
802,802,21,male,22.3,1,no,southwest,2103.08


1. When the **colour variable is categorical**, Plotly uses a **discrete colour sequence**. You can control the colours with the `color_discrete_sequence` argument, and explore available options using `px.colors.qualitative.swatches()`.

2. When the **colour variable is continuous**, Plotly uses a **colour gradient (continuous scale) instead of discrete colours**. You can control this with `the color_continuous_scale` argument, and explore available scales with `px.colors.sequential.swatches()`.

In [38]:
px.colors.qualitative.swatches()

In [39]:
fig = px.scatter_3d(df, x="age", y="bmi", z="charges",
                    color='smoker',
                    color_discrete_sequence=px.colors.qualitative.Vivid)
fig.show()

In [40]:
df = pd.read_csv("UKMacroData.csv")
df.head()

Unnamed: 0,Date,GDP (£ m),CPI,Bank Rate,Gross Fixed Capital Formation (Investments)
0,2000 Q1,401242,1.1,5.875,69114.0
1,2000 Q2,404196,1.0,6.0,73074.0
2,2000 Q3,406795,1.2,6.0,68011.0
3,2000 Q4,409411,1.4,6.0,70115.0
4,2001 Q1,413054,1.3,5.75,70186.0


We will use a continuous variable to colour the plot: ``Bank Rate``

In [41]:
px.colors.sequential.swatches()

In [42]:
fig = px.scatter_3d(df, x='Gross Fixed Capital Formation (Investments)', y='CPI', z='GDP (£ m)',
                    color='Bank Rate',color_continuous_scale='Bluyl')
fig.show()

# Scatter Matrix

The plot duplicates information in the upper and lower triangles, but its advantage is the ability to interactively select data points in one plot and see them highlighted across all related plots.

In [50]:
df_practice = pd.read_csv("insurance.csv")
df_practice = df_practice.sample(n=100, random_state=1)
df_practice = df_practice.drop(columns=['Unnamed: 0'])
df_practice.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
559,19,male,35.53,0,no,northwest,1646.4297
1087,57,male,31.54,0,no,northwest,11353.2276
1020,51,male,37.0,0,no,southwest,8798.593
460,49,female,36.63,3,no,southeast,10381.4787
802,21,male,22.3,1,no,southwest,2103.08


In [44]:
fig = px.scatter_matrix(df_practice,
                        dimensions=['age', 'bmi', 'children', 'charges'],
                        color='region')
fig.show()

In [45]:
# write your code here
fig = px.scatter_matrix(df_practice,
                        dimensions=['age', 'bmi', 'children', 'charges'],
                        color='sex')
fig.show()

# Parallel Plots

A Parallel Coordinates plot represents each observation as a line across parallel axes (numerical variables) and is useful for identifying patterns and relationships between those variables.

In [51]:
df = pd.read_csv("insurance.csv")
df = df.sample(n=50, random_state=1)
df = df.drop(columns=['Unnamed: 0'])
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
559,19,male,35.53,0,no,northwest,1646.4297
1087,57,male,31.54,0,no,northwest,11353.2276
1020,51,male,37.0,0,no,southwest,8798.593


In [52]:
# We convert categorical to numerical with .replace()
df['smoker'] = df['smoker'].replace({'no':0, 'yes':1})
df['sex'] = df['sex'].replace({'male':0, 'female':1})
df['region']= df['region'].replace({'northwest':0, 'northeast':1, 'southwest':2, 'southeast':3})
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
559,19,0,35.53,0,0,0,1646.4297
1087,57,0,31.54,0,0,0,11353.2276
1020,51,0,37.0,0,0,2,8798.593


In [53]:
fig = px.parallel_coordinates(df, color="smoker",
                              dimensions = ['age','sex','bmi',
                                            'children',	'region','charges'],
                              color_continuous_scale='viridis')
fig.show()

For datasets with mainly **categorical variables**, ``px.parallel_categories()`` is more effective since it avoids converting categories to numbers. However, **the color variable must be numerical**, so categorical ones need conversion first.

In [54]:
df = pd.read_csv("BankChurners.csv").drop(['CLIENTNUM'],axis=1).sample(n=50, random_state=5)
df['Attrition_Flag'] = df['Attrition_Flag'].replace({'Existing Customer':0, 'Attrited Customer':1})
print(df.shape)
df.head(3)

(50, 20)


Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
1646,0,36,F,3,Unknown,Married,Less than $40K,Blue,36,6,5,3,2786.0,1628,1158.0,1.314,2853,55,0.667,0.584
7880,0,42,M,2,Uneducated,Married,$60K - $80K,Blue,30,1,2,3,3086.0,0,3086.0,0.808,4129,81,0.884,0.0
7586,1,38,F,1,High School,Married,Less than $40K,Blue,28,3,3,3,4196.0,731,3465.0,0.485,1868,30,0.2,0.174


In [None]:
fig = px.parallel_categories(df,dimensions=['Attrition_Flag','Gender','Gender', 'Dependent_count',
                                            'Education_Level',
                                            'Marital_Status','Income_Category', 'Card_Category','Total_Relationship_Count', 
                                            'Months_Inactive_12_mon','Contacts_Count_12_mon'], 
                             color="Attrition_Flag", color_continuous_scale='viridis')
fig.show()

In [56]:
df_practice = px.data.gapminder()
df_practice = df_practice.sample(n=100, random_state=1).reset_index(drop=True)
df_practice.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Indonesia,Asia,1952,37.468,82052000,749.681655,IDN,360
1,Sierra Leone,Africa,1997,39.897,4578212,574.648158,SLE,694
2,Equatorial Guinea,Africa,2007,51.579,551201,12154.08975,GNQ,226
3,India,Asia,1987,58.553,788000000,976.512676,IND,356
4,Rwanda,Africa,1962,43.0,3051242,597.473073,RWA,646


In [57]:
continent_map = {'Asia':0, 'Europe':1, 'Africa':2,'Americas':3, 'Oceania':4} # Mapping continents to numbers
df_practice['continent'] = df_practice['continent'].replace(continent_map)
df_practice.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Indonesia,0,1952,37.468,82052000,749.681655,IDN,360
1,Sierra Leone,2,1997,39.897,4578212,574.648158,SLE,694
2,Equatorial Guinea,2,2007,51.579,551201,12154.08975,GNQ,226
3,India,0,1987,58.553,788000000,976.512676,IND,356
4,Rwanda,2,1962,43.0,3051242,597.473073,RWA,646


In [58]:
continent_map

{'Asia': 0, 'Europe': 1, 'Africa': 2, 'Americas': 3, 'Oceania': 4}

In [62]:
fig = px.parallel_categories(df_practice,
                             dimensions=['continent', 'lifeExp', 'pop', 'gdpPercap'],
                             color='continent',
                             color_continuous_scale='tealgrn'
                             )
fig.show()