In [70]:
import pandas as pd
import altair as alt

In [71]:
dinos = pd.read_csv("dinosaurs.csv")
dinos.describe()

Unnamed: 0,name,diet,period,lived_in,type,length,taxonomy,named_by,species,link
count,309,309,308,309,308,292,309,309,304,309
unique,309,5,153,32,6,73,102,277,272,309
top,aardonyx,herbivorous,Late Cretaceous 76-74 million years ago,USA,sauropod,6.0m,Dinosauria Ornithischia Genasauria Cerapoda Or...,Marsh (1877),mongoliensis,https://www.nhm.ac.uk/discover/dino-directory/...
freq,1,185,14,78,69,21,11,4,7,1


In [72]:
dinos["length"] = [(float(s[:-1]) if ('m' in str(s)) else None) for s in dinos["length"]]

In [73]:
dinos["time_from"] = [(int(s.split()[-4].split("-")[0]) if (s and ("million years ago" in str(s))) else None) for s in dinos["period"]]
dinos["time_to"] = [(int(s.split()[2].split("-")[-1]) if (s and ("million years ago" in str(s))) else None) for s in dinos["period"]]
dinos["Order"] = ["".join(s.split()[1:2]) or "" for s in dinos["taxonomy"]]
dinos["Family"] =dinos["Order"] +":"+ ["".join(s.split()[2:3]) or "Other" for s in dinos["taxonomy"]]
# dinos = dinos[dinos["Order"]!=":Other"][not(dinos["time_from"] is None)]
dinos = dinos.dropna()
dinos[['time_from', 'time_to', "Order", "Family"]].head()


Unnamed: 0,time_from,time_to,Order,Family
0,199.0,189.0,Saurischia,Saurischia:Sauropodomorpha
1,74.0,70.0,Saurischia,Saurischia:Theropoda
2,83.0,70.0,Ornithischia,Ornithischia:Genasauria
3,99.0,84.0,Saurischia,Saurischia:Theropoda
4,115.0,105.0,Saurischia,Saurischia:Theropoda


In [74]:
periods = pd.DataFrame({"period":['Triassic', 'Jurassic', 'Cretaceous'], 'start':[232, 201, 145], 'end':[201, 145, 65]})
periods

Unnamed: 0,period,start,end
0,Triassic,232,201
1,Jurassic,201,145
2,Cretaceous,145,65


In [104]:
# General plot: existance intervals for each species

points1 = alt.Chart().mark_point().encode(
    x = "time_from:Q",
)
points2 = alt.Chart().mark_point().encode(
    x = "time_to:Q",
)
lines = alt.Chart().mark_line().encode(
    x = alt.X("time_from:Q", title='Time, millions years ago'),
    x2 = "time_to:Q",
)

len_time_chart = (points1 + points2 + lines).encode(
        y=alt.Y("length_j:Q"),
    tooltip = [alt.Tooltip("name:N"),alt.Tooltip("period:N")],
    color = alt.Color("type:N",
            scale=alt.Scale(scheme='dark2')
            )
)

period_rects = alt.Chart(periods).mark_rect(opacity=0.1).encode(
    x2=alt.X2('start:Q'),
    x=alt.X('end:Q', title=None),
    color=alt.Color('period:N',
    #  legend=False
        scale=alt.Scale(scheme='set1')
     )
)
alt.layer(period_rects, len_time_chart, data=dinos
).transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    length_j='datum.length + abs(sqrt(-0.5*log(random()))*cos(2*PI*random()))'
).configure_scale(
    xReverse=True
).resolve_scale(
    color='independent'
).configure_line(

).properties(
    width = 1100, height=500
)


In [106]:
# How big were they? length distriution by main families; Sauropodomorpha are the biggest; Therapoeas are mostly small with few large exceptions

alt.Chart(dinos).transform_density(
    density='length',
    groupby=['Family'],
    counts=True
    # extent= [0, 8]
).mark_line().encode(
    x = alt.X("value:Q", title="Length, m"),
    y = alt.Y("density:Q"),
    color=alt.Color("Family:N")
).properties(width = 1100, height=500)

In [87]:
dinos_expanded = pd.concat([pd.DataFrame({'time': [*range(round(row.time_to), round(row.time_from)+1)],
            'taxonomy': row.taxonomy,
            'length':row.length,
            'type':row.type,
            'Family':row.Family
            }, columns=['time', 'taxonomy', 'length', 'type', "Family"]) 
        for i, row in dinos.iterrows()], ignore_index=True)
dinos_expanded["Genus"] = ["".join(tax.split(" ")[3:4]) or "" for tax in dinos_expanded["taxonomy"]]
# pd.concat([pd.DataFrame({'time': row.time,
#             'taxonomy': row.taxonomy,
#             }, columns=['time', 'taxonomy']) 
#         for i, row in dinos.iterrows()], ignore_index=True)
# dinos_tax
dinos_expanded

Unnamed: 0,time,taxonomy,length,type,Family,Genus
0,189.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,8.0,sauropod,Saurischia:Sauropodomorpha,Prosauropoda
1,190.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,8.0,sauropod,Saurischia:Sauropodomorpha,Prosauropoda
2,191.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,8.0,sauropod,Saurischia:Sauropodomorpha,Prosauropoda
3,192.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,8.0,sauropod,Saurischia:Sauropodomorpha,Prosauropoda
4,193.0,Dinosauria Saurischia Sauropodomorpha Prosauro...,8.0,sauropod,Saurischia:Sauropodomorpha,Prosauropoda
...,...,...,...,...,...,...
2780,116.0,Dinosauria Ornithischia Genasauria Cerapoda Or...,1.8,euornithopod,Ornithischia:Genasauria,Cerapoda
2781,117.0,Dinosauria Ornithischia Genasauria Cerapoda Or...,1.8,euornithopod,Ornithischia:Genasauria,Cerapoda
2782,118.0,Dinosauria Ornithischia Genasauria Cerapoda Or...,1.8,euornithopod,Ornithischia:Genasauria,Cerapoda
2783,119.0,Dinosauria Ornithischia Genasauria Cerapoda Or...,1.8,euornithopod,Ornithischia:Genasauria,Cerapoda


In [107]:
chart_counts = alt.Chart().transform_density(
    'time',
    counts=True,
    # extent=[60, 240],
    groupby=['type']
).mark_bar(orient='vertical', opacity=0.7).encode(
    x=alt.X('value:Q', title='Time, millions years ago'),
    color=alt.Color('type:N',
                    scale=alt.Scale(scheme='dark2')
                    ),
    y=alt.Y(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=None,
    ),
)

alt.layer(
    chart_counts, period_rects, data=dinos_expanded
).properties(
    width=1100, height=70
).facet(
    row=alt.Row(
        'type:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ))
).configure_facet(
    spacing=0
).configure_scale(
    xReverse=True
)


In [101]:
chart_counts = alt.Chart().transform_density(
    'time',
    counts=True,
    # extent=[60, 240],
    groupby=['Genus']
).mark_bar(orient='vertical', opacity=0.7).encode(
    x=alt.X('value:Q'),
    color=alt.Color('Genus:N',
                    scale=alt.Scale(scheme='dark2')
                    ),
    y=alt.Y(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=None,
    ),
)

alt.layer(
    chart_counts, period_rects, data=dinos_expanded
).properties(
    width=1100, height=70
).facet(
    row=alt.Row(
        'Genus:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ))
).configure_facet(
    spacing=0
).configure_scale(
    xReverse=True
)

In [79]:
# How size of different dinosour groups changed over time? Steady growth of Sauropods is seen with peak near beginning and middle of Cretaceous period
chart_lt = alt.Chart(dinos_expanded).mark_line(interpolate="monotone", point=True).encode(
    x=alt.X('time:Q',bin=alt.Bin(step=5), title='Time, millions years ago'),
    color=alt.Color('Family:N'),
    y=alt.Y(
        'mean(length):Q',
        title='Average length',
    ),

)
(chart_lt+period_rects).resolve_scale(
    color='independent'
).configure_scale(
    xReverse=True
).properties(width = 1100, height=500)