# Ibis Basics
Author: Mark Bauer

# Introduction
This tutorial was inspired by my recent but intense interest in composable SQL and Python dataframe code, and of course, Ibis. Learning how well DuckDB integrates with Ibis was even more of a reason to delve deeper into this portable Python dataframe library. And lastly, anything that Wes McKinney is passionate about is probably worth learning.

**Resources**  
Tutorials from Ibis contributors served as great starting points. Much of what I know about Ibis came from either the official docs or these tutorials:
- Official Docs Tutorial: getting started: https://ibis-project.org/tutorials/getting_started
- Ibis Tutorial PyCon 2024: https://github.com/ibis-project/ibis-tutorial
- A repository of runnable examples using ibis: https://github.com/ibis-project/ibis-examples

**Ibis Documentation**  
Always start at the Official Docs: https://ibis-project.org/

In [1]:
import ibis

In [2]:
# record the Python and ibis versions for reproducibility
%reload_ext watermark
%watermark -v -p ibis

Python implementation: CPython
Python version       : 3.8.13
IPython version      : 8.4.0

ibis: 3.2.0



In [3]:
# list items in data directory
%ls data/

raw/                  storm-events.db       storm-events.parquet


In [4]:
# we will use Ibis interactive mode
ibis.options.interactive = True

# Connect to a database. I'm using DuckDB.

In [5]:
# reconnect to the persisted database called storm-events
con = ibis.duckdb.connect("data/storm-events.db")

type(con)

ibis.backends.duckdb.Backend

In [6]:
# list tables in .db file
con.list_tables()

['storm_events']

In [7]:
# create a table expression and assign to storm_events alias
storm_events = con.table("storm_events")

type(storm_events)



ibis.expr.types.relations.Table

Note: In interactive mode, Ibis limits the table preview to the first 10,000 rows by default.

# Familiarize yourself with the Ibis table

In [8]:
# get table schema
storm_events.schema()

ibis.Schema {
  EVENT_ID                   int32
  STATE                      string
  STATE_FIPS                 int32
  YEAR                       int32
  MONTH_NAME                 string
  EVENT_TYPE                 string
  CZ_TYPE                    string
  CZ_FIPS                    int32
  CZ_NAME                    string
  WFO                        string
  BEGIN_DATE_TIME            string
  END_DATE_TIME              string
  INJURIES_DIRECT            int32
  INJURIES_INDIRECT          int32
  DEATHS_DIRECT              int32
  DEATHS_INDIRECT            int32
  DAMAGE_PROPERTY            string
  DAMAGE_CROPS               string
  SOURCE                     string
  DAMAGE_PROPERTY_CONVERTED  float64
}

In [9]:
# summarize table
storm_events.info()

                         Summary of storm_events
                               1295193 rows
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┓
┃ Name                      ┃ Type                   ┃ # Nulls ┃ % Nulls ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━┩
│ EVENT_ID                  │ Int32(nullable=True)   │       0 │    0.00 │
│ STATE                     │ String(nullable=True)  │       0 │    0.00 │
│ STATE_FIPS                │ Int32(nullable=True)   │       0 │    0.00 │
│ YEAR                      │ Int32(nullable=True)   │       0 │    0.00 │
│ MONTH_NAME             

In [10]:
# columns attribute
storm_events.columns

['EVENT_ID',
 'STATE',
 'STATE_FIPS',
 'YEAR',
 'MONTH_NAME',
 'EVENT_TYPE',
 'CZ_TYPE',
 'CZ_FIPS',
 'CZ_NAME',
 'WFO',
 'BEGIN_DATE_TIME',
 'END_DATE_TIME',
 'INJURIES_DIRECT',
 'INJURIES_INDIRECT',
 'DEATHS_DIRECT',
 'DEATHS_INDIRECT',
 'DAMAGE_PROPERTY',
 'DAMAGE_CROPS',
 'SOURCE',
 'DAMAGE_PROPERTY_CONVERTED']

# Examine Ibis types

In [11]:
type(storm_events.EVENT_TYPE)

ibis.expr.types.strings.StringColumn

In [12]:
type(storm_events.YEAR)

ibis.expr.types.numeric.IntegerColumn

In [13]:
type(storm_events.DAMAGE_PROPERTY_CONVERTED)

ibis.expr.types.numeric.FloatingColumn

In [14]:
type(storm_events.DAMAGE_PROPERTY_CONVERTED > 100_000)

ibis.expr.types.logical.BooleanColumn

In [15]:
type(storm_events.select('YEAR'))

ibis.expr.types.relations.Table

In [16]:
type(storm_events.select('YEAR').YEAR)

ibis.expr.types.numeric.IntegerColumn

In [17]:
type(storm_events.select('YEAR', 'EVENT_TYPE'))

ibis.expr.types.relations.Table

# Preview data

In [18]:
# preview first five rows
storm_events.head()

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,28-APR-50 14:45:00,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,29-APR-50 15:30:00,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,05-JUL-50 18:00:00,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,05-JUL-50 18:30:00,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,24-JUL-50 14:40:00,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0


In [19]:
type(storm_events.head())

ibis.expr.types.relations.Table

In [20]:
# transform to pandas df
storm_events.head().execute()

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,28-APR-50 14:45:00,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,29-APR-50 15:30:00,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,05-JUL-50 18:00:00,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,05-JUL-50 18:30:00,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,24-JUL-50 14:40:00,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0


In [21]:
type(storm_events.head().execute())

pandas.core.frame.DataFrame

In [22]:
# number of rows
storm_events.count()

1295193

In [23]:
# number of columns
len(storm_events.schema())

20

In [24]:
# row, columns - similar to pandas df.shape attribute
(storm_events.count().execute(), len(storm_events.schema()))

(1295193, 20)

In [25]:
# limit number of rows
storm_events.limit(5)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,28-APR-50 14:45:00,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,29-APR-50 15:30:00,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,05-JUL-50 18:00:00,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,05-JUL-50 18:30:00,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,24-JUL-50 14:40:00,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0


# Methods
## Select

In [26]:
# select columns
storm_events.select(
    "YEAR",
    "STATE",
    "EVENT_TYPE",
    "DAMAGE_PROPERTY_CONVERTED"
)

Unnamed: 0,YEAR,STATE,EVENT_TYPE,DAMAGE_PROPERTY_CONVERTED
0,1950,OKLAHOMA,Tornado,250000.0
1,1950,TEXAS,Tornado,25000.0
2,1950,PENNSYLVANIA,Tornado,25000.0
3,1950,PENNSYLVANIA,Tornado,2500.0
4,1950,PENNSYLVANIA,Tornado,2500.0
...,...,...,...,...
9995,1956,OKLAHOMA,Thunderstorm Wind,0.0
9996,1956,OKLAHOMA,Thunderstorm Wind,0.0
9997,1956,OKLAHOMA,Thunderstorm Wind,0.0
9998,1956,OKLAHOMA,Tornado,250000.0


## Drop


In [27]:
# columns to drop
cols = [
    'STATE_FIPS',
    'MONTH_NAME',
    'EVENT_TYPE',
    'CZ_TYPE',
    'CZ_FIPS',
    'CZ_NAME',
    'WFO',
    'BEGIN_DATE_TIME',
    'END_DATE_TIME',
    'INJURIES_DIRECT',
    'INJURIES_INDIRECT',
    'DEATHS_DIRECT',
    'DEATHS_INDIRECT',
    'DAMAGE_PROPERTY',
    'DAMAGE_CROPS',
    'SOURCE',
]

storm_events.drop(cols)

Unnamed: 0,EVENT_ID,STATE,YEAR,DAMAGE_PROPERTY_CONVERTED
0,10096222,OKLAHOMA,1950,250000.0
1,10120412,TEXAS,1950,25000.0
2,10104927,PENNSYLVANIA,1950,25000.0
3,10104928,PENNSYLVANIA,1950,2500.0
4,10104929,PENNSYLVANIA,1950,2500.0
...,...,...,...,...
9995,10093724,OKLAHOMA,1956,0.0
9996,10093725,OKLAHOMA,1956,0.0
9997,10093726,OKLAHOMA,1956,0.0
9998,10093727,OKLAHOMA,1956,250000.0


## Filter

In [28]:
# filter to events where STATE is New York
(storm_events
 .filter(storm_events.STATE == "NEW YORK")
 .limit(10)
)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED
0,10075659,NEW YORK,36,1952,May,Tornado,C,31,ESSEX,,06-MAY-52 13:00:00,06-MAY-52 13:00:00,0,0,0,0,25K,0,,25000.0
1,10075660,NEW YORK,36,1954,July,Tornado,C,123,YATES,,14-JUL-54 16:30:00,14-JUL-54 16:30:00,0,0,0,0,2.5M,0,,2500000.0
2,10075661,NEW YORK,36,1954,September,Tornado,C,89,ST. LAWRENCE,,19-SEP-54 17:30:00,19-SEP-54 17:30:00,0,0,0,0,25K,0,,25000.0
3,10075701,NEW YORK,36,1958,June,Thunderstorm Wind,C,29,ERIE,,25-JUN-58 18:30:00,25-JUN-58 18:30:00,0,0,0,0,0,0,,0.0
4,10075702,NEW YORK,36,1958,June,Thunderstorm Wind,C,29,ERIE,,25-JUN-58 18:30:00,25-JUN-58 18:30:00,0,0,0,0,0,0,,0.0
5,10075703,NEW YORK,36,1958,June,Thunderstorm Wind,C,55,MONROE,,25-JUN-58 19:00:00,25-JUN-58 19:00:00,0,0,0,0,0,0,,0.0
6,10075704,NEW YORK,36,1958,June,Thunderstorm Wind,C,51,LIVINGSTON,,25-JUN-58 19:00:00,25-JUN-58 19:00:00,0,0,0,0,0,0,,0.0
7,10075697,NEW YORK,36,1958,May,Thunderstorm Wind,C,17,CHENANGO,,18-MAY-58 14:00:00,18-MAY-58 14:00:00,0,0,0,0,0,0,,0.0
8,10075698,NEW YORK,36,1958,May,Thunderstorm Wind,C,11,CAYUGA,,18-MAY-58 14:15:00,18-MAY-58 14:15:00,0,0,0,0,0,0,,0.0
9,10075699,NEW YORK,36,1958,May,Hail,C,117,WAYNE,,18-MAY-58 14:30:00,18-MAY-58 14:30:00,0,0,0,0,0,0,,0.0


In [29]:
storm_events.filter(
    (storm_events.STATE == "NEW YORK")
    & (storm_events.DAMAGE_PROPERTY_CONVERTED > 90_000_000)
)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,BEGIN_DATE_TIME,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED
0,5516309,NEW YORK,36,2006,June,Flash Flood,C,107,TIOGA,BGM,27-JUN-06 14:10:00,28-JUN-06 14:00:00,0,0,0,0,100M,,EMERGENCY MANAGER,100000000.0
1,5519036,NEW YORK,36,2006,June,Flash Flood,C,105,SULLIVAN,BGM,27-JUN-06 04:00:00,28-JUN-06 14:00:00,0,0,1,0,100M,,EMERGENCY MANAGER,100000000.0
2,5519040,NEW YORK,36,2006,June,Flash Flood,C,7,BROOME,BGM,27-JUN-06 16:00:00,28-JUN-06 14:00:00,0,0,0,0,200M,,EMERGENCY MANAGER,200000000.0
3,5519034,NEW YORK,36,2006,June,Flash Flood,C,25,DELAWARE,BGM,27-JUN-06 08:10:00,28-JUN-06 13:00:00,0,0,2,0,250M,,LAW ENFORCEMENT,250000000.0
4,1266,NEW YORK,36,2006,October,Lake-Effect Snow,Z,10,NORTHERN ERIE,BUF,12-OCT-06 14:30:00,13-OCT-06 07:00:00,0,0,1,0,130.00M,0.00K,Law Enforcement,130000000.0
5,206970,NEW YORK,36,2009,December,High Wind,Z,7,JEFFERSON,BUF,09-DEC-09 13:00:00,10-DEC-09 09:00:00,0,0,0,0,100.00M,0.00K,Utility Company,100000000.0
6,348760,NEW YORK,36,2011,September,Flood,C,107,TIOGA,BGM,07-SEP-11 16:54:00,11-SEP-11 03:00:00,0,0,0,0,309.00M,0.00K,River/Stream Gage,309000000.0
7,348756,NEW YORK,36,2011,September,Flood,C,7,BROOME,BGM,07-SEP-11 14:23:00,11-SEP-11 03:45:00,0,0,0,0,170.00M,0.00K,River/Stream Gage,170000000.0
8,349312,NEW YORK,36,2011,September,Flood,C,7,BROOME,BGM,07-SEP-11 12:22:00,11-SEP-11 23:40:00,0,0,0,0,160.00M,0.00K,River/Stream Gage,160000000.0
9,995048,NEW YORK,36,2021,September,Flash Flood,C,119,WESTCHESTER,OKX,01-SEP-21 21:30:00,01-SEP-21 22:00:00,0,0,0,0,92.60M,0.00K,Emergency Manager,92600000.0


In [30]:
# chain select and filter: keep five columns, then keep only the New York
# rows whose DAMAGE_PROPERTY_CONVERTED is greater than 90,000,000
damage_columns = [
    "STATE",
    "EVENT_TYPE",
    "YEAR",
    "DAMAGE_PROPERTY",
    "DAMAGE_PROPERTY_CONVERTED",
]
is_new_york = storm_events["STATE"] == "NEW YORK"
over_90_million = storm_events.DAMAGE_PROPERTY_CONVERTED > 90_000_000

new_york = storm_events.select(damage_columns).filter(is_new_york & over_90_million)

new_york

Unnamed: 0,STATE,EVENT_TYPE,YEAR,DAMAGE_PROPERTY,DAMAGE_PROPERTY_CONVERTED
0,NEW YORK,Flash Flood,2006,100M,100000000.0
1,NEW YORK,Flash Flood,2006,100M,100000000.0
2,NEW YORK,Flash Flood,2006,200M,200000000.0
3,NEW YORK,Flash Flood,2006,250M,250000000.0
4,NEW YORK,Lake-Effect Snow,2006,130.00M,130000000.0
5,NEW YORK,High Wind,2009,100.00M,100000000.0
6,NEW YORK,Flood,2011,309.00M,309000000.0
7,NEW YORK,Flood,2011,170.00M,170000000.0
8,NEW YORK,Flood,2011,160.00M,160000000.0
9,NEW YORK,Flash Flood,2021,92.60M,92600000.0


## Mutate
Columns are added with the `mutate` method. This section also shows other ways to change columns, such as renaming them.

In [31]:
# add column
conversion = storm_events.DAMAGE_PROPERTY_CONVERTED / 1000

storm_events.mutate(DAMAGE_PROPERTY_CONVERTED_K = conversion)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_K
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,...,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0,250.0
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,...,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0,25.0
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,...,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0,25.0
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,...,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0,2.5
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,...,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10093724,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,103,NOBLE,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,0.0
9996,10093725,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,65,JACKSON,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,0.0
9997,10093726,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,83,LOGAN,,...,08-APR-56 23:02:00,0,0,0,0,0,0,,0.0,0.0
9998,10093727,OKLAHOMA,40,1956,April,Tornado,C,119,PAYNE,,...,08-APR-56 23:30:00,0,0,0,0,250K,0,,250000.0,250.0


In [32]:
# add a "$" to the beginning of the damage field
# cast the float to an integer first, then to a string, so the text has
# no decimal part (e.g. 250000.0 becomes "250000")
whole_amount = storm_events.DAMAGE_PROPERTY_CONVERTED.cast("int")
cast_col = whole_amount.cast("str")
dollars = ("$" + cast_col).name("DAMAGE_PROPERTY_CONVERTED_DOLLARS")

# pass the named expression "dollars" to the mutate function to create a new column
# NOTE: this rebinds storm_events, so every later cell sees the extra column
storm_events = storm_events.mutate(dollars)

storm_events

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,...,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0,$250000
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,...,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0,$25000
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,...,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0,$25000
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,...,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0,$2500
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,...,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0,$2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10093724,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,103,NOBLE,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,$0
9996,10093725,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,65,JACKSON,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,$0
9997,10093726,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,83,LOGAN,,...,08-APR-56 23:02:00,0,0,0,0,0,0,,0.0,$0
9998,10093727,OKLAHOMA,40,1956,April,Tornado,C,119,PAYNE,,...,08-APR-56 23:30:00,0,0,0,0,250K,0,,250000.0,$250000


In [33]:
# rename columns: relabel takes a mapping of current name -> new name
temp_table = storm_events.select("EVENT_TYPE", "DAMAGE_PROPERTY_CONVERTED")

new_names = {"EVENT_TYPE": "aaa", "DAMAGE_PROPERTY_CONVERTED": "bbb"}
renamed_table = temp_table.relabel(new_names)

renamed_table

Unnamed: 0,aaa,bbb
0,Tornado,250000.0
1,Tornado,25000.0
2,Tornado,25000.0
3,Tornado,2500.0
4,Tornado,2500.0
...,...,...
9995,Thunderstorm Wind,0.0
9996,Thunderstorm Wind,0.0
9997,Thunderstorm Wind,0.0
9998,Tornado,250000.0


In [34]:
# sanity check
renamed_table.schema()

ibis.Schema {
  aaa  string
  bbb  float64
}

In [35]:
# convert column names to lowercase (source: Ian Cook on Stack Overflow)
# build a mapping of current name -> lowercase name and hand it to relabel
lowercase_names = {name: name.lower() for name in storm_events.columns}
table_cols_lower = storm_events.relabel(lowercase_names)

table_cols_lower

Unnamed: 0,event_id,state,state_fips,year,month_name,event_type,cz_type,cz_fips,cz_name,wfo,...,end_date_time,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property,damage_crops,source,damage_property_converted,damage_property_converted_dollars
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,...,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0,$250000
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,...,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0,$25000
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,...,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0,$25000
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,...,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0,$2500
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,...,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0,$2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10093724,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,103,NOBLE,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,$0
9996,10093725,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,65,JACKSON,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,$0
9997,10093726,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,83,LOGAN,,...,08-APR-56 23:02:00,0,0,0,0,0,0,,0.0,$0
9998,10093727,OKLAHOMA,40,1956,April,Tornado,C,119,PAYNE,,...,08-APR-56 23:30:00,0,0,0,0,250K,0,,250000.0,$250000


## Order By

In [36]:
# order by DAMAGE_PROPERTY_CONVERTED in ascending order
(storm_events
 .sort_by(storm_events.DAMAGE_PROPERTY_CONVERTED)
 .select(
    "YEAR",
    "STATE",
    "EVENT_TYPE",
    "DAMAGE_PROPERTY_CONVERTED")
 .limit(10)
)

Unnamed: 0,YEAR,STATE,EVENT_TYPE,DAMAGE_PROPERTY_CONVERTED
0,2007,ALABAMA,Hail,0.0
1,2007,TEXAS,Heavy Rain,0.0
2,2007,KENTUCKY,Drought,0.0
3,2007,PENNSYLVANIA,Coastal Flood,0.0
4,2007,PENNSYLVANIA,Coastal Flood,0.0
5,2007,MONTANA,Thunderstorm Wind,0.0
6,2007,MONTANA,Thunderstorm Wind,0.0
7,2007,MONTANA,Hail,0.0
8,2007,MONTANA,Hail,0.0
9,2007,COLORADO,Hail,0.0


In [37]:
# order by DAMAGE_PROPERTY_CONVERTED in descending order
(storm_events
 .sort_by(ibis.desc("DAMAGE_PROPERTY_CONVERTED"))
 .select(
    "YEAR",
    "STATE",
    "EVENT_TYPE",
    "DAMAGE_PROPERTY_CONVERTED")
 .limit(10)
)

Unnamed: 0,YEAR,STATE,EVENT_TYPE,DAMAGE_PROPERTY_CONVERTED
0,2020,WASHINGTON,High Wind,950000000.0
1,2013,ILLINOIS,Tornado,910000000.0
2,2012,TEXAS,Hail,900000000.0
3,2020,LOUISIANA,Hurricane,900000000.0
4,2010,ARIZONA,Hail,900000000.0
5,2004,FLORIDA,High Wind,881000000.0
6,2005,LOUISIANA,Hurricane (Typhoon),850000000.0
7,2005,LOUISIANA,Hurricane (Typhoon),850000000.0
8,2020,LOUISIANA,Hurricane,800000000.0
9,2017,PUERTO RICO,Flash Flood,750000000.0


In [38]:
# order by DAMAGE_PROPERTY_CONVERTED and YEAR both in descending order
(storm_events
 .sort_by([
     ibis.desc("DAMAGE_PROPERTY_CONVERTED"),
     ibis.desc("YEAR")])
 .select(
    "YEAR",
    "STATE",
    "EVENT_TYPE",
    "DAMAGE_PROPERTY_CONVERTED")
 .limit(10)
)

Unnamed: 0,YEAR,STATE,EVENT_TYPE,DAMAGE_PROPERTY_CONVERTED
0,2020,WASHINGTON,High Wind,950000000.0
1,2013,ILLINOIS,Tornado,910000000.0
2,2020,LOUISIANA,Hurricane,900000000.0
3,2012,TEXAS,Hail,900000000.0
4,2010,ARIZONA,Hail,900000000.0
5,2004,FLORIDA,High Wind,881000000.0
6,2005,LOUISIANA,Hurricane (Typhoon),850000000.0
7,2005,LOUISIANA,Hurricane (Typhoon),850000000.0
8,2020,LOUISIANA,Hurricane,800000000.0
9,2021,LOUISIANA,Storm Surge/Tide,750000000.0


## Aggregate

In [39]:
# count rows
storm_events.count()

1295193

In [40]:
storm_events.DAMAGE_PROPERTY_CONVERTED.min()

0.0

In [41]:
storm_events.DAMAGE_PROPERTY_CONVERTED.max()

950000000.0

In [42]:
storm_events.DAMAGE_PROPERTY_CONVERTED.mean()

222760.43361035068

In [43]:
storm_events.DAMAGE_PROPERTY_CONVERTED.sum()

288517754289.09094

In [44]:
# mean and max of DAMAGE_PROPERTY_CONVERTED column
storm_events.aggregate([
    storm_events.DAMAGE_PROPERTY_CONVERTED.mean(),
    storm_events.DAMAGE_PROPERTY_CONVERTED.max()
])

Unnamed: 0,mean,max
0,222760.43361,950000000.0


## Group By

In [45]:
type(storm_events.group_by("EVENT_TYPE"))

ibis.expr.types.groupby.GroupedTable

In [46]:
# sum of DAMAGE_PROPERTY_CONVERTED by EVENT_TYPE and YEAR, named DAMAGE_PROPERTY
(storm_events
 .group_by(["EVENT_TYPE", "YEAR"])
 .aggregate(storm_events.DAMAGE_PROPERTY_CONVERTED.sum().name("DAMAGE_PROPERTY"))
 .sort_by(ibis.desc("DAMAGE_PROPERTY"))
 .limit(10)
)

Unnamed: 0,EVENT_TYPE,YEAR,DAMAGE_PROPERTY
0,Flash Flood,2017,19118690000.0
1,Hurricane (Typhoon),2005,11112730000.0
2,Hurricane,2021,7588400000.0
3,Hurricane,2018,7143780000.0
4,Hurricane,2020,6702710000.0
5,Tornado,2011,3884651000.0
6,Hurricane (Typhoon),2008,3663098000.0
7,Flash Flood,2016,3645402000.0
8,Tropical Storm,2017,3645017000.0
9,Flood,2011,3621686000.0


In [47]:
# mean and sum of DAMAGE_PROPERTY_CONVERTED by EVENT_TYPE and YEAR
(storm_events
 .group_by(["EVENT_TYPE", "YEAR"])
 .aggregate([
     storm_events.DAMAGE_PROPERTY_CONVERTED.mean().name("DAMAGE_PROPERTY_AVG"),
     storm_events.DAMAGE_PROPERTY_CONVERTED.sum().name("DAMAGE_PROPERTY")])
 .sort_by(ibis.desc("DAMAGE_PROPERTY_AVG"))
 .limit(10)
)

Unnamed: 0,EVENT_TYPE,YEAR,DAMAGE_PROPERTY_AVG,DAMAGE_PROPERTY
0,Hurricane,2016,201733300.0,605200000.0
1,Hurricane,2021,199694700.0,7588400000.0
2,Hurricane,2020,113605300.0,6702710000.0
3,Hurricane,2018,106623600.0,7143780000.0
4,Hurricane,2017,82106490.0,3037940000.0
5,Tsunami,2009,81000000.0,81000000.0
6,Storm Surge/Tide,2021,72908670.0,3572525000.0
7,Hurricane (Typhoon),2008,50876350.0,3663098000.0
8,Storm Surge/Tide,2018,47735090.0,1050172000.0
9,Hurricane (Typhoon),1997,47223570.0,661130000.0


In [48]:
# mean and sum of DAMAGE_PROPERTY_CONVERTED by EVENT_TYPE
(storm_events
 .group_by(["EVENT_TYPE"])
 .aggregate([
     storm_events.DAMAGE_PROPERTY_CONVERTED.mean().name("DAMAGE_PROPERTY_AVG"),
     storm_events.DAMAGE_PROPERTY_CONVERTED.sum().name("DAMAGE_PROPERTY")])
 .sort_by(ibis.desc("DAMAGE_PROPERTY_AVG"))
 .limit(10)
)

Unnamed: 0,EVENT_TYPE,DAMAGE_PROPERTY_AVG,DAMAGE_PROPERTY
0,Hurricane,105236400.0,25151500000.0
1,Hurricane (Typhoon),23424310.0,24642370000.0
2,Storm Surge/Tide,10454420.0,9837608000.0
3,Tsunami,4146368.0,157562000.0
4,Tropical Storm,2325792.0,9733441000.0
5,Wildfire,1789479.0,10140980000.0
6,Ice Storm,928176.9,5897636000.0
7,Tornado,924067.0,61363590000.0
8,Coastal Flood,702319.8,1927868000.0
9,Debris Flow,679634.6,1296743000.0


## Cast

In [49]:
# cast DAMAGE_PROPERTY_CONVERTED to int
expr = storm_events.DAMAGE_PROPERTY_CONVERTED.cast("int32")

storm_events.mutate(DAMAGE_PROPERTY_CONVERTED_INT=expr)

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS,DAMAGE_PROPERTY_CONVERTED_INT
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,...,0,0,0,0,250K,0,,250000.0,$250000,250000
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,...,0,0,0,0,25K,0,,25000.0,$25000,25000
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,...,2,0,0,0,25K,0,,25000.0,$25000,25000
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,...,0,0,0,0,2.5K,0,,2500.0,$2500,2500
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,...,0,0,0,0,2.5K,0,,2500.0,$2500,2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10093724,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,103,NOBLE,,...,0,0,0,0,0,0,,0.0,$0,0
9996,10093725,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,65,JACKSON,,...,0,0,0,0,0,0,,0.0,$0,0
9997,10093726,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,83,LOGAN,,...,0,0,0,0,0,0,,0.0,$0,0
9998,10093727,OKLAHOMA,40,1956,April,Tornado,C,119,PAYNE,,...,0,0,0,0,250K,0,,250000.0,$250000,250000


## Other

In [50]:
# distinct values
storm_events.select("EVENT_TYPE").distinct()

Unnamed: 0,EVENT_TYPE
0,Tornado
1,Frost/Freeze
2,Extreme Cold/Wind Chill
3,Dense Smoke
4,Cold/Wind Chill
...,...
64,Coastal Flood
65,Avalanche
66,Astronomical Low Tide
67,Freezing Fog


In [51]:
# value counts of EVENT_TYPE
(storm_events
 .EVENT_TYPE
 .value_counts()
 .sort_by(ibis.desc("count"))
 .limit(20)
)

Unnamed: 0,EVENT_TYPE,count
0,Thunderstorm Wind,408310
1,Hail,242063
2,Flash Flood,79247
3,Tornado,66406
4,High Wind,59177
5,Winter Weather,52422
6,Flood,51073
7,Winter Storm,49599
8,Drought,43288
9,Heavy Snow,38561


In [52]:
# value counts of a boolean expression
# `value_counts` holds the named boolean column "major_damage" (True where
# DAMAGE_PROPERTY_CONVERTED exceeds the threshold), not the counts themselves
major_damage_threshold = 100_000_000
value_counts = (
    storm_events.DAMAGE_PROPERTY_CONVERTED > major_damage_threshold
).name("major_damage")

# add the boolean column to the table, then count its True/False values
flagged_events = storm_events.mutate(value_counts)
flagged_events.major_damage.value_counts()

Unnamed: 0,major_damage,count
0,False,1294724
1,True,469


In [53]:
# convert an expression to a SQL statement
# the aggregate is left unnamed, so it gets the default column name "sum",
# which is the name the sort key below refers to
total_damage = storm_events.DAMAGE_PROPERTY_CONVERTED.sum()
damage_by_year_and_type = (
    storm_events
    .group_by(["YEAR", "EVENT_TYPE"])
    .aggregate(total_damage)
)
expr = damage_by_year_and_type.sort_by(ibis.desc("sum")).limit(10)

ibis.to_sql(expr, dialect="postgres")

'SELECT\n  t0."YEAR",\n  t0."EVENT_TYPE",\n  t0.sum\nFROM (\n  SELECT\n    t1."YEAR" AS "YEAR",\n    t1."EVENT_TYPE" AS "EVENT_TYPE",\n    SUM(t1."DAMAGE_PROPERTY_CONVERTED") AS sum\n  FROM (\n    SELECT\n      t2."EVENT_ID" AS "EVENT_ID",\n      t2."STATE" AS "STATE",\n      t2."STATE_FIPS" AS "STATE_FIPS",\n      t2."YEAR" AS "YEAR",\n      t2."MONTH_NAME" AS "MONTH_NAME",\n      t2."EVENT_TYPE" AS "EVENT_TYPE",\n      t2."CZ_TYPE" AS "CZ_TYPE",\n      t2."CZ_FIPS" AS "CZ_FIPS",\n      t2."CZ_NAME" AS "CZ_NAME",\n      t2."WFO" AS "WFO",\n      t2."BEGIN_DATE_TIME" AS "BEGIN_DATE_TIME",\n      t2."END_DATE_TIME" AS "END_DATE_TIME",\n      t2."INJURIES_DIRECT" AS "INJURIES_DIRECT",\n      t2."INJURIES_INDIRECT" AS "INJURIES_INDIRECT",\n      t2."DEATHS_DIRECT" AS "DEATHS_DIRECT",\n      t2."DEATHS_INDIRECT" AS "DEATHS_INDIRECT",\n      t2."DAMAGE_PROPERTY" AS "DAMAGE_PROPERTY",\n      t2."DAMAGE_CROPS" AS "DAMAGE_CROPS",\n      t2."SOURCE" AS "SOURCE",\n      t2."DAMAGE_PROPERTY_CONVE

In [54]:
# is in
storm_events[storm_events.EVENT_TYPE.isin(["Hail"])]

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,10071654,NEBRASKA,31,1958,June,Hail,C,165,SIOUX,,...,18-JUN-58 14:00:00,0,0,0,0,0,0,,0.0,$0
1,10071660,NEBRASKA,31,1958,June,Hail,C,145,RED WILLOW,,...,30-JUN-58 19:10:00,0,0,0,0,0,0,,0.0,$0
2,10121841,TEXAS,48,1958,July,Hail,C,153,FLOYD,,...,31-JUL-58 19:45:00,0,0,0,0,0,0,,0.0,$0
3,10124075,TENNESSEE,47,1958,April,Hail,C,145,ROANE,,...,24-APR-58 17:50:00,0,0,0,0,0,0,,0.0,$0
4,10028944,KANSAS,20,1958,May,Hail,C,167,RUSSELL,,...,24-MAY-58 15:57:00,0,0,0,0,0,0,,0.0,$0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10136016,TEXAS,48,1969,June,Hail,C,205,HARTLEY,,...,17-JUN-69 18:00:00,0,0,0,0,0,0,,0.0,$0
9996,10136017,TEXAS,48,1969,June,Hail,C,375,POTTER,,...,17-JUN-69 20:55:00,0,0,0,0,0,0,,0.0,$0
9997,10136018,TEXAS,48,1969,June,Hail,C,191,HALL,,...,17-JUN-69 22:45:00,0,0,0,0,0,0,,0.0,$0
9998,10136019,TEXAS,48,1969,June,Hail,C,75,CHILDRESS,,...,17-JUN-69 23:45:00,0,0,0,0,0,0,,0.0,$0


In [55]:
# not in
storm_events[storm_events.EVENT_TYPE.notin(["Hail"])]

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,...,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0,$250000
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,...,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0,$25000
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,...,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0,$25000
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,...,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0,$2500
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,...,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0,$2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10029164,KANSAS,20,1962,July,Thunderstorm Wind,C,145,PAWNEE,,...,14-JUL-62 21:00:00,0,0,0,0,0,0,,0.0,$0
9996,10029165,KANSAS,20,1962,July,Thunderstorm Wind,C,9,BARTON,,...,14-JUL-62 21:30:00,0,0,0,0,0,0,,0.0,$0
9997,10029166,KANSAS,20,1962,July,Thunderstorm Wind,C,169,SALINE,,...,14-JUL-62 23:30:00,0,0,0,0,0,0,,0.0,$0
9998,9988125,FLORIDA,12,1962,May,Thunderstorm Wind,C,105,POLK,,...,11-MAY-62 17:00:00,0,0,0,0,0,0,,0.0,$0


In [56]:
# is null
storm_events[storm_events.SOURCE.isnull()]

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,10096222,OKLAHOMA,40,1950,April,Tornado,C,149,WASHITA,,...,28-APR-50 14:45:00,0,0,0,0,250K,0,,250000.0,$250000
1,10120412,TEXAS,48,1950,April,Tornado,C,93,COMANCHE,,...,29-APR-50 15:30:00,0,0,0,0,25K,0,,25000.0,$25000
2,10104927,PENNSYLVANIA,42,1950,July,Tornado,C,77,LEHIGH,,...,05-JUL-50 18:00:00,2,0,0,0,25K,0,,25000.0,$25000
3,10104928,PENNSYLVANIA,42,1950,July,Tornado,C,43,DAUPHIN,,...,05-JUL-50 18:30:00,0,0,0,0,2.5K,0,,2500.0,$2500
4,10104929,PENNSYLVANIA,42,1950,July,Tornado,C,39,CRAWFORD,,...,24-JUL-50 14:40:00,0,0,0,0,2.5K,0,,2500.0,$2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,10093724,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,103,NOBLE,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,$0
9996,10093725,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,65,JACKSON,,...,08-APR-56 23:00:00,0,0,0,0,0,0,,0.0,$0
9997,10093726,OKLAHOMA,40,1956,April,Thunderstorm Wind,C,83,LOGAN,,...,08-APR-56 23:02:00,0,0,0,0,0,0,,0.0,$0
9998,10093727,OKLAHOMA,40,1956,April,Tornado,C,119,PAYNE,,...,08-APR-56 23:30:00,0,0,0,0,250K,0,,250000.0,$250000


In [57]:
# not null
storm_events[storm_events.SOURCE.notnull()]

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,990000001,FLORIDA,12,1972,June,Tornado,C,43,GLADES,,...,18-JUN-72 15:17:00,6,0,0,0,190K,5K,Manual Input,190000.0,$190000
1,990000002,FLORIDA,12,1972,June,Tornado,C,51,HENDRY,,...,18-JUN-72 15:13:00,1,0,1,0,10K,5K,Manual Input,10000.0,$10000
2,990000003,FLORIDA,12,1972,June,Tornado,C,93,OKEECHOBEE,,...,18-JUN-72 22:55:00,44,0,6,0,500K,0,Manual Input,500000.0,$500000
3,10032307,Kentucky,21,1974,April,Tornado,C,215,SPENCER,,...,03-APR-74 17:31:00,0,0,0,0,25K,0,WFO,25000.0,$25000
4,10032301,Kentucky,21,1974,April,Tornado,C,93,HARDIN,,...,03-APR-74 16:55:00,57,0,2,0,250K,0,WFO,250000.0,$250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5678048,OKLAHOMA,40,1998,October,Tornado,C,3,ALFALFA,OUN,...,04-OCT-98 14:43:00,0,0,0,0,55K,,UNKNOWN,55000.0,$55000
9996,5678049,OKLAHOMA,40,1998,October,Tornado,C,11,BLAINE,OUN,...,04-OCT-98 17:12:00,0,0,0,0,50K,,UNKNOWN,50000.0,$50000
9997,5673341,NEBRASKA,31,1998,November,High Wind,Z,77,FILLMORE,GID,...,10-NOV-98 16:00:00,0,0,0,0,0,0,"AWOS,ASOS,MESONET,ETC",0.0,$0
9998,5676260,PENNSYLVANIA,42,1998,November,Heat,Z,67,CHESTER,PHI,...,30-NOV-98 23:59:00,0,0,0,0,0,0,"AWOS,ASOS,MESONET,ETC",0.0,$0


In [58]:
# dropna: discard rows that have a null in any of the listed columns;
# with only SOURCE listed, this drops every row whose SOURCE is null
storm_events.dropna(subset=["SOURCE"], how="any")

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,990000001,FLORIDA,12,1972,June,Tornado,C,43,GLADES,,...,18-JUN-72 15:17:00,6,0,0,0,190K,5K,Manual Input,190000.0,$190000
1,990000002,FLORIDA,12,1972,June,Tornado,C,51,HENDRY,,...,18-JUN-72 15:13:00,1,0,1,0,10K,5K,Manual Input,10000.0,$10000
2,990000003,FLORIDA,12,1972,June,Tornado,C,93,OKEECHOBEE,,...,18-JUN-72 22:55:00,44,0,6,0,500K,0,Manual Input,500000.0,$500000
3,10032307,Kentucky,21,1974,April,Tornado,C,215,SPENCER,,...,03-APR-74 17:31:00,0,0,0,0,25K,0,WFO,25000.0,$25000
4,10032301,Kentucky,21,1974,April,Tornado,C,93,HARDIN,,...,03-APR-74 16:55:00,57,0,2,0,250K,0,WFO,250000.0,$250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5678048,OKLAHOMA,40,1998,October,Tornado,C,3,ALFALFA,OUN,...,04-OCT-98 14:43:00,0,0,0,0,55K,,UNKNOWN,55000.0,$55000
9996,5678049,OKLAHOMA,40,1998,October,Tornado,C,11,BLAINE,OUN,...,04-OCT-98 17:12:00,0,0,0,0,50K,,UNKNOWN,50000.0,$50000
9997,5673341,NEBRASKA,31,1998,November,High Wind,Z,77,FILLMORE,GID,...,10-NOV-98 16:00:00,0,0,0,0,0,0,"AWOS,ASOS,MESONET,ETC",0.0,$0
9998,5676260,PENNSYLVANIA,42,1998,November,Heat,Z,67,CHESTER,PHI,...,30-NOV-98 23:59:00,0,0,0,0,0,0,"AWOS,ASOS,MESONET,ETC",0.0,$0


In [59]:
# contains: substring match on EVENT_TYPE; lower-casing the column first
# makes the match case-insensitive ("Flash Flood", "HAIL FLOODING", ...)
storm_events.filter(storm_events["EVENT_TYPE"].lower().contains("flood"))

Unnamed: 0,EVENT_ID,STATE,STATE_FIPS,YEAR,MONTH_NAME,EVENT_TYPE,CZ_TYPE,CZ_FIPS,CZ_NAME,WFO,...,END_DATE_TIME,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,DAMAGE_PROPERTY,DAMAGE_CROPS,SOURCE,DAMAGE_PROPERTY_CONVERTED,DAMAGE_PROPERTY_CONVERTED_DOLLARS
0,10347267,SOUTH CAROLINA,45,1994,April,THUNDERSTORM WINDS/FLOODING,C,33,DILLON,,...,15-APR-94 15:40:00,0,0,0,0,5K,0,,5000.0,$5000
1,10335845,NEVADA,32,1995,June,HAIL FLOODING,C,0,NVZ003 - 004,,...,01-JUN-95 19:10:00,0,0,0,0,0,0,,0.0,$0
2,10335846,NEVADA,32,1995,August,THUNDERSTORM WINDS/FLASH FLOOD,C,0,NVZ003 - 004,,...,05-AUG-95 21:00:00,0,0,0,0,0,0,,0.0,$0
3,10355544,TEXAS,48,1995,April,THUNDERSTORM WINDS/ FLOOD,C,489,WILLACY,,...,04-APR-95 19:30:00,0,0,0,0,10K,0,,10000.0,$10000
4,10355545,TEXAS,48,1995,April,THUNDERSTORM WINDS/ FLOOD,C,489,WILLACY,,...,04-APR-95 20:30:00,0,0,0,0,10K,30K,,10000.0,$10000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5306778,TEXAS,48,2002,July,Flash Flood,C,49,BROWN,SJT,...,03-JUL-02 11:30:00,0,0,0,0,7K,,LAW ENFORCEMENT,7000.0,$7000
9996,5307484,OREGON,41,2002,July,Flash Flood,C,63,WALLOWA,PDT,...,19-JUL-02 16:00:00,0,0,0,0,1K,,GENERAL PUBLIC,1000.0,$1000
9997,5306783,TEXAS,48,2002,July,Flash Flood,C,83,COLEMAN,SJT,...,04-JUL-02 05:00:00,0,0,0,0,4K,,LAW ENFORCEMENT,4000.0,$4000
9998,5306523,ALABAMA,1,2002,July,Flash Flood,C,73,JEFFERSON,BMX,...,12-JUL-02 22:00:00,0,0,0,0,25K,0K,EMERGENCY MANAGER,25000.0,$25000
