# 05_02: pandas DataFrames and Series

In [1]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [2]:
!cat nobels.csv

1901,Chemistry,Jacobus H. van 't Hoff,1852-08-30
1901,Literature,Sully Prudhomme,1839-03-16
1901,Medicine,Emil von Behring,1854-03-15
1901,Peace,Frédéric Passy,1822-05-20
1901,Peace,Henry Dunant,1828-05-08
1901,Physics,Wilhelm Conrad Röntgen,1845-03-27
1902,Chemistry,Emil Fischer,1852-10-09
1902,Literature,Theodor Mommsen,1817-11-30
1902,Medicine,Ronald Ross,1857-05-13
1902,Peace,Albert Gobat,1843-05-21
1902,Peace,Élie Ducommun,1833-02-19
1902,Physics,Hendrik A. Lorentz,1853-07-18
1902,Physics,Pieter Zeeman,1865-05-25
1903,Chemistry,Svante Arrhenius,1859-02-19
1903,Literature,Bjørnstjerne Bjørnson,1832-12-08
1903,Medicine,Niels Ryberg Finsen,1860-12-15
1903,Peace,Randal Cremer,1828-03-18
1903,Physics,Henri Becquerel,1852-12-15
1903,Physics,Marie Curie,1867-11-07
1903,Physics,Pierre Curie,1859-05-15
1904,Chemistry,Sir William Ramsay,1852-10-02
1904,Literature,Frédéric Mistral,1830-09-08
1904,Literature,José Echegaray,1832-04-19
1904,Medicine,Ivan Pavlov,1849-09-14
1904,Peace,Institute o

In [3]:
# nobels.csv has no header row, so the column names are supplied here.
# pyarrow is used twice: as the CSV parser (engine) and as the storage
# backend for the resulting columns (dtype_backend).
nobels = pd.read_csv(
    'nobels.csv',
    names=['year', 'discipline', 'nobelist', 'DOB'],
    engine='pyarrow',
    dtype_backend='pyarrow',
)

In [4]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes, and memory usage.
nobels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype               
---  ------      --------------  -----               
 0   year        1000 non-null   int64[pyarrow]      
 1   discipline  1000 non-null   string[pyarrow]     
 2   nobelist    1000 non-null   string[pyarrow]     
 3   DOB         956 non-null    date32[day][pyarrow]
dtypes: date32[day][pyarrow](1), int64[pyarrow](1), string[pyarrow](2)
memory usage: 42.9 KB


In [5]:
# Show the first rows of the table (five by default).
nobels.head()

Unnamed: 0,year,discipline,nobelist,DOB
0,1901,Chemistry,Jacobus H. van 't Hoff,1852-08-30
1,1901,Literature,Sully Prudhomme,1839-03-16
2,1901,Medicine,Emil von Behring,1854-03-15
3,1901,Peace,Frédéric Passy,1822-05-20
4,1901,Peace,Henry Dunant,1828-05-08


In [6]:
# Show the last rows of the table (five by default).
nobels.tail()

Unnamed: 0,year,discipline,nobelist,DOB
995,2023,Medicine,Katalin Karikó,1955-01-17
996,2023,Peace,Narges Mohammadi,1972-04-21
997,2023,Physics,Anne L’Huillier,1958-08-16
998,2023,Physics,Ferenc Krausz,1962-05-17
999,2023,Physics,Pierre Agostini,1941-07-23


In [7]:
# len() of a DataFrame is its number of rows.
len(nobels)

1000

In [8]:
# Column labels, held in a pandas Index.
nobels.columns

Index(['year', 'discipline', 'nobelist', 'DOB'], dtype='object')

In [9]:
# Per-column dtypes, returned as a Series indexed by column name.
nobels.dtypes

year                int64[pyarrow]
discipline         string[pyarrow]
nobelist           string[pyarrow]
DOB           date32[day][pyarrow]
dtype: object

In [10]:
# Row labels; no index column was requested when reading the CSV, so this is the default RangeIndex.
nobels.index

RangeIndex(start=0, stop=1000, step=1)

In [11]:
# Bracket indexing with a column name returns that column as a Series.
nobels['discipline']

0       Chemistry
1      Literature
2        Medicine
3           Peace
4           Peace
          ...    
995      Medicine
996         Peace
997       Physics
998       Physics
999       Physics
Name: discipline, Length: 1000, dtype: string[pyarrow]

In [12]:
# Attribute access is shorthand for nobels['nobelist']; it only works when the column
# name is a valid Python identifier that does not clash with a DataFrame attribute.
nobels.nobelist

0      Jacobus H. van 't Hoff
1             Sully Prudhomme
2            Emil von Behring
3              Frédéric Passy
4                Henry Dunant
                ...          
995            Katalin Karikó
996          Narges Mohammadi
997           Anne L’Huillier
998             Ferenc Krausz
999           Pierre Agostini
Name: nobelist, Length: 1000, dtype: string[pyarrow]

In [13]:
# .values exposes the array behind the Series (an ArrowExtensionArray for this
# pyarrow-backed column); keep only the first 50 entries for display.
nobels.discipline.values[:50]

<ArrowExtensionArray>
[ 'Chemistry', 'Literature',   'Medicine',      'Peace',      'Peace',
    'Physics',  'Chemistry', 'Literature',   'Medicine',      'Peace',
      'Peace',    'Physics',    'Physics',  'Chemistry', 'Literature',
   'Medicine',      'Peace',    'Physics',    'Physics',    'Physics',
  'Chemistry', 'Literature', 'Literature',   'Medicine',      'Peace',
    'Physics',  'Chemistry', 'Literature',   'Medicine',      'Peace',
    'Physics',  'Chemistry', 'Literature',   'Medicine',   'Medicine',
      'Peace',    'Physics',  'Chemistry', 'Literature',   'Medicine',
      'Peace',      'Peace',    'Physics',  'Chemistry', 'Literature',
   'Medicine',   'Medicine',      'Peace',      'Peace',    'Physics']
Length: 50, dtype: string[pyarrow]

In [14]:
# Convert the Arrow-backed array to a plain NumPy array, then keep the first 50 entries.
nobels.year.values.to_numpy()[:50]

array([1901, 1901, 1901, 1901, 1901, 1901, 1902, 1902, 1902, 1902, 1902,
       1902, 1902, 1903, 1903, 1903, 1903, 1903, 1903, 1903, 1904, 1904,
       1904, 1904, 1904, 1904, 1905, 1905, 1905, 1905, 1905, 1906, 1906,
       1906, 1906, 1906, 1906, 1907, 1907, 1907, 1907, 1907, 1907, 1908,
       1908, 1908, 1908, 1908, 1908, 1908])

In [15]:
# Distinct values of the column, in order of first appearance.
nobels.discipline.unique()

<ArrowExtensionArray>
['Chemistry', 'Literature', 'Medicine', 'Peace', 'Physics', 'Economics']
Length: 6, dtype: string[pyarrow]

In [16]:
# Count how many times each name occurs, most frequent first (reveals repeat laureates).
nobels.nobelist.value_counts()

nobelist
International Committee of the Red Cross                       3
Marie Curie                                                    2
Linus Pauling                                                  2
Office of the United Nations High Commissioner for Refugees    2
John Bardeen                                                   2
                                                              ..
Katalin Karikó                                                 1
Narges Mohammadi                                               1
Anne L’Huillier                                                1
Ferenc Krausz                                                  1
Pierre Agostini                                                1
Name: count, Length: 992, dtype: int64[pyarrow]

In [17]:
# Boolean-mask selection: each comparison yields a boolean Series, and the two are
# combined element-wise with & (not `and`). The parentheses are required because
# & binds more tightly than == and >.
nobels[(nobels.discipline == 'Physics') & (nobels.year > 2019)]

Unnamed: 0,year,discipline,nobelist,DOB
959,2020,Physics,Andrea Ghez,1965-06-16
960,2020,Physics,Reinhard Genzel,1952-03-24
961,2020,Physics,Roger Penrose,1931-08-08
972,2021,Physics,Giorgio Parisi,1948-08-04
973,2021,Physics,Klaus Hasselmann,1931-10-25
974,2021,Physics,Syukuro Manabe,1931-09-21
986,2022,Physics,Alain Aspect,1947-06-15
987,2022,Physics,Anton Zeilinger,1945-05-20
988,2022,Physics,John Clauser,1942-12-01
997,2023,Physics,Anne L’Huillier,1958-08-16


In [18]:
# The same kind of row filter written as a query string: column names appear as
# bare identifiers and the conditions are joined with `and`.
nobels.query('discipline == "Chemistry" and year < 1910')

  nobels.query('discipline == "Chemistry" and year < 1910')


Unnamed: 0,year,discipline,nobelist,DOB
0,1901,Chemistry,Jacobus H. van 't Hoff,1852-08-30
6,1902,Chemistry,Emil Fischer,1852-10-09
13,1903,Chemistry,Svante Arrhenius,1859-02-19
20,1904,Chemistry,Sir William Ramsay,1852-10-02
26,1905,Chemistry,Adolf von Baeyer,1835-10-31
31,1906,Chemistry,Henri Moissan,1852-09-28
37,1907,Chemistry,Eduard Buchner,1860-05-20
43,1908,Chemistry,Ernest Rutherford,1871-08-30
50,1909,Chemistry,Wilhelm Ostwald,1853-09-02


In [19]:
# Pitfall demo: Python's `in` applied to a Series tests the index labels, not the
# values, so the expression below is a single False rather than a row mask, and
# nobels[False] is then a lookup of a column named False.
# The expected KeyError is caught and printed so Restart & Run All does not stop here.
try:
    nobels['Curie' in nobels.nobelist]
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: False

In [20]:
# Element-wise substring test via the .str accessor; the resulting boolean Series
# is used as a row mask.
nobels[nobels.nobelist.str.contains('Curie')]

Unnamed: 0,year,discipline,nobelist,DOB
18,1903,Physics,Marie Curie,1867-11-07
19,1903,Physics,Pierre Curie,1859-05-15
62,1911,Chemistry,Marie Curie,1867-11-07
179,1935,Chemistry,Irène Joliot-Curie,1897-09-12


In [21]:
# Load a NumPy array that was saved in .npy format.
disco = np.load('discography.npy')

In [22]:
# Display the loaded array: a structured array with fields title, release and toprank.
disco

array([('David Bowie', '1969-11-14', 17),
       ('The Man Who Sold the World', '1970-11-04',  3),
       ('Hunky Dory', '1971-12-17',  5),
       ('Ziggy Stardust', '1972-06-16',  1),
       ('Aladdin Sane', '1973-04-13',  1), ('Pin Ups', '1973-10-19',  1),
       ('Diamond Dogs', '1974-05-24',  1),
       ('Young Americans', '1975-03-07',  2),
       ('Station To Station', '1976-01-23',  5),
       ('Low', '1977-01-14',  2), ('Heroes', '1977-10-14',  3),
       ('Lodger', '1979-05-18',  4)],
      dtype=[('title', '<U32'), ('release', '<M8[D]'), ('toprank', '<i8')])

In [23]:
# Build a DataFrame from the structured array; its field names become the column names.
disco_df = pd.DataFrame(disco)

In [24]:
# Preview the first rows of the new frame.
disco_df.head()

Unnamed: 0,title,release,toprank
0,David Bowie,1969-11-14,17
1,The Man Who Sold the World,1970-11-04,3
2,Hunky Dory,1971-12-17,5
3,Ziggy Stardust,1972-06-16,1
4,Aladdin Sane,1973-04-13,1


In [25]:
# Inspect the column dtypes that resulted from converting the structured array.
disco_df.dtypes

title             object
release    datetime64[s]
toprank            int64
dtype: object

In [26]:
# Switch every column to a pyarrow-backed dtype; convert_dtypes returns a new
# frame, so the name is rebound to the result.
disco_df = disco_df.convert_dtypes(dtype_backend='pyarrow')

In [27]:
# Check the column dtypes again after the conversion.
disco_df.dtypes

title            string[pyarrow]
release    timestamp[s][pyarrow]
toprank           int64[pyarrow]
dtype: object

In [28]:
# Build a DataFrame from a list of records: one dict per row,
# with the dict keys becoming the column names.
pd.DataFrame(
    data=[
        {'title': 'David Bowie', 'year': 1969},
        {'title': 'The Man Who Sold the World', 'year': 1970},
        {'title': 'Hunky Dory', 'year': 1971},
    ]
)

Unnamed: 0,title,year
0,David Bowie,1969
1,The Man Who Sold the World,1970
2,Hunky Dory,1971


In [29]:
# Build a DataFrame from a list of tuples: one tuple per row. Tuples carry no
# field names, so the column labels are passed separately.
pd.DataFrame(
    data=[
        ('Ziggy Stardust', 1),
        ('Aladdin Sane', 1),
        ('Pin Ups', 1),
    ],
    columns=['title', 'toprank'],
)

Unnamed: 0,title,toprank
0,Ziggy Stardust,1
1,Aladdin Sane,1
2,Pin Ups,1


In [30]:
# Build a DataFrame column by column: each dict key is a column name and each
# list holds that column's values (the lists must have equal length).
pd.DataFrame(
    data={
        'title': [
            'David Bowie', 'The Man Who Sold the World', 'Hunky Dory', 'Ziggy Stardust',
            'Aladdin Sane', 'Pin Ups', 'Diamond Dogs', 'Young Americans',
            'Station To Station', 'Low', 'Heroes', 'Lodger',
        ],
        'release': [
            '1969-11-14', '1970-11-04', '1971-12-17', '1972-06-16',
            '1973-04-13', '1973-10-19', '1974-05-24', '1975-03-07',
            '1976-01-23', '1977-01-14', '1977-10-14', '1979-05-18',
        ],
    }
)

Unnamed: 0,title,release
0,David Bowie,1969-11-14
1,The Man Who Sold the World,1970-11-04
2,Hunky Dory,1971-12-17
3,Ziggy Stardust,1972-06-16
4,Aladdin Sane,1973-04-13
5,Pin Ups,1973-10-19
6,Diamond Dogs,1974-05-24
7,Young Americans,1975-03-07
8,Station To Station,1976-01-23
9,Low,1977-01-14
