In [1]:
!pip install polars -U

Collecting polars
  Downloading polars-0.20.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.6/28.6 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 0.17.3
    Uninstalling polars-0.17.3:
      Successfully uninstalled polars-0.17.3
Successfully installed polars-0.20.3


# Polars - wczytywanie danych / tworzenie obiektów DataFrame

## Autor: Marian Witkowski marian.witkowski[at]gmail.com

### Niniejszy materiał może być używany w celach dydaktycznych pod warunkem poinformowania o autorze.

Przykład użycia najczęstszych opcji w funkcji pl.read_csv() w Polars:

- ładowanie danych z CSV
- ładowanie danych z CSV bez wiersza nagłówkowego
- ładowanie wybranych kolumn
- pobieranie zadanej liczby wierszy, pomijanie wierszy początkowych
- interpretacja wskazanych wartości jako null (brak danych)
- ładowanie danych z określaniem typów dla kolumn

In [2]:
import datetime
import numpy as np
import polars as pl
import warnings

warnings.filterwarnings("ignore")


## Ładowanie danych z pliku CSV


In [3]:
# Read the whole f500.csv file straight from its URL. No dtypes are given,
# so Polars infers one per column; column names come from the first line.
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500.csv"
df = pl.read_csv(
    url,
    separator=",",  # field delimiter, stated explicitly
)
df.head()  # preview the first rows

company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
str,i64,i64,f64,f64,i64,f64,str,str,str,i64,str,str,str,i64,i64,i64
"""Walmart""",1,485873,0.8,13643.0,198825,-7.2,"""C. Douglas McM…","""General Mercha…","""Retailing""",1,"""USA""","""Bentonville, A…","""http://www.wal…",23,2300000,77798
"""State Grid""",2,315199,-4.4,9571.3,489838,-6.2,"""Kou Wei""","""Utilities""","""Energy""",2,"""China""","""Beijing, China…","""http://www.sgc…",17,926067,209456
"""Sinopec Group""",3,267518,-9.1,1257.9,310726,-65.0,"""Wang Yupu""","""Petroleum Refi…","""Energy""",4,"""China""","""Beijing, China…","""http://www.sin…",19,713288,106523
"""China National…",4,262573,-12.3,1867.5,585619,-73.7,"""Zhang Jianhua""","""Petroleum Refi…","""Energy""",3,"""China""","""Beijing, China…","""http://www.cnp…",17,1512048,301893
"""Toyota Motor""",5,254694,7.7,16899.3,437575,-12.3,"""Akio Toyoda""","""Motor Vehicles…","""Motor Vehicles…",8,"""Japan""","""Toyota, Japan""","""http://www.toy…",23,364445,157210


## Ładowanie danych z pliku CSV bez wiersza z nazwami kolumn


In [4]:
# f500nh.csv carries no header line: with has_header=False the first line is
# read as data and the columns receive the default names column_1, column_2, ...
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500nh.csv"
df = pl.read_csv(
    url,
    has_header=False,
    separator=",",
)
df.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17
str,i64,i64,f64,f64,i64,f64,str,str,str,i64,str,str,str,i64,i64,i64
"""Walmart""",1,485873,0.8,13643.0,198825,-7.2,"""C. Douglas McM…","""General Mercha…","""Retailing""",1,"""USA""","""Bentonville, A…","""http://www.wal…",23,2300000,77798
"""State Grid""",2,315199,-4.4,9571.3,489838,-6.2,"""Kou Wei""","""Utilities""","""Energy""",2,"""China""","""Beijing, China…","""http://www.sgc…",17,926067,209456
"""Sinopec Group""",3,267518,-9.1,1257.9,310726,-65.0,"""Wang Yupu""","""Petroleum Refi…","""Energy""",4,"""China""","""Beijing, China…","""http://www.sin…",19,713288,106523
"""China National…",4,262573,-12.3,1867.5,585619,-73.7,"""Zhang Jianhua""","""Petroleum Refi…","""Energy""",3,"""China""","""Beijing, China…","""http://www.cnp…",17,1512048,301893
"""Toyota Motor""",5,254694,7.7,16899.3,437575,-12.3,"""Akio Toyoda""","""Motor Vehicles…","""Motor Vehicles…",8,"""Japan""","""Toyota, Japan""","""http://www.toy…",23,364445,157210



## Ładowanie danych z pliku CSV bez wiersza z nazwami kolumn, przypisanie kolumn

In [5]:
# Names to attach to the header-less file, listed in the file's column order.
columns = [
    'company', 'rank', 'revenues', 'revenue_change',
    'profits', 'assets', 'profit_change', 'ceo',
    'industry', 'sector', 'previous_rank', 'country',
    'hq_location', 'website', 'years_on_global_500_list', 'employees',
    'total_stockholder_equity',
]
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500nh.csv"
# has_header=False keeps the first line as data; new_columns supplies the
# names in place of the default column_N ones.
df = pl.read_csv(
    url,
    has_header=False,
    new_columns=columns,
    separator=",",
)
df.head()

company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
str,i64,i64,f64,f64,i64,f64,str,str,str,i64,str,str,str,i64,i64,i64
"""Walmart""",1,485873,0.8,13643.0,198825,-7.2,"""C. Douglas McM…","""General Mercha…","""Retailing""",1,"""USA""","""Bentonville, A…","""http://www.wal…",23,2300000,77798
"""State Grid""",2,315199,-4.4,9571.3,489838,-6.2,"""Kou Wei""","""Utilities""","""Energy""",2,"""China""","""Beijing, China…","""http://www.sgc…",17,926067,209456
"""Sinopec Group""",3,267518,-9.1,1257.9,310726,-65.0,"""Wang Yupu""","""Petroleum Refi…","""Energy""",4,"""China""","""Beijing, China…","""http://www.sin…",19,713288,106523
"""China National…",4,262573,-12.3,1867.5,585619,-73.7,"""Zhang Jianhua""","""Petroleum Refi…","""Energy""",3,"""China""","""Beijing, China…","""http://www.cnp…",17,1512048,301893
"""Toyota Motor""",5,254694,7.7,16899.3,437575,-12.3,"""Akio Toyoda""","""Motor Vehicles…","""Motor Vehicles…",8,"""Japan""","""Toyota, Japan""","""http://www.toy…",23,364445,157210


## Ładowanie wybranych kolumn z pliku CSV


In [6]:
# Load a subset of the file: only the columns listed in `columns=` end up in
# the resulting DataFrame.
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500.csv"
df = pl.read_csv(
    url,
    separator=",",
    columns=[
        'company',
        'rank',
        'revenues',
        'revenue_change',
        'profits',
    ],
)
df.head()

company,rank,revenues,revenue_change,profits
str,i64,i64,f64,f64
"""Walmart""",1,485873,0.8,13643.0
"""State Grid""",2,315199,-4.4,9571.3
"""Sinopec Group""",3,267518,-9.1,1257.9
"""China National…",4,262573,-12.3,1867.5
"""Toyota Motor""",5,254694,7.7,16899.3


## Ładowanie pliku CSV - pobieranie określonej liczby wierszy


In [7]:
# Stop reading after the first 10 data rows; the whole (small) frame is then
# displayed instead of a head() preview.
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500.csv"
df = pl.read_csv(
    url,
    n_rows=10,
    separator=",",
)
df

company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
str,i64,i64,f64,f64,i64,f64,str,str,str,i64,str,str,str,i64,i64,i64
"""Walmart""",1,485873,0.8,13643.0,198825,-7.2,"""C. Douglas McM…","""General Mercha…","""Retailing""",1,"""USA""","""Bentonville, A…","""http://www.wal…",23,2300000,77798
"""State Grid""",2,315199,-4.4,9571.3,489838,-6.2,"""Kou Wei""","""Utilities""","""Energy""",2,"""China""","""Beijing, China…","""http://www.sgc…",17,926067,209456
"""Sinopec Group""",3,267518,-9.1,1257.9,310726,-65.0,"""Wang Yupu""","""Petroleum Refi…","""Energy""",4,"""China""","""Beijing, China…","""http://www.sin…",19,713288,106523
"""China National…",4,262573,-12.3,1867.5,585619,-73.7,"""Zhang Jianhua""","""Petroleum Refi…","""Energy""",3,"""China""","""Beijing, China…","""http://www.cnp…",17,1512048,301893
"""Toyota Motor""",5,254694,7.7,16899.3,437575,-12.3,"""Akio Toyoda""","""Motor Vehicles…","""Motor Vehicles…",8,"""Japan""","""Toyota, Japan""","""http://www.toy…",23,364445,157210
"""Volkswagen""",6,240264,1.5,5937.3,432116,,"""Matthias Mulle…","""Motor Vehicles…","""Motor Vehicles…",7,"""Germany""","""Wolfsburg, Ger…","""http://www.vol…",23,626715,97753
"""Royal Dutch Sh…",7,240033,-11.8,4575.0,411275,135.9,"""Ben van Beurde…","""Petroleum Refi…","""Energy""",5,"""Netherlands""","""The Hague, Net…","""http://www.she…",23,89000,186646
"""Berkshire Hath…",8,223604,6.1,24074.0,620854,,"""Warren E. Buff…","""Insurance: Pro…","""Financials""",11,"""USA""","""Omaha, NE""","""http://www.ber…",21,367700,283001
"""Apple""",9,215639,-7.7,45687.0,321686,-14.4,"""Timothy D. Coo…","""Computers, Off…","""Technology""",9,"""USA""","""Cupertino, CA""","""http://www.app…",15,116000,128249
"""Exxon Mobil""",10,205004,-16.7,7840.0,330314,-51.5,"""Darren W. Wood…","""Petroleum Refi…","""Energy""",6,"""USA""","""Irving, TX""","""http://www.exx…",23,72700,167325


## Ładowanie pliku CSV - pomijanie linii początkowych / końcowych oraz linii komentarzy


In [9]:
# Skip leading rows and ignore comment lines while reading.
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500.csv"
df = pl.read_csv(
    url,
    separator=",",
    has_header=False,           # do not take column names from the file
    skip_rows_after_header=10,  # number of initial rows to skip
    comment_prefix="#",         # lines starting with '#' are treated as comments
)
# NOTE(review): because has_header=False the columns carry the default
# column_1 ... column_17 names, and the file's own header line presumably
# counts among the skipped rows (the stored output starts at rank 10).
# Confirm this is intended rather than keeping the header and skipping data rows.
df

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17
str,i64,i64,f64,f64,i64,f64,str,str,str,i64,str,str,str,i64,i64,i64
"""Exxon Mobil""",10,205004,-16.7,7840.0,330314,-51.5,"""Darren W. Wood…","""Petroleum Refi…","""Energy""",6,"""USA""","""Irving, TX""","""http://www.exx…",23,72700,167325
"""McKesson""",11,198533,3.1,5070.0,60969,124.5,"""John H. Hammer…","""Wholesalers: H…","""Wholesalers""",12,"""USA""","""San Francisco,…","""http://www.mck…",23,64500,11095
"""BP""",12,186606,-17.4,115.0,263316,,"""Robert W. Dudl…","""Petroleum Refi…","""Energy""",10,"""Britain""","""London, Britai…","""http://www.bp.…",23,74500,95286
"""UnitedHealth G…",13,184840,17.7,7017.0,122810,20.7,"""Stephen J. Hem…","""Health Care: I…","""Health Care""",17,"""USA""","""Minnetonka, MN…","""http://www.uni…",21,230000,38274
"""CVS Health""",14,177526,15.8,5317.0,94462,1.5,"""Larry J. Merlo…","""Health Care: P…","""Health Care""",18,"""USA""","""Woonsocket, RI…","""http://www.cvs…",22,204000,36830
"""Samsung Electr…",15,173957,-2.0,19316.5,217104,16.8,"""Oh-Hyun Kwon""","""Electronics, E…","""Technology""",13,"""South Korea""","""Suwon, South K…","""http://www.sam…",23,325000,154376
"""Glencore""",16,173883,2.0,1379.0,124600,,"""Ivan Glasenber…","""Mining, Crude-…","""Energy""",14,"""Switzerland""","""Baar, Switzerl…","""http://www.gle…",7,93123,44243
"""Daimler""",17,169483,2.2,9428.4,256262,0.9,"""Dieter Zetsche…","""Motor Vehicles…","""Motor Vehicles…",16,"""Germany""","""Stuttgart, Ger…","""http://www.dai…",23,282488,61116
"""General Motors…",18,166380,9.2,9427.0,221690,-2.7,"""Mary T. Barra""","""Motor Vehicles…","""Motor Vehicles…",20,"""USA""","""Detroit, MI""","""http://www.gm.…",23,225000,43836
"""AT&T""",19,163786,11.6,12976.0,403821,-2.8,"""Randall L. Ste…","""Telecommunicat…","""Telecommunicat…",23,"""USA""","""Dallas, TX""","""http://www.att…",23,268540,123135


## Ładowanie pliku CSV - interpretacja wskazanych wartości jako null (brak danych)

In [10]:
# Treat placeholder markers as missing data: any field equal to '?' or '*'
# is loaded as null instead of as a literal string.
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500.csv"
df = pl.read_csv(
    url,
    null_values=['?', '*'],
    separator=",",
)
df.head()


company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
str,i64,i64,f64,f64,i64,f64,str,str,str,i64,str,str,str,i64,i64,i64
"""Walmart""",1,485873,0.8,13643.0,198825,-7.2,"""C. Douglas McM…","""General Mercha…","""Retailing""",1,"""USA""","""Bentonville, A…","""http://www.wal…",23,2300000,77798
"""State Grid""",2,315199,-4.4,9571.3,489838,-6.2,"""Kou Wei""","""Utilities""","""Energy""",2,"""China""","""Beijing, China…","""http://www.sgc…",17,926067,209456
"""Sinopec Group""",3,267518,-9.1,1257.9,310726,-65.0,"""Wang Yupu""","""Petroleum Refi…","""Energy""",4,"""China""","""Beijing, China…","""http://www.sin…",19,713288,106523
"""China National…",4,262573,-12.3,1867.5,585619,-73.7,"""Zhang Jianhua""","""Petroleum Refi…","""Energy""",3,"""China""","""Beijing, China…","""http://www.cnp…",17,1512048,301893
"""Toyota Motor""",5,254694,7.7,16899.3,437575,-12.3,"""Akio Toyoda""","""Motor Vehicles…","""Motor Vehicles…",8,"""Japan""","""Toyota, Japan""","""http://www.toy…",23,364445,157210


## Ładowanie pliku CSV - określanie typu kolumn

In [15]:
# Override the inferred dtype of selected columns. 'rank' and 'revenue_change'
# are given the types they were already shown with in the earlier cells
# (i64 / f64); the visible change is 'previous_rank', which becomes u16.
# NOTE(review): `dtypes` is the keyword in the Polars 0.20.3 installed at the
# top of this notebook; later releases are believed to rename it to
# `schema_overrides` - confirm before upgrading.
url = "https://raw.githubusercontent.com/marianwitkowski/bs-datasets/main/f500.csv"
df = pl.read_csv(
    url,
    separator=",",
    dtypes={
        'rank': pl.Int64,
        'revenue_change': pl.Float64,
        'previous_rank': pl.UInt16,
    },
)
df.head()

company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
str,i64,i64,f64,f64,i64,f64,str,str,str,u16,str,str,str,i64,i64,i64
"""Walmart""",1,485873,0.8,13643.0,198825,-7.2,"""C. Douglas McM…","""General Mercha…","""Retailing""",1,"""USA""","""Bentonville, A…","""http://www.wal…",23,2300000,77798
"""State Grid""",2,315199,-4.4,9571.3,489838,-6.2,"""Kou Wei""","""Utilities""","""Energy""",2,"""China""","""Beijing, China…","""http://www.sgc…",17,926067,209456
"""Sinopec Group""",3,267518,-9.1,1257.9,310726,-65.0,"""Wang Yupu""","""Petroleum Refi…","""Energy""",4,"""China""","""Beijing, China…","""http://www.sin…",19,713288,106523
"""China National…",4,262573,-12.3,1867.5,585619,-73.7,"""Zhang Jianhua""","""Petroleum Refi…","""Energy""",3,"""China""","""Beijing, China…","""http://www.cnp…",17,1512048,301893
"""Toyota Motor""",5,254694,7.7,16899.3,437575,-12.3,"""Akio Toyoda""","""Motor Vehicles…","""Motor Vehicles…",8,"""Japan""","""Toyota, Japan""","""http://www.toy…",23,364445,157210
