Installing and importing packages
- similar to RStudio, packages can be installed directly from within the notebook with `!pip install package_name`; on a local installation you can instead run `conda install package_name` (if you use Anaconda) or `pip install package_name` in your console
- to import packages we simply call `import package_name` to import the whole package
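  - additionally, we can import only the objects we need by calling `from package_name import package_function` (this keeps the code short and the namespace tidy; note that the package is still loaded completely, so it does not save RAM)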
  - to shorten package names we can import them with an alias: `import package_name as pn` -> useful for often used packages

Dataframes 
- work very similarly to data frames in R (not a coincidence)
- accessed through the `pandas` package -> has to be imported!
- however, slicing needs dedicated indexers:
  - `.iloc[row, column]` for position-based (integer index) slicing
  - `.loc[row, 'column']` for label-based slicing


In [None]:
# starting with pandas:
import pandas as pd # -> widely used abbreviation

In [None]:
# Column-oriented data: every keyword is a column name,
# every list holds the values of that column (one entry per row).
maja_dict = dict(
    ids=[1, 2, 3, 4, 5],
    name=['Maja', 'Flip', 'Thekla', 'Willi', 'Puck'],
    age=[5, 10, 20, 5, 2],
    types=['bee', 'grasshopper', 'spider', 'bee', 'fly'],
)

In [None]:
# turn the dictionary into a dataframe: the keys become the column names,
# the lists become the column values
pd.DataFrame(maja_dict)

Unnamed: 0,ids,name,age,types
0,1,Maja,5,bee
1,2,Flip,10,grasshopper
2,3,Thekla,20,spider
3,4,Willi,5,bee
4,5,Puck,2,fly


In [None]:
# A dataframe can be assembled in several different ways.
## Here: from pandas Series objects, where every Series becomes one row.
## No labels are supplied, so rows and columns are simply numbered from 0.
a, b, c, d, e = (
    pd.Series(values)
    for values in (
        [1, 'Maja', 5, 'bee'],
        [2, 'Flip', 10, 'grasshopper'],
        [3, 'Thekla', 20, 'spider'],
        [4, 'Willi', 5, 'bee'],
        [5, 'Puck', 2, 'fly'],
    )
)
maja_df = pd.DataFrame([a, b, c, d, e])

In [None]:
# display the result; the columns are only numbered because no column names were given
maja_df

Unnamed: 0,0,1,2,3
0,1,Maja,5,bee
1,2,Flip,10,grasshopper
2,3,Thekla,20,spider
3,4,Willi,5,bee
4,5,Puck,2,fly


In [None]:
## Another way to build a dataframe: a dictionary with one key per column.
# The key is used as the column name, the list stored under it as the column's values.
maja_dict = {}
maja_dict['id'] = [1, 2, 3, 4, 5]
maja_dict['name'] = ['Maja', 'Flip', 'Thekla', 'Willi', 'Puck']
maja_dict['age'] = [5, 10, 20, 5, 2]
maja_dict['type'] = ['bee', 'grasshopper', 'spider', 'bee', 'fly']

maja_df = pd.DataFrame(data=maja_dict)
print(maja_df)

   id    name  age         type
0   1    Maja    5          bee
1   2    Flip   10  grasshopper
2   3  Thekla   20       spider
3   4   Willi    5          bee
4   5    Puck    2          fly


In [None]:
## Yet another way: a list of dictionaries, where each dictionary describes one row.
# Handy when rows are collected one at a time before the dataframe is built.
maja_list = [
    dict(id=1, name='Maja', age=5, type='bee'),
    dict(id=2, name='Flip', age=10, type='grasshopper'),
    dict(id=3, name='Thekla', age=20, type='spider'),
    dict(id=4, name='Willi', age=5, type='bee'),
    dict(id=5, name='Puck', age=2, type='fly'),
]

maja_df = pd.DataFrame(data=maja_list)
print(maja_df)

   id    name  age         type
0   1    Maja    5          bee
1   2    Flip   10  grasshopper
2   3  Thekla   20       spider
3   4   Willi    5          bee
4   5    Puck    2          fly


In [None]:
# slicing, indexing etc.:
## rows:
# .iloc works with positions: 0 picks the first row, the bare colon keeps every column
print(maja_df.iloc[0,:]) # first row, note the colon for columns (->everything)

id         1
name    Maja
age        5
type     bee
Name: 0, dtype: object


In [None]:

## columns:
# Four equivalent ways of pulling out the 'name' column; each one prints the same Series.
# Attribute access (maja_df.column) is the shortest, but beware that some words
# already have a different meaning on a dataframe!
for name_column in (
    maja_df['name'],         # by column label
    maja_df.name,            # as an attribute
    maja_df.iloc[:,1],       # by position: all rows, second column
    maja_df.loc[:,'name'],   # by label: all rows, column 'name'
):
    print(name_column)

0      Maja
1      Flip
2    Thekla
3     Willi
4      Puck
Name: name, dtype: object
0      Maja
1      Flip
2    Thekla
3     Willi
4      Puck
Name: name, dtype: object
0      Maja
1      Flip
2    Thekla
3     Willi
4      Puck
Name: name, dtype: object
0      Maja
1      Flip
2    Thekla
3     Willi
4      Puck
Name: name, dtype: object


In [None]:
## both:
# rows at positions 2 and 3, columns at positions 1 and 2 (the end of a slice is not included)
print(maja_df.iloc[2:4,1:3])
# exercise: try to extract only Thekla's type!

     name  age
2  Thekla   20
3   Willi    5


In [None]:
# useful operations:
## types of content (one data type per column):
print(maja_df.dtypes)

dtypes: id       int64
name    object
age      int64
type    object
dtype: object
Index(['id', 'name', 'age', 'type'], dtype='object')


In [None]:
## all columns (e.g. for inspecting large dataframes)
# NOTE(review): the saved output of this cell also lists 'production_country' and 'age*name',
# which are only created in a later cell -- it looks like this cell was last run out of order
print(maja_df.columns)

Index(['id', 'name', 'age', 'type', 'production_country', 'age*name'], dtype='object')


In [None]:
## summary statistics (count, mean, std, min, quartiles, max) of the numeric columns:
maja_df.describe()

Unnamed: 0,id,age
count,5.0,5.0
mean,3.0,8.4
std,1.581139,7.092249
min,1.0,2.0
25%,2.0,5.0
50%,3.0,5.0
75%,4.0,10.0
max,5.0,20.0


In [None]:
## assigning new, e.g. calculated, columns:
# adding the country of production for the whole dataframe
# (the single value is repeated in every row)
maja_df['production_country'] = 'Japan'
# or something ridiculous: age * name
# (multiplying a number with a text repeats the text that many times)
maja_df['age*name'] = maja_df['age'] * maja_df['name']

In [None]:
# display the dataframe, now including the two new columns
maja_df

Unnamed: 0,id,name,age,type,production_country,age*name
0,1,Maja,5,bee,Japan,MajaMajaMajaMajaMaja
1,2,Flip,10,grasshopper,Japan,FlipFlipFlipFlipFlipFlipFlipFlipFlipFlip
2,3,Thekla,20,spider,Japan,TheklaTheklaTheklaTheklaTheklaTheklaTheklaThek...
3,4,Willi,5,bee,Japan,WilliWilliWilliWilliWilli
4,5,Puck,2,fly,Japan,PuckPuck


In [None]:
## looking at parts of the dataset:
print(maja_df.head(2)) # the first two rows
print(maja_df.tail(2)) # the last two rows

   id  name  age         type production_country  \
0   1  Maja    5          bee              Japan   
1   2  Flip   10  grasshopper              Japan   

                                   age*name  
0                      MajaMajaMajaMajaMaja  
1  FlipFlipFlipFlipFlipFlipFlipFlipFlipFlip  
   id   name  age type production_country                   age*name
3   4  Willi    5  bee              Japan  WilliWilliWilliWilliWilli
4   5   Puck    2  fly              Japan                   PuckPuck


In [None]:
## conditional filtering:
# only look at characters older than 5:
# the inner comparison gives True/False for every row, the outer brackets keep the True rows
print(
    maja_df[maja_df['age'] > 5] # I always read the first square brackets as "where..."
)

   id    name  age         type production_country  \
1   2    Flip   10  grasshopper              Japan   
2   3  Thekla   20       spider              Japan   

                                            age*name  
1           FlipFlipFlipFlipFlipFlipFlipFlipFlipFlip  
2  TheklaTheklaTheklaTheklaTheklaTheklaTheklaThek...  


In [None]:
# try to get everyone that is called Flip or Willi and older than 6:
# .isin() checks the name against a list; & means "and", each condition sits in its own parentheses
print(
    maja_df[(maja_df['name'].isin(['Willi', 'Flip'])) & (maja_df['age'] > 6)]
)

# alternatively, spelling out the name check with | ("or"):
#print(
#    maja_df[
#        ((maja_df['name']=='Willi') | (maja_df['name']=='Flip')) & 
#        (maja_df['age'] > 6)
#    ]
#)

   id  name  age         type production_country  \
1   2  Flip   10  grasshopper              Japan   

                                   age*name  
1  FlipFlipFlipFlipFlipFlipFlipFlipFlipFlip  


In [None]:
# sorting dataframes (here: by age, smallest value first)
print(maja_df.sort_values('age')) # look at the options, what would "ascending=False" do?

   id    name  age         type production_country  \
4   5    Puck    2          fly              Japan   
0   1    Maja    5          bee              Japan   
3   4   Willi    5          bee              Japan   
1   2    Flip   10  grasshopper              Japan   
2   3  Thekla   20       spider              Japan   

                                            age*name  
4                                           PuckPuck  
0                               MajaMajaMajaMajaMaja  
3                          WilliWilliWilliWilliWilli  
1           FlipFlipFlipFlipFlipFlipFlipFlipFlipFlip  
2  TheklaTheklaTheklaTheklaTheklaTheklaTheklaThek...  


In [None]:
!pip install pydataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydataset
  Downloading pydataset-0.2.0.tar.gz (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pydataset
  Building wheel for pydataset (setup.py) ... [?25l[?25hdone
  Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939416 sha256=a59d90f30230ee334fd4fad5054dd9f4a9993c5d5e3541dc4423647650f8cbfa
  Stored in directory: /root/.cache/pip/wheels/2b/83/5c/073c3755e8b7704e4677557b2055e61026c1a2342149214c13
Successfully built pydataset
Installing collected packages: pydataset
Successfully installed pydataset-0.2.0


In [None]:
# you can of course chain operations, similar to the tidy equivalent in R:
## import and assign sample data
from pydataset import data
new_df = data('mtcars')  # the 'mtcars' sample dataset, one row per car model

In [None]:
# initial look at the data (head() without an argument shows the first five rows)
print(new_df.head())

                    mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  \
Mazda RX4          21.0    6  160.0  110  3.90  2.620  16.46   0   1     4   
Mazda RX4 Wag      21.0    6  160.0  110  3.90  2.875  17.02   0   1     4   
Datsun 710         22.8    4  108.0   93  3.85  2.320  18.61   1   1     4   
Hornet 4 Drive     21.4    6  258.0  110  3.08  3.215  19.44   1   0     3   
Hornet Sportabout  18.7    8  360.0  175  3.15  3.440  17.02   0   0     3   

                   carb  
Mazda RX4             4  
Mazda RX4 Wag         4  
Datsun 710            1  
Hornet 4 Drive        1  
Hornet Sportabout     2  


In [None]:
# explain all the parts of the following!
# (.loc[rows, columns] filters the rows and picks the columns in a single step)
chained = (
    new_df
    .loc[new_df['cyl'] > 4, ['mpg', 'cyl', 'hp', 'gear']]
    .sort_values('hp', ascending=False)
)
print(chained.head(15))

                      mpg  cyl   hp  gear
Maserati Bora        15.0    8  335     5
Ford Pantera L       15.8    8  264     5
Camaro Z28           13.3    8  245     3
Duster 360           14.3    8  245     3
Chrysler Imperial    14.7    8  230     3
Lincoln Continental  10.4    8  215     3
Cadillac Fleetwood   10.4    8  205     3
Merc 450SL           17.3    8  180     3
Merc 450SLC          15.2    8  180     3
Merc 450SE           16.4    8  180     3
Hornet Sportabout    18.7    8  175     3
Pontiac Firebird     19.2    8  175     3
Ferrari Dino         19.7    6  175     5
Dodge Challenger     15.5    8  150     3
AMC Javelin          15.2    8  150     3


In [None]:
# summary statistics for the columns of the car data
new_df.describe()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


In [None]:
# check the current working directory; relative file names such as 'maja_df.csv'
# are resolved against this folder
import os
os.getcwd()

'/content'

In [None]:
# exporting and importing data:
# keep in mind that this is a Colab space, your data might not be saved to your local machine!
# index=False leaves the automatic row numbers out of the file; otherwise they would come
# back as an extra "Unnamed: 0" column when the file is read again with pd.read_csv
maja_df.to_csv('maja_df.csv', index=False)

In [None]:
# read the file back in; this replaces the in-memory maja_df with the content of the file
# NOTE: a CSV that was written including its row index gets an extra "Unnamed: 0" column here
maja_df = pd.read_csv('maja_df.csv')