# Einführung in Data Handling

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Pandas

* "pandas is a Python package **providing fast, flexible, and expressive data structures** designed to make working with 'relational' or 'labeled' data both easy and intuitive."
* "built on top of NumPy"
* "It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python."
* "pandas is well suited for many different kinds of data:
  * **Tabular data with heterogeneously-typed columns**, as in an SQL table or Excel spreadsheet
  * Ordered and unordered (not necessarily fixed-frequency) **time series data**.
  * Arbitrary **matrix data** (homogeneously typed or heterogeneous) with row and column labels
  * Any other form of observational / statistical data sets. The data actually need not be labeled at all to be placed into a pandas data structure"
* Primary **data structures**:
  * **Series** (1-dimensional)
  * **DataFrame** (2-dimensional)

In [2]:
pd.Series(np.arange(5), index=["a", "b", "c", "d", "e"])

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [3]:
# creating a Series using a dict
d = {"a": 0.0, "b": 1.0, "c": 2.0}
pd.Series(d)

a    0.0
b    1.0
c    2.0
dtype: float64

In [4]:
# creating a Series from a dict but in a specified order
pd.Series(d, index=["b", "c", "d", "a"])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

#### Series verhalten sich wie arrays oder dictionaries

In [5]:
s = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
s

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [6]:
s["a"]

0.25

In [7]:
# select value at position
s[0]

0.25

In [8]:
# slice Series
s[:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [9]:
# select elements with a list of positional numbers (array-based indexing)
s[[3, 1]]

d    1.0
b    0.5
dtype: float64

In [10]:
# converting the Series to a pandas array (ExtensionArray) without the index
s.values

array([0.25, 0.5 , 0.75, 1.  ])

In [11]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

#### Initialiserung mit dict

In [12]:
population_dict = {
    "California": 38332521,
    "Texas": 26448193,
    "New York": 19651127,
    "Florida": 19552860,
    "Illinois": 12882135,
}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [13]:
# select value at index (label)
population["California"]

38332521

In [14]:
population["Texas"] = population["Texas"] + 2
population

California    38332521
Texas         26448195
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

### DataFrames

Das wichtigste pandas object.

In [15]:
area_dict = {
    "California": 423967,
    "Texas": 695662,
    "New York": 141297,
    "Florida": 170312,
    "Illinois": 149995,
}
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [16]:
df = pd.DataFrame({"population": population, "area": area})
df

Unnamed: 0,population,area
California,38332521,423967
Texas,26448195,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [17]:
df.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [18]:
df.columns

Index(['population', 'area'], dtype='object')

In [19]:
df.shape

(5, 2)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, California to Illinois
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   population  5 non-null      int64
 1   area        5 non-null      int64
dtypes: int64(2)
memory usage: 292.0+ bytes


In [21]:
# holt die ersten beiden Zeilen
df.head(2)

Unnamed: 0,population,area
California,38332521,423967
Texas,26448195,695662


In [22]:
# holt die letzte Zeile
df.tail(1)

Unnamed: 0,population,area
Illinois,12882135,149995


In [23]:
# get a NumPy representation of the DataFrame
df.values

array([[38332521,   423967],
       [26448195,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]])

In [24]:
# transpose the dataframe
df.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332521,26448195,19651127,19552860,12882135
area,423967,695662,141297,170312,149995


#### Select DataFrame columns
#### mit ```[]```

In [25]:
df["area"] # returns series

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [26]:
df[["area"]] # returns dataframe

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


In [27]:
# select multiple columns of a DataFrame
df[["population", "area"]]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448195,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


##### Select rows using ```loc```

In [28]:
# select row
df.loc["New York"]

population    19651127
area            141297
Name: New York, dtype: int64

In [29]:
# select rows by starting index label
df.loc["New York":]

Unnamed: 0,population,area
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [30]:
df.loc["New York":"Florida"]

Unnamed: 0,population,area
New York,19651127,141297
Florida,19552860,170312


##### Select columns using ```loc```

In [31]:
# select columns until 'two' (inclusive end!)
df.loc[:, :"area"]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448195,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [32]:
# select a single column
df.loc[:, "area"]

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [33]:
# select rows and columns
df.loc["New York":, "area"]

New York    141297
Florida     170312
Illinois    149995
Name: area, dtype: int64

In [34]:
# select single value from dataframe
df.loc["Florida", "area"]

170312

#### Select rows and columns by position
mit ```iloc[]```

In [35]:
# select row by integer location of the index
df.iloc[2]

population    19651127
area            141297
Name: New York, dtype: int64

In [36]:
# select rows by starting index integer
df.iloc[2:]

Unnamed: 0,population,area
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [37]:
# select second column (zero-based)
df.iloc[:, 1]

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [38]:
# slicing
df.iloc[2:4,1]

New York    141297
Florida     170312
Name: area, dtype: int64

### ```[]``` vs. ```.loc[]``` vs. ```.iloc[]```

For a DataFrame with uppercase letters as column labels ('A', 'B', 'C') and lowercase letters as row labels ('a', 'b', 'c', 'd') the following operations can be applied for selecting or slicing rows or columns (this table shows when exchanging the ```[]``` method with ```loc``` or ```iloc``` returns the same result):

| Operatation                        | ```[]``` method      | ```loc``` method | ```iloc``` method |
|:-----------------------------------|:---------------------|:-----------------|:------------------|
| Select a single column by label    | ```df['A']```        | ```df.loc[:, 'A']```        | -      |
| Select list of columns by label    | ```df[['A', 'C']]``` | ```df.loc[:, ['A', 'C']]``` | -      |
| Slice columns by label             | -                    | ```df.loc[:, 'A':'C']```    |        |
| Select a single column by position | -                    | -                           | ```df.iloc[:, 1]``` |
| Select list of columns by position | -                    | -                           | ```df.iloc[:, [0, 2]]``` |
| Slice columns by position          | -                    | -                           | ```df.iloc[:, 0:2]``` |
| Select a single row by label       | -                    | ```df.loc['b']```           | - |
| Select a list of rows by label     | -                    | ```df.loc[['b', 'd']]```    | - |
| Slice rows by label                | ```df['b':'d']```*   | ```df.loc['b':'d']```*      | - | 
| Select a single row by position    | -                    | -                           | ```df.iloc[1]```|
| Select a list of rows by position  | -                    | -                           | ```df.iloc[[1, 3]]``` |
| Slice rows by position             | ```df[1:4]```        | -                           | ```df.iloc[1:4]``` | 


\* inclusive end of the selection

Note that you could also combine the selection of rows and columns (for the ```loc``` and ```iloc``` methods but not the ```[]``` method).


#### Beim ändern von Werten in DataFrames:

### **Benutze immer ```loc``` oder ```iloc``` damit sich auch tatsächlich der Original DataFrame ändert**.

See https://stackoverflow.com/a/47098873/6270819

#### Werte in dataframes ändern

In [39]:
df

Unnamed: 0,population,area
California,38332521,423967
Texas,26448195,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [40]:
df.loc[["Florida", "Texas"]] = 0
df

Unnamed: 0,population,area
California,38332521,423967
Texas,0,0
New York,19651127,141297
Florida,0,0
Illinois,12882135,149995


In [41]:
# Set value for a columns
df.loc[:, "population"] = 0
df

Unnamed: 0,population,area
California,0,423967
Texas,0,0
New York,0,141297
Florida,0,0
Illinois,0,149995


In [42]:
df.loc[["California", "New York"], "area"] = 0
df

Unnamed: 0,population,area
California,0,0
Texas,0,0
New York,0,0
Florida,0,0
Illinois,0,149995


In [43]:
population_dict.values()

dict_values([38332521, 26448193, 19651127, 19552860, 12882135])

In [44]:
# reset
df.loc[:, "population"] = population_dict.values()
df.loc[:, "area"] = area_dict.values()
df

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


#### Neue Spalten einfügen

In [45]:
# füge eine neue Spalte ein
df["density"] = df["population"] / df["area"]
df

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [46]:
# Füge boolschen Wert in die Spalte ein
df["high_density"] = df["density"] >= 100
df

Unnamed: 0,population,area,density,high_density
California,38332521,423967,90.413926,False
Texas,26448193,695662,38.01874,False
New York,19651127,141297,139.076746,True
Florida,19552860,170312,114.806121,True
Illinois,12882135,149995,85.883763,False


In [47]:
# einen einzigen Wert in eine Spalte schreiben 
df["country"] = "USA"
df

Unnamed: 0,population,area,density,high_density,country
California,38332521,423967,90.413926,False,USA
Texas,26448193,695662,38.01874,False,USA
New York,19651127,141297,139.076746,True,USA
Florida,19552860,170312,114.806121,True,USA
Illinois,12882135,149995,85.883763,False,USA


#### Lösche Zeilen und Spalten

In [48]:
# lösche Spalte mit drop
df.drop(columns=["country"])

Unnamed: 0,population,area,density,high_density
California,38332521,423967,90.413926,False
Texas,26448193,695662,38.01874,False
New York,19651127,141297,139.076746,True
Florida,19552860,170312,114.806121,True
Illinois,12882135,149995,85.883763,False


In [49]:
# lösche mehrere Spalten
df.drop(columns=["country", "high_density"])

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [50]:
# lösche eine einzelne Zeile
df.drop(index="California")

Unnamed: 0,population,area,density,high_density,country
Texas,26448193,695662,38.01874,False,USA
New York,19651127,141297,139.076746,True,USA
Florida,19552860,170312,114.806121,True,USA
Illinois,12882135,149995,85.883763,False,USA


In [51]:
# lösche mehrere Zeilen
df.drop(index=["California", "Texas"])

Unnamed: 0,population,area,density,high_density,country
New York,19651127,141297,139.076746,True,USA
Florida,19552860,170312,114.806121,True,USA
Illinois,12882135,149995,85.883763,False,USA


#### Boolean Indexing

In [52]:
df

Unnamed: 0,population,area,density,high_density,country
California,38332521,423967,90.413926,False,USA
Texas,26448193,695662,38.01874,False,USA
New York,19651127,141297,139.076746,True,USA
Florida,19552860,170312,114.806121,True,USA
Illinois,12882135,149995,85.883763,False,USA


In [53]:
# Daten nach Bedingung auswählen
df[df["area"] > 400000]

Unnamed: 0,population,area,density,high_density,country
California,38332521,423967,90.413926,False,USA
Texas,26448193,695662,38.01874,False,USA


In [54]:
# äquivalent zu loc. loc access ist bevorzugt
df.loc[df["area"] > 400000]

Unnamed: 0,population,area,density,high_density,country
California,38332521,423967,90.413926,False,USA
Texas,26448193,695662,38.01874,False,USA


In [55]:
# use a list of booleans to access rows
df.loc[[False, True, True, False, False]]

Unnamed: 0,population,area,density,high_density,country
Texas,26448193,695662,38.01874,False,USA
New York,19651127,141297,139.076746,True,USA


In [56]:
# selecting rows that match a more complex criterion
criterion = df["area"] > 400000
criterion

California     True
Texas          True
New York      False
Florida       False
Illinois      False
Name: area, dtype: bool

In [57]:
df[criterion]

Unnamed: 0,population,area,density,high_density,country
California,38332521,423967,90.413926,False,USA
Texas,26448193,695662,38.01874,False,USA


In [58]:
# multiple criteria
# large numbers can be typed with a _
df[criterion & (df["population"] > 30_000_000)]

Unnamed: 0,population,area,density,high_density,country
California,38332521,423967,90.413926,False,USA


# Pandas Übungen

## 1. Slicing DataFrames


In [59]:
population_dict = {
    "California": 38332521,
    "Texas": 26448193,
    "New York": 19651127,
    "Florida": 19552860,
    "Illinois": 12882135,
}
area_dict = {
    "California": 423967,
    "Texas": 695662,
    "New York": 141297,
    "Florida": 170312,
    "Illinois": 149995,
}
df = pd.DataFrame({"population": population_dict, "area": area_dict})
df

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


1. Gebe die area Floridas zurück
2. Wähle alle Zeilen mit einer Dichte < 90
3. Wähle die population für Staaten mit hoher Dichte (>= 100)
4. Wähle alle Staaten mit Dichte > 90 und population >  20000000

## 2. Adding Columns
1. Füge eine Spalte mit den Wahlergebnissen der 2020 Wahl ein (Florida und Texas wählten Trump, die anderen für Biden)
2. Füge eine Spalte mit boolschen Werten ein, welche True ist wenn Biden gewählt würde und 80 < density < 100


In [2]:
import pandas as pd

In [5]:
pd.read_csv("TraceZG_Vario mit Easy.trc", delimiter="  ", header=20)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,1),Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 31,00,00.1,00.2,00.3,00.4,00.5,00.6,00.7,Unnamed: 40
0,,,,,,2),,,,,...,,00,00,00,00,00,0.0,1.0,,
1,,,,,,3),,,,,...,,00,00,00,00,00,0.0,0.0,0.0,
2,,,,,,4),,,,,...,,00,00,00,00,,,,,
3,,,,,,5),,,,,...,,00,00,6B,0E,00,0.0,0.0,0.0,
4,,,,,,6),,,,,...,00,00,00,00,00,00,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4045,,,4047),,,,,,8453.699,1.0,...,,,,,,,,,,
4046,,,4048),,,,,,8454.168,1.0,...,00,00,00,,,,,,,
4047,,,4049),,,,,,8454.637,1.0,...,00,01,,,,,,,,
4048,,,4050),,,,,,8455.149,1.0,...,00,00,00,,,,,,,
