- What is a dataframe (pandas)
- create df
- vectorized operations
- access rows, columns, cells. (.loc, .iloc, .at)
- change values
- sort values.

In [1]:
import pandas as pd

In [3]:
from typing import List
data_list: List[int] = [1,2,3,4,5]

s1 = pd.Series(data_list)
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
s1 = pd.Series(data=data_list, index=["a", "b", "c", "d", "e"])
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [6]:
s1["a"]

np.int64(1)

In [7]:
s1 = s1 + 23
s1

a    24
b    25
c    26
d    27
e    28
dtype: int64

In [8]:
type(s1)

pandas.core.series.Series

In [10]:
# DF = collection of Series objects

# dictionary creation
data = {
    "name": ["Nate", "Rebecca", "Edwin", "Preston"],
    "age": [39, 40, 11, 7],
    "year": ["Senior", "Junior", "Sophomore", "Freshman"],
}
students_df = pd.DataFrame(data=data)
students_df = students_df.set_index("name")
students_df

Unnamed: 0_level_0,age,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Nate,39,Senior
Rebecca,40,Junior
Edwin,11,Sophomore
Preston,7,Freshman


In [11]:
students_df = students_df.reset_index()
students_df

Unnamed: 0,name,age,year
0,Nate,39,Senior
1,Rebecca,40,Junior
2,Edwin,11,Sophomore
3,Preston,7,Freshman


In [20]:
# Row-based creation
import random

names = ["Messi", "Ronaldo", "Neymar", "Pulisc", "Mbappe"]
goals_per_game = [random.random() for _ in names]
games = [round(random.random() * 1000) for _ in names]

cols = ["name", "goals/game", "games"]
stats = [[names[i], goals_per_game[i], games[i]] for i in range(5)]

stats_df = pd.DataFrame(data=stats, columns=cols)
stats_df

Unnamed: 0,name,goals/game,games
0,Messi,0.42452,276
1,Ronaldo,0.621889,196
2,Neymar,0.488137,279
3,Pulisc,0.084806,922
4,Mbappe,0.056769,819


In [21]:
stats_df["new_col"] = 0
stats_df

Unnamed: 0,name,goals/game,games,new_col
0,Messi,0.42452,276,0
1,Ronaldo,0.621889,196,0
2,Neymar,0.488137,279,0
3,Pulisc,0.084806,922,0
4,Mbappe,0.056769,819,0


In [22]:
stats_df = stats_df.drop(columns=["new_col"])
stats_df

Unnamed: 0,name,goals/game,games
0,Messi,0.42452,276
1,Ronaldo,0.621889,196
2,Neymar,0.488137,279
3,Pulisc,0.084806,922
4,Mbappe,0.056769,819


In [23]:
stats_df['goals'] = stats_df["goals/game"] * stats_df["games"]
stats_df

Unnamed: 0,name,goals/game,games,goals
0,Messi,0.42452,276,117.167644
1,Ronaldo,0.621889,196,121.890302
2,Neymar,0.488137,279,136.190266
3,Pulisc,0.084806,922,78.19142
4,Mbappe,0.056769,819,46.494168


In [24]:
stats_df = stats_df.rename(columns={"goals/game": "goals_p_game"})
# stats_df.rename(columns={"goals/game": "goals_p_game"}, inplace=True)
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,Messi,0.42452,276,117.167644
1,Ronaldo,0.621889,196,121.890302
2,Neymar,0.488137,279,136.190266
3,Pulisc,0.084806,922,78.19142
4,Mbappe,0.056769,819,46.494168


In [25]:
stats_df["name"]

0      Messi
1    Ronaldo
2     Neymar
3     Pulisc
4     Mbappe
Name: name, dtype: object

In [26]:
stats_df["name"] = stats_df["name"].str.lower()
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,messi,0.42452,276,117.167644
1,ronaldo,0.621889,196,121.890302
2,neymar,0.488137,279,136.190266
3,pulisc,0.084806,922,78.19142
4,mbappe,0.056769,819,46.494168


In [27]:
stats_df[["name", "games"]]

Unnamed: 0,name,games
0,messi,276
1,ronaldo,196
2,neymar,279
3,pulisc,922
4,mbappe,819


In [None]:
# filtering with .loc

stats_df.loc[0,"name"]


'messi'

In [30]:
stats_df.loc[0]

name                 messi
goals_p_game       0.42452
games                  276
goals           117.167644
Name: 0, dtype: object

In [31]:
stats_df.loc[:, "name"]

0      messi
1    ronaldo
2     neymar
3     pulisc
4     mbappe
Name: name, dtype: object

In [None]:
stats_df.loc[1:2, "name"] # in df, second item after colon is inclusive

1    ronaldo
2     neymar
Name: name, dtype: object

In [None]:
l = [1,2,3,4,5]
l[1:2] # in a list the second item is exclusive

[2]

In [37]:
stats_df.loc[:,"name":"goals"]

Unnamed: 0,name,goals_p_game,games,goals
0,messi,0.42452,276,117.167644
1,ronaldo,0.621889,196,121.890302
2,neymar,0.488137,279,136.190266
3,pulisc,0.084806,922,78.19142
4,mbappe,0.056769,819,46.494168


In [39]:
# boolean logic with .loc

stats_df.loc[stats_df["goals_p_game"] > 0.5]

Unnamed: 0,name,goals_p_game,games,goals
1,ronaldo,0.621889,196,121.890302


In [40]:
stats_df.loc[stats_df["goals_p_game"] > 0.5, "name"]

1    ronaldo
Name: name, dtype: object

In [41]:
stats_df.loc[stats_df["name"] == "messi", "name"] = "ronaldo"
stats_df.loc[stats_df["goals_p_game"] > 0.5, "name"] = "messi"
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,ronaldo,0.42452,276,117.167644
1,messi,0.621889,196,121.890302
2,neymar,0.488137,279,136.190266
3,pulisc,0.084806,922,78.19142
4,mbappe,0.056769,819,46.494168


In [None]:
# .iloc , .at, .iat