# Introduction into Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
s = pd.Series([1,2,3,4,5])
print(s, s.index, s.values)

0    1
1    2
2    3
3    4
4    5
dtype: int64 RangeIndex(start=0, stop=5, step=1) [1 2 3 4 5]


In [3]:
fruits = ['apples', 'oranges', 'cherries', 'pears']
fruits2 = ['raspberries', 'oranges', 'cherries', 'pears']

quantities = [20,33,52,10]
s = pd.Series(quantities, index=fruits)
s2 = pd.Series([10,10,10,10], index=fruits)
s3 = pd.Series([1,1,1,1], index=fruits2)
s + s2 + s3

apples          NaN
cherries       63.0
oranges        44.0
pears          21.0
raspberries     NaN
dtype: float64

In [4]:
s = pd.Series([20,33,52,10], index=['apples', 'oranges', 'cherries', 'pears'] )
print(s['apples'])
print(s[['apples', 'pears']])

20
apples    20
pears     10
dtype: int64


In [5]:
print( s.apply(np.log), "\n" )
print( s.apply( lambda x: x if x>50 else x+10 ) )

apples      2.995732
oranges     3.496508
cherries    3.951244
pears       2.302585
dtype: float64 

apples      30
oranges     43
cherries    52
pears       20
dtype: int64


In [6]:
print( s[s>30], "\n" )
print( "apples" in s )

oranges     33
cherries    52
dtype: int64 

True


In [7]:
cities = {"London":    8615246, 
          "Berlin":    3562166, 
          "Madrid":    3165235, 
          "Rome":      2874038, 
          "Paris":     2273305, 
          "Vienna":    1805681, 
          "Bucharest": 1803425, 
          "Hamburg":   1760433,
          "Budapest":  1754000,
          "Warsaw":    1740119,
          "Barcelona": 1602386,
          "Munich":    1493900,
          "Milan":     1350680}

city_series = pd.Series(cities)
city_series

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
dtype: int64

In [8]:
my_cities = ["London", "Paris", "Zurich", "Berlin", "Stuttgart", "Hamburg"]
my_cities_series = pd.Series(cities, index=my_cities)
print( my_cities_series, "\n" )
print(my_cities_series.isnull(), "\n")
print(my_cities_series.notnull())

London       8615246.0
Paris        2273305.0
Zurich             NaN
Berlin       3562166.0
Stuttgart          NaN
Hamburg      1760433.0
dtype: float64 

London       False
Paris        False
Zurich        True
Berlin       False
Stuttgart     True
Hamburg      False
dtype: bool 

London        True
Paris         True
Zurich       False
Berlin        True
Stuttgart    False
Hamburg       True
dtype: bool


In [9]:
d = {"a":23, "b":45, "c":None, "d":0}
s = pd.Series(d)
s

a    23.0
b    45.0
c     NaN
d     0.0
dtype: float64

In [10]:
print( my_cities_series.dropna() )

London     8615246.0
Paris      2273305.0
Berlin     3562166.0
Hamburg    1760433.0
dtype: float64


In [11]:
print( my_cities_series.fillna(0) )

London       8615246.0
Paris        2273305.0
Zurich             0.0
Berlin       3562166.0
Stuttgart          0.0
Hamburg      1760433.0
dtype: float64


In [12]:
missing_cities = {"Stuttgart":597939, "Zurich":378884}
my_cities_series.fillna(missing_cities)

London       8615246.0
Paris        2273305.0
Zurich        378884.0
Berlin       3562166.0
Stuttgart     597939.0
Hamburg      1760433.0
dtype: float64

In [13]:
cities = {"London":   8615246, 
          "Berlin":   3562166, 
          "Madrid":   3165235, 
          "Rome":     2874038, 
          "Paris":    2273305, 
          "Vienna":   1805681, 
          "Bucharest":1803425, 
          "Hamburg":  1760433,
          "Budapest": 1754000,
          "Warsaw":   1740119,
          "Barcelona":1602386,
          "Munich":   1493900,
          "Milan":    1350680}

my_cities = ["London", "Paris", "Zurich", "Berlin", "Stuttgart", "Hamburg"]
my_cities_series = pd.Series(cities, index=my_cities)
my_cities_series = my_cities_series.fillna(0).astype(int)
my_cities_series

London       8615246
Paris        2273305
Zurich             0
Berlin       3562166
Stuttgart          0
Hamburg      1760433
dtype: int64

# Pandas DataFrame

In [14]:
years = range(2014, 2018)

shop1 = pd.Series([2409.14, 2941.01, 3496.83, 3119.55], index=years)
shop2 = pd.Series([1203.45, 3441.62, 3007.83, 3619.53], index=years)
shop3 = pd.Series([3412.12, 3491.16, 3457.19, 1963.10], index=years)

shops_df = pd.concat([shop1, shop2, shop3], axis=1)
shops_df

Unnamed: 0,0,1,2
2014,2409.14,1203.45,3412.12
2015,2941.01,3441.62,3491.16
2016,3496.83,3007.83,3457.19
2017,3119.55,3619.53,1963.1


In [15]:
cities = ["Zurich", "Winterthur", "Freiburg"]
shops_df.columns = cities

"""
Alternative way
shop1.name = "Zürich"
shop2.name = "Winterthur"
shop3.name = "Freiburg"
"""
print(shop1)
shop1.name = "Zurich"
print(shop1)
shops_df2 = pd.concat([shop1, shop2, shop3], axis=1)
shops_df2

2014    2409.14
2015    2941.01
2016    3496.83
2017    3119.55
dtype: float64
2014    2409.14
2015    2941.01
2016    3496.83
2017    3119.55
Name: Zurich, dtype: float64


Unnamed: 0,Zurich,0,1
2014,2409.14,1203.45,3412.12
2015,2941.01,3441.62,3491.16
2016,3496.83,3007.83,3457.19
2017,3119.55,3619.53,1963.1


In [16]:
cities = {"name": ["London", "Berlin", "Madrid", "Rome", 
                   "Paris", "Vienna", "Bucharest", "Hamburg", 
                   "Budapest", "Warsaw", "Barcelona", 
                   "Munich", "Milan"],
          "population": [8615246, 3562166, 3165235, 2874038,
                         2273305, 1805681, 1803425, 1760433,
                         1754000, 1740119, 1602386, 1493900,
                         1350680],
          "country": ["England", "Germany", "Spain", "Italy",
                      "France", "Austria", "Romania", 
                      "Germany", "Hungary", "Poland", "Spain",
                      "Germany", "Italy"]}

city_frame = pd.DataFrame(cities)
city_frame

Unnamed: 0,name,population,country
0,London,8615246,England
1,Berlin,3562166,Germany
2,Madrid,3165235,Spain
3,Rome,2874038,Italy
4,Paris,2273305,France
5,Vienna,1805681,Austria
6,Bucharest,1803425,Romania
7,Hamburg,1760433,Germany
8,Budapest,1754000,Hungary
9,Warsaw,1740119,Poland


In [17]:
city_frame.columns

Index(['name', 'population', 'country'], dtype='object')

In [18]:
ordinals = ["first", "second", "third", "fourth",
            "fifth", "sixth", "seventh", "eigth",
            "ninth", "tenth", "eleventh", "twelvth",
            "thirteenth"]
city_frame = pd.DataFrame(cities, index=ordinals)
city_frame

Unnamed: 0,name,population,country
first,London,8615246,England
second,Berlin,3562166,Germany
third,Madrid,3165235,Spain
fourth,Rome,2874038,Italy
fifth,Paris,2273305,France
sixth,Vienna,1805681,Austria
seventh,Bucharest,1803425,Romania
eigth,Hamburg,1760433,Germany
ninth,Budapest,1754000,Hungary
tenth,Warsaw,1740119,Poland


In [19]:
city_frame = pd.DataFrame(cities, columns=["name", "country", "population"])
city_frame

Unnamed: 0,name,country,population
0,London,England,8615246
1,Berlin,Germany,3562166
2,Madrid,Spain,3165235
3,Rome,Italy,2874038
4,Paris,France,2273305
5,Vienna,Austria,1805681
6,Bucharest,Romania,1803425
7,Hamburg,Germany,1760433
8,Budapest,Hungary,1754000
9,Warsaw,Poland,1740119


In [20]:
city_frame.reindex(index=[0,2,4,6,8,10,12,1,3,5,7,9,11], columns=["country","name","population"])

Unnamed: 0,country,name,population
0,England,London,8615246
2,Spain,Madrid,3165235
4,France,Paris,2273305
6,Romania,Bucharest,1803425
8,Hungary,Budapest,1754000
10,Spain,Barcelona,1602386
12,Italy,Milan,1350680
1,Germany,Berlin,3562166
3,Italy,Rome,2874038
5,Austria,Vienna,1805681


In [21]:
city_frame.rename(columns={"name":"Soyadi","country":"Ulke","population":"Nufus"}, inplace=True)
city_frame

Unnamed: 0,Soyadi,Ulke,Nufus
0,London,England,8615246
1,Berlin,Germany,3562166
2,Madrid,Spain,3165235
3,Rome,Italy,2874038
4,Paris,France,2273305
5,Vienna,Austria,1805681
6,Bucharest,Romania,1803425
7,Hamburg,Germany,1760433
8,Budapest,Hungary,1754000
9,Warsaw,Poland,1740119


In [22]:
city_frame = pd.DataFrame(cities, columns=["name","population"], index=cities["country"])
city_frame

Unnamed: 0,name,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


In [23]:
city_frame2 = pd.DataFrame(cities).set_index("country")
city_frame2

Unnamed: 0_level_0,name,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


In [24]:
city_frame = pd.DataFrame(cities, columns=("name","population"), index=cities["country"])
city_frame.loc["Germany"]

Unnamed: 0,name,population
Germany,Berlin,3562166
Germany,Hamburg,1760433
Germany,Munich,1493900


In [25]:
city_frame.loc[["Germany", "France"]]

Unnamed: 0,name,population
Germany,Berlin,3562166
Germany,Hamburg,1760433
Germany,Munich,1493900
France,Paris,2273305


In [26]:
city_frame[ city_frame.population > 2000000 ]

Unnamed: 0,name,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305


In [27]:
city_frame.loc[(city_frame.population>1500000) & (city_frame['name'].str.contains("m"))]

Unnamed: 0,name,population
Italy,Rome,2874038
Germany,Hamburg,1760433


In [28]:
city_frame.loc[(city_frame.population>3000000) | (city_frame["name"].str.contains("m"))]

Unnamed: 0,name,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
Germany,Hamburg,1760433


In [29]:
milan = ["Milan", 1399860]
city_frame.iloc[-1] = milan
city_frame.loc["Switzerland"] = ["Zurich", 415215]
city_frame

Unnamed: 0,name,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


In [30]:
city_frame.iloc[[3,2,0,5,0]]

Unnamed: 0,name,population
Italy,Rome,2874038
Spain,Madrid,3165235
England,London,8615246
Austria,Vienna,1805681
England,London,8615246


In [31]:
years = range(2014,2019)
cities = ["Zürich", "Freiburg", "München", "Konstanz", "Saarbrücken"]
shops = pd.DataFrame(index=years)

for city in cities:
    shops.insert(loc=len(shops.columns), column=city, value=(np.random.uniform(0.7, 1, (5)) * 1000).round(2))
print( shops.sum(), "\n" )
print( shops.sum(axis=1) )
shops

Zürich         4356.19
Freiburg       4155.35
München        4426.93
Konstanz       4481.08
Saarbrücken    4321.69
dtype: float64 

2014    4041.96
2015    4472.58
2016    4327.44
2017    4507.62
2018    4391.64
dtype: float64


Unnamed: 0,Zürich,Freiburg,München,Konstanz,Saarbrücken
2014,735.51,768.47,964.09,842.54,731.35
2015,956.1,865.2,947.24,817.56,886.48
2016,764.96,810.87,840.83,971.78,939.0
2017,921.51,958.99,846.59,915.64,864.89
2018,978.11,751.82,828.18,933.56,899.97


In [32]:
s = shops.iloc[:, [0,2,3]]
print( s.sum() )
s

Zürich      4356.19
München     4426.93
Konstanz    4481.08
dtype: float64


Unnamed: 0,Zürich,München,Konstanz
2014,735.51,964.09,842.54
2015,956.1,947.24,817.56
2016,764.96,840.83,971.78
2017,921.51,846.59,915.64
2018,978.11,828.18,933.56


In [33]:
s.cumsum()

Unnamed: 0,Zürich,München,Konstanz
2014,735.51,964.09,842.54
2015,1691.61,1911.33,1660.1
2016,2456.57,2752.16,2631.88
2017,3378.08,3598.75,3547.52
2018,4356.19,4426.93,4481.08


In [34]:
cities = {"name": ["London", "Berlin", "Madrid", "Rome", 
                   "Paris", "Vienna", "Bucharest", "Hamburg", 
                   "Budapest", "Warsaw", "Barcelona", 
                   "Munich", "Milan"],
          "population": [8615246, 3562166, 3165235, 2874038,
                         2273305, 1805681, 1803425, 1760433,
                         1754000, 1740119, 1602386, 1493900,
                         1350680],
          "country": ["England", "Germany", "Spain", "Italy",
                      "France", "Austria", "Romania", 
                      "Germany", "Hungary", "Poland", "Spain",
                      "Germany", "Italy"]}

city_frame = pd.DataFrame(cities, columns=["country", "population", "cum_population"], index=cities["name"])
city_frame["cum_population"] = city_frame["population"].cumsum()
city_frame

Unnamed: 0,country,population,cum_population
London,England,8615246,8615246
Berlin,Germany,3562166,12177412
Madrid,Spain,3165235,15342647
Rome,Italy,2874038,18216685
Paris,France,2273305,20489990
Vienna,Austria,1805681,22295671
Bucharest,Romania,1803425,24099096
Hamburg,Germany,1760433,25859529
Budapest,Hungary,1754000,27613529
Warsaw,Poland,1740119,29353648


In [35]:
print(city_frame["population"], "\n")
print(city_frame.population, "\n")

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
Name: population, dtype: int64 

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
Name: population, dtype: int64 



In [36]:
area = [1572, 891.85, 605.77, 1285, 
        105.4, 414.6, 228, 755, 
        525.2, 517, 101.9, 310.4, 
        181.8]

city_frame["area"] = area
city_frame

Unnamed: 0,country,population,cum_population,area
London,England,8615246,8615246,1572.0
Berlin,Germany,3562166,12177412,891.85
Madrid,Spain,3165235,15342647,605.77
Rome,Italy,2874038,18216685,1285.0
Paris,France,2273305,20489990,105.4
Vienna,Austria,1805681,22295671,414.6
Bucharest,Romania,1803425,24099096,228.0
Hamburg,Germany,1760433,25859529,755.0
Budapest,Hungary,1754000,27613529,525.2
Warsaw,Poland,1740119,29353648,517.0


In [37]:
city_frame.sort_values(by="area", ascending=False)

Unnamed: 0,country,population,cum_population,area
London,England,8615246,8615246,1572.0
Rome,Italy,2874038,18216685,1285.0
Berlin,Germany,3562166,12177412,891.85
Hamburg,Germany,1760433,25859529,755.0
Madrid,Spain,3165235,15342647,605.77
Budapest,Hungary,1754000,27613529,525.2
Warsaw,Poland,1740119,29353648,517.0
Vienna,Austria,1805681,22295671,414.6
Munich,Germany,1493900,32449934,310.4
Bucharest,Romania,1803425,24099096,228.0


In [38]:
city_frame = pd.DataFrame(cities, columns=["country", "area", "population"], index=cities["name"])
some_areas = pd.Series([1572, 755, 181.8], index=["London", "Hamburg", "Milan"])
city_frame["area"] = some_areas
city_frame

Unnamed: 0,country,area,population
London,England,1572.0,8615246
Berlin,Germany,,3562166
Madrid,Spain,,3165235
Rome,Italy,,2874038
Paris,France,,2273305
Vienna,Austria,,1805681
Bucharest,Romania,,1803425
Hamburg,Germany,755.0,1760433
Budapest,Hungary,,1754000
Warsaw,Poland,,1740119


In [39]:
city_frame = pd.DataFrame(cities,
                          columns=["country", 
                                   "population"],
                          index=cities["name"])
city_frame.insert(loc=1, column="area", value=area)
city_frame

Unnamed: 0,country,area,population
London,England,1572.0,8615246
Berlin,Germany,891.85,3562166
Madrid,Spain,605.77,3165235
Rome,Italy,1285.0,2874038
Paris,France,105.4,2273305
Vienna,Austria,414.6,1805681
Bucharest,Romania,228.0,1803425
Hamburg,Germany,755.0,1760433
Budapest,Hungary,525.2,1754000
Warsaw,Poland,517.0,1740119


In [40]:
df = pd.DataFrame(columns=["lib","qty1","qty2"])
for i in range(5):
    df.loc[i] = ["name" + str(i)] + list(np.random.randint(10, size=2))
df

Unnamed: 0,lib,qty1,qty2
0,name0,5,0
1,name1,3,0
2,name2,2,3
3,name3,2,7
4,name4,5,7


In [41]:
growth = {"Switzerland": {"2010": 3.0, "2011": 1.8, "2012": 1.1, "2013": 1.9},
          "Germany": {"2010": 4.1, "2011": 3.6, "2012":	0.4, "2013": 0.1},
          "France": {"2010":2.0,  "2011":2.1, "2012": 0.3, "2013": 0.3},
          "Greece": {"2010":-5.4, "2011":-8.9, "2012":-6.6, "2013":	-3.3},
          "Italy": {"2010":1.7, "2011":	0.6, "2012":-2.3, "2013":-1.9}
          } 
growth_frame = pd.DataFrame(growth)
growth_frame

Unnamed: 0,Switzerland,Germany,France,Greece,Italy
2010,3.0,4.1,2.0,-5.4,1.7
2011,1.8,3.6,2.1,-8.9,0.6
2012,1.1,0.4,0.3,-6.6,-2.3
2013,1.9,0.1,0.3,-3.3,-1.9


In [42]:
growth_frame.T

Unnamed: 0,2010,2011,2012,2013
Switzerland,3.0,1.8,1.1,1.9
Germany,4.1,3.6,0.4,0.1
France,2.0,2.1,0.3,0.3
Greece,-5.4,-8.9,-6.6,-3.3
Italy,1.7,0.6,-2.3,-1.9


In [43]:
growth_frame2 = growth_frame.T.reindex(["Switzerland", "Italy", "Germany", "Greece"])
print(growth_frame2)

             2010  2011  2012  2013
Switzerland   3.0   1.8   1.1   1.9
Italy         1.7   0.6  -2.3  -1.9
Germany       4.1   3.6   0.4   0.1
Greece       -5.4  -8.9  -6.6  -3.3


In [44]:
names = ['Frank', 'Eve', 'Stella', 'Guido', 'Lara']
index = ["January", "February", "March",
         "April", "May", "June",
         "July", "August", "September",
         "October", "November", "December"]
df = pd.DataFrame((np.random.randn(12, 5)*1000).round(2), columns=names, index=index)
df

Unnamed: 0,Frank,Eve,Stella,Guido,Lara
January,206.98,-564.28,163.99,923.64,-614.23
February,-1131.44,116.49,-142.71,-543.57,-1344.22
March,-757.19,-809.97,-849.08,-527.05,-1099.11
April,1016.36,-603.81,-487.85,-1124.96,869.37
May,-1284.18,954.39,276.15,592.81,950.84
June,1295.55,-2885.43,-2606.23,235.36,-1047.4
July,604.56,1458.51,-1815.53,1305.05,3088.53
August,-1129.72,-1051.62,-1363.05,-778.97,836.34
September,-944.05,-807.43,1609.61,-67.41,-291.37
October,-743.88,233.39,1021.16,471.8,-1157.11


# Accessing and Changing values of DataFrames

In [45]:
first = ('Mike', 'Dorothee', 'Tom', 'Bill', 'Pete', 'Kate')
last = ('Meyer', 'Maier', 'Meyer', 'Mayer', 'Meyr', 'Mair')
job = ('data analyst', 'programmer', 'computer scientist', 
       'data scientist', 'accountant', 'psychiatrist')
language = ('Python', 'Perl', 'Java', 'Java', 'Cobol', 'Brainfuck')

df = pd.DataFrame(list(zip(last, job, language)), 
                  columns =['last', 'job', 'language'],
                  index=first) 
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Java
Pete,Meyr,accountant,Cobol
Kate,Mair,psychiatrist,Brainfuck


In [46]:
print(df.loc['Bill', 'job'])
print(df.at['Bill', 'job'])

df.loc['Bill', 'job'] = 'data analyst'
print(df.loc['Bill', 'job'])

df.at['Pete', 'language'] = 'Python'

data scientist
data scientist
data analyst


In [47]:
s = pd.Series([27, 33, 13, 19])
s = s.replace(13, 42)
s

0    27
1    33
2    42
3    19
dtype: int64

In [48]:
s = pd.Series([0, 1, 2, 3, 4])
s.replace([0, 1, 2], 42, inplace=True)
s

0    42
1    42
2    42
3     3
4     4
dtype: int64

In [49]:
import pandas as pd

first = ('Mike', 'Dorothee', 'Tom', 'Bill', 'Pete', 'Kate')
last = ('Meyer', 'Maier', 'Meyer', 'Mayer', 'Meyr', 'Mair')
job = ('data analyst', 'programmer', 'computer scientist', 
       'data scientist', 'programmer', 'psychiatrist')
language = ('Python', 'Perl', 'Java', 'Pithon', 'Pythen', 'Brainfuck')

df = pd.DataFrame(list(zip(last, job, language)), 
                  columns =['last', 'job', 'language'],
                  index=first) 

df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,programmer,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [50]:
df.replace("programmer", "computer scientist", inplace=True)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,computer scientist,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [51]:
df.replace(to_replace=['Mike', 'Tom', 'Perl'],
           value= ['Michael', 'Thomas', 'Python'], 
           inplace=True)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Python
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,computer scientist,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [52]:
df.replace(to_replace=[r'M[ea][iy]e?r', r'P[iy]th[eo]n'],
           value=['Mayer', 'Python'],
           regex=True, 
           inplace=True)
df

Unnamed: 0,last,job,language
Mike,Mayer,data analyst,Python
Dorothee,Mayer,computer scientist,Python
Tom,Mayer,computer scientist,Java
Bill,Mayer,data scientist,Python
Pete,Mayer,computer scientist,Python
Kate,Mayer,psychiatrist,Brainfuck


In [53]:
df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
                   'B': ['foo', 'bar', 'bloo', 'blee', 'bloo'],
                   'C': ['green', 'red', 'blue', 'yellow', 'green']})

df.replace(to_replace={"A": {0: 42, 3: 33}, 'B': {'bloo': 'vloo'}}, inplace=True)
df

Unnamed: 0,A,B,C
0,42,foo,green
1,1,bar,red
2,2,vloo,blue
3,33,blee,yellow
4,4,vloo,green


In [54]:
df = pd.DataFrame({
    'name':['Ben', 'Kate', 'Agnes', 'Ashleigh', 'Tom'],
    'job':['programmer', 'NN', 'NN', 'engineer', 'teacher'],
    'language':['Java', 'Python', 'LN', 'LN', 'C']})

df.replace(to_replace='NN', value=None, method='ffill')

  df.replace(to_replace='NN', value=None, method='ffill')


Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,programmer,Python
2,Agnes,programmer,LN
3,Ashleigh,engineer,LN
4,Tom,teacher,C


In [55]:
df.replace(to_replace=['NN', 'LN'],
           value=None,
           method='ffill')

  df.replace(to_replace=['NN', 'LN'],


Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,programmer,Python
2,Agnes,programmer,Python
3,Ashleigh,engineer,Python
4,Tom,teacher,C


In [56]:
df.replace(['NN', 'LN'], value=None, method='bfill')

  df.replace(['NN', 'LN'], value=None, method='bfill')


Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,engineer,Python
2,Agnes,engineer,C
3,Ashleigh,engineer,C
4,Tom,teacher,C


In [57]:
df.replace('NN', 
           value=None,
           inplace=True,
           method='bfill')
df.replace('LN', 
           value=None,
           inplace=True,
           method='ffill')
df

  df.replace('NN',
  df.replace('LN',


Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,engineer,Python
2,Agnes,engineer,Python
3,Ashleigh,engineer,Python
4,Tom,teacher,C


In [58]:
df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
                  index=['cobra', 'viper', 'sidewinder'],
                  columns=['max_speed', 'shield'])

df.loc[['viper', 'sidewinder']]
df.loc['cobra':'viper', 'max_speed']

cobra    1
viper    4
Name: max_speed, dtype: int64

In [59]:
df.loc[lambda df: df['shield'] == 8]

Unnamed: 0,max_speed,shield
sidewinder,7,8


In [60]:
df.loc[['viper', 'sidewinder'], ['shield']] = 50
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,50
sidewinder,7,50


In [62]:
df.loc[df['shield'] > 35] = 0
df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,0,0
sidewinder,0,0


In [76]:
url = "https://raw.githubusercontent.com/TomasBeuzen/toy-datasets/master/wine_1.csv"
a = pd.read_csv(url, index_col=0)
print(a)
a.info()

               Grape     Origin  Alcohol    pH Colour   Aroma
Bottle                                                       
1         Chardonnay  Australia    14.23  3.51  White  Floral
2       Pinot Grigio      Italy    13.20  3.30  White  Fruity
3        Pinot Blanc     France    13.16  3.16  White  Citrus
4             Shiraz      Chile    14.91  3.39    Red   Berry
5             Malbec  Argentina    13.83  3.28    Red  Fruity
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Grape    5 non-null      object 
 1   Origin   5 non-null      object 
 2   Alcohol  5 non-null      float64
 3   pH       5 non-null      float64
 4   Colour   5 non-null      object 
 5   Aroma    5 non-null      object 
dtypes: float64(2), object(4)
memory usage: 280.0+ bytes
