## **Lesson 1: Getting started with pandas**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series: Population by Country Code
country_population = pd.Series(
    data=[61, 46, 11, 65, 10],
    index=["IT", "ES", "GR", "FR", "PO"],
    name="Country Population"
)

data = {"Name": ["Avery Bradley", "John Holland", "Jonas Jerebko",
    "Jordan Mickey", "Terry Rozier", "Jared Sullinger", "Evan Turner"],
    "Team": ["Boston Celtics"] * 7,
    "Number": [0.0, 30.0, 8.0, None, 12.0, 7.0, 11.0],
    "Position": ["PG", "SG", "PF", "PF", "PG", "C", "SG"],
    "Age": [25.0, 27.0, 29.0, 21.0, 22.0, None, 27.0]
}
df_0 = pd.DataFrame(data)


In [3]:
country_population

IT    61
ES    46
GR    11
FR    65
PO    10
Name: Country Population, dtype: int64

In [4]:
df_0 

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [5]:
country_population.index 

Index(['IT', 'ES', 'GR', 'FR', 'PO'], dtype='object')

In [6]:
country_population.values

array([61, 46, 11, 65, 10])

In [7]:
country_population.shape

(5,)

In [8]:
df_0

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [9]:
df_0.index 

RangeIndex(start=0, stop=7, step=1)

In [10]:
df_0.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age'], dtype='object')

In [11]:
df_0.values

array([['Avery Bradley', 'Boston Celtics', 0.0, 'PG', 25.0],
       ['John Holland', 'Boston Celtics', 30.0, 'SG', 27.0],
       ['Jonas Jerebko', 'Boston Celtics', 8.0, 'PF', 29.0],
       ['Jordan Mickey', 'Boston Celtics', nan, 'PF', 21.0],
       ['Terry Rozier', 'Boston Celtics', 12.0, 'PG', 22.0],
       ['Jared Sullinger', 'Boston Celtics', 7.0, 'C', nan],
       ['Evan Turner', 'Boston Celtics', 11.0, 'SG', 27.0]], dtype=object)

In [12]:
df_0.shape

(7, 5)

In [13]:
type(country_population)

pandas.core.series.Series

In [14]:
df_0

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [15]:
df_0.Name

0      Avery Bradley
1       John Holland
2      Jonas Jerebko
3      Jordan Mickey
4       Terry Rozier
5    Jared Sullinger
6        Evan Turner
Name: Name, dtype: object

In [16]:
type(df_0.Name)

pandas.core.series.Series

In [17]:
df_0.iloc[0]

Name         Avery Bradley
Team        Boston Celtics
Number                 0.0
Position                PG
Age                   25.0
Name: 0, dtype: object

In [18]:
type(df_0.iloc[0])

pandas.core.series.Series

In [19]:
# Create a Series from a list
s1 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
# Create a Series from a dictionary
s2 = pd.Series({'Hanoi': 1000, 'HCMC': 2000})

In [20]:
s1 

a    10
b    20
c    30
dtype: int64

In [21]:
s1[2] 

  s1[2]


np.int64(30)

In [22]:
s1['c']

np.int64(30)

In [23]:
data = {'Province': ['Hanoi', 'HCMC', 'Danang'],
        'Population (millions)': [8.05, 8.99, 1.13]}
df = pd.DataFrame(data)
df 

Unnamed: 0,Province,Population (millions)
0,Hanoi,8.05
1,HCMC,8.99
2,Danang,1.13


In [24]:
ind = df.index
type(ind)

pandas.core.indexes.range.RangeIndex

In [25]:
indc = df.columns
type(indc)

pandas.core.indexes.base.Index

In [26]:
ind[0] = 1000  # This will raise an error because index is immutable

TypeError: Index does not support mutable operations

In [None]:
indc  

Index(['Province', 'Population (millions)'], dtype='object')

In [None]:
indc[2] = 'New Column'  # This will raise an error because columns are immutable

TypeError: Index does not support mutable operations

In [None]:
indc 

Index(['Province', 'Population (millions)'], dtype='object')

In [None]:
indc_ex = indc.union({"New Column "})
indc_ex

Index(['New Column ', 'Population (millions)', 'Province'], dtype='object')

In [None]:
df

Unnamed: 0,Province,Population (millions)
0,Hanoi,8.05
1,HCMC,8.99
2,Danang,1.13


In [None]:
df.reindex(index=[1,2,3], fill_value=0)  # Reindexing with fill_value 

Unnamed: 0,Province,Population (millions)
1,HCMC,8.99
2,Danang,1.13
3,0,0.0


In [None]:
df_newc = df.reindex(columns=indc_ex, fill_value=0)  # Reindexing with new columns
df_newc

Unnamed: 0,New Column,Population (millions),Province
0,0,8.05,Hanoi
1,0,8.99,HCMC
2,0,1.13,Danang


In [None]:
df 

Unnamed: 0,Province,Population (millions)
0,Hanoi,8.05
1,HCMC,8.99
2,Danang,1.13


In [None]:
df_newc.drop(columns=['New Column '])  # Dropping the new column

Unnamed: 0,Population (millions),Province
0,8.05,Hanoi
1,8.99,HCMC
2,1.13,Danang


In [None]:
df_newc.drop('New Column ', axis=1)  # Another way to drop the column
df_newc 

Unnamed: 0,New Column,Population (millions),Province
0,0,8.05,Hanoi
1,0,8.99,HCMC
2,0,1.13,Danang


In [None]:
df_newc.drop(columns=['New Column '], inplace=True)  # Dropping the new column
df_newc

Unnamed: 0,Population (millions),Province
0,8.05,Hanoi
1,8.99,HCMC
2,1.13,Danang


In [None]:
df_newc.drop(0)

Unnamed: 0,Population (millions),Province
1,8.99,HCMC
2,1.13,Danang


In [None]:
df_newc 

Unnamed: 0,Population (millions),Province
0,8.05,Hanoi
1,8.99,HCMC
2,1.13,Danang


In [None]:
df_newc.drop(2,inplace=True)  # Dropping the row with index 2
df_newc

Unnamed: 0,Population (millions),Province
0,8.05,Hanoi
1,8.99,HCMC


In [None]:
df 

Unnamed: 0,Province,Population (millions)
0,Hanoi,8.05
1,HCMC,8.99
2,Danang,1.13


In [None]:
df.Province

0     Hanoi
1      HCMC
2    Danang
Name: Province, dtype: object

In [None]:
df.Population (millions)

AttributeError: 'DataFrame' object has no attribute 'Population'

In [None]:
df['Population (millions)']

0    8.05
1    8.99
2    1.13
Name: Population (millions), dtype: float64

In [None]:
df['Province']

0     Hanoi
1      HCMC
2    Danang
Name: Province, dtype: object

In [None]:
df_0 

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_0[['Name', 'Team']]

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,John Holland,Boston Celtics
2,Jonas Jerebko,Boston Celtics
3,Jordan Mickey,Boston Celtics
4,Terry Rozier,Boston Celtics
5,Jared Sullinger,Boston Celtics
6,Evan Turner,Boston Celtics


In [None]:
df_0 

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01 = df_0.set_index('Name')  # Setting 'Name' as the index
df_01

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jared Sullinger,Boston Celtics,7.0,C,
Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01.index

Index(['Avery Bradley', 'John Holland', 'Jonas Jerebko', 'Jordan Mickey',
       'Terry Rozier', 'Jared Sullinger', 'Evan Turner'],
      dtype='object', name='Name')

In [None]:
df_01

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jared Sullinger,Boston Celtics,7.0,C,
Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01.loc['Avery Bradley']  # Accessing a row by index label


Team        Boston Celtics
Number                 0.0
Position                PG
Age                   25.0
Name: Avery Bradley, dtype: object

In [None]:
df_01 

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jared Sullinger,Boston Celtics,7.0,C,
Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01.loc['Avery Bradley':'Jonas Jerebko']

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0


In [None]:
df_01.iloc[0]

Team        Boston Celtics
Number                 0.0
Position                PG
Age                   25.0
Name: Avery Bradley, dtype: object

In [None]:
df_01.iloc[0:3]  # Accessing rows by integer location


Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0


In [None]:
df  

Unnamed: 0,Province,Population (millions)
0,Hanoi,8.05
1,HCMC,8.99
2,Danang,1.13


In [None]:
df.drop(0, inplace=True)  # Dropping the first row
df

Unnamed: 0,Province,Population (millions)
1,HCMC,8.99
2,Danang,1.13


In [None]:
df.loc[1]

Province                 HCMC
Population (millions)    8.99
Name: 1, dtype: object

In [None]:
df.iloc[1]

Province                 Danang
Population (millions)      1.13
Name: 2, dtype: object

In [None]:
df_01 

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jared Sullinger,Boston Celtics,7.0,C,
Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01.loc['John Holland','Number']

np.float64(30.0)

In [None]:
df_01.iloc[1,1]

np.float64(30.0)

In [None]:
df_01.loc['Jordan Mickey':"Terry Rozier",'Number':"Position"]

Unnamed: 0_level_0,Number,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Jordan Mickey,,PF
Terry Rozier,12.0,PG


In [None]:
df_01.iloc[3:5, 1:3]  # Accessing a range of rows and columns by integer location 

Unnamed: 0_level_0,Number,Position
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Jordan Mickey,,PF
Terry Rozier,12.0,PG


In [None]:
df 

Unnamed: 0,Province,Population (millions)
1,HCMC,8.99
2,Danang,1.13


In [None]:
df_0 

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_0["Position"]=='SG'

0    False
1     True
2    False
3    False
4    False
5    False
6     True
Name: Position, dtype: bool

In [None]:
df_0[df_0["Position"]=='SG']

Unnamed: 0,Name,Team,Number,Position,Age
1,John Holland,Boston Celtics,30.0,SG,27.0
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_0[df_0["Position"]!='SG']

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,


In [None]:
df_0[~(df_0["Position"]=='SG')]  # Using the bitwise NOT operator to filter rows

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,


In [None]:
import seaborn as sns 
df_lab0 = sns.load_dataset("titanic").head(10)  # Load the Titanic dataset
df_lab0

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [None]:
drop_columns = ['pclass','alive','embarked','embarked']
df_lab0.drop(columns=drop_columns, inplace=True)  # Dropping specified columns
df_lab0 

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,man,True,,Southampton,False
1,1,female,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False
2,1,female,26.0,0,0,7.925,Third,woman,False,,Southampton,True
3,1,female,35.0,1,0,53.1,First,woman,False,C,Southampton,False
4,0,male,35.0,0,0,8.05,Third,man,True,,Southampton,True
5,0,male,,0,0,8.4583,Third,man,True,,Queenstown,True
6,0,male,54.0,0,0,51.8625,First,man,True,E,Southampton,True
7,0,male,2.0,3,1,21.075,Third,child,False,,Southampton,False
8,1,female,27.0,0,2,11.1333,Third,woman,False,,Southampton,False
9,1,female,14.0,1,0,30.0708,Second,child,False,,Cherbourg,False


In [None]:
mask = df_lab0['sex']=='male' 
df_lab0[mask]  # Filtering rows where

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,man,True,,Southampton,False
4,0,male,35.0,0,0,8.05,Third,man,True,,Southampton,True
5,0,male,,0,0,8.4583,Third,man,True,,Queenstown,True
6,0,male,54.0,0,0,51.8625,First,man,True,E,Southampton,True
7,0,male,2.0,3,1,21.075,Third,child,False,,Southampton,False


In [None]:
df_lab0[~mask]

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
1,1,female,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False
2,1,female,26.0,0,0,7.925,Third,woman,False,,Southampton,True
3,1,female,35.0,1,0,53.1,First,woman,False,C,Southampton,False
8,1,female,27.0,0,2,11.1333,Third,woman,False,,Southampton,False
9,1,female,14.0,1,0,30.0708,Second,child,False,,Cherbourg,False


In [None]:
df_lab0[df_lab0['adult_male']]

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,man,True,,Southampton,False
4,0,male,35.0,0,0,8.05,Third,man,True,,Southampton,True
5,0,male,,0,0,8.4583,Third,man,True,,Queenstown,True
6,0,male,54.0,0,0,51.8625,First,man,True,E,Southampton,True


In [None]:
df_lab0[mask]['survived'].sum()/df_lab0.shape[0]  # Calculating the survival rate for

np.float64(0.0)

In [None]:
df_lab0.shape 

(10, 12)

In [None]:
df_lab0[~mask]['survived'].sum()/df_lab0.shape[0] 

np.float64(0.5)

In [None]:
df_02 = df_01[['Number','Age']]
df_02

Unnamed: 0_level_0,Number,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Avery Bradley,0.0,25.0
John Holland,30.0,27.0
Jonas Jerebko,8.0,29.0
Jordan Mickey,,21.0
Terry Rozier,12.0,22.0
Jared Sullinger,7.0,
Evan Turner,11.0,27.0


In [None]:
s1 = pd.Series([7.3, -2.5, 3.4], index=['a', 'c', 'd'])
s2 = pd.Series([-2.1, 3.6, -1.5], index=['a', 'c', 'e'])
print(s1 + s2)
# Use the add method with fill_value to handle NaN
print(s1.add(s2, fill_value=0))

a    5.2
c    1.1
d    NaN
e    NaN
dtype: float64
a    5.2
c    1.1
d    3.4
e   -1.5
dtype: float64


In [None]:
df_02 + pd.Series(index=['a',"b","MisMatch"], data= [-1,-1,2])

Unnamed: 0_level_0,Age,MisMatch,Number,a,b
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avery Bradley,,,,,
John Holland,,,,,
Jonas Jerebko,,,,,
Jordan Mickey,,,,,
Terry Rozier,,,,,
Jared Sullinger,,,,,
Evan Turner,,,,,


In [None]:
df_02

Unnamed: 0_level_0,Number,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Avery Bradley,0.0,25.0
John Holland,30.0,27.0
Jonas Jerebko,8.0,29.0
Jordan Mickey,,21.0
Terry Rozier,12.0,22.0
Jared Sullinger,7.0,
Evan Turner,11.0,27.0


In [None]:
df 

Unnamed: 0,Province,Population (millions)
1,HCMC,8.99
2,Danang,1.13


In [None]:
df.apply(lambda x: f"{x['Population (millions)'] } (millions)-{x['Province']}", axis = 1)

1      8.99 (millions)-HCMC
2    1.13 (millions)-Danang
dtype: object

In [None]:
df 

Unnamed: 0,Province,Population (millions)
1,HCMC,8.99
2,Danang,1.13


In [None]:
# using a function
format_func = lambda x: f'{x:.3f} million people'
print(df['Population (millions)'].map(format_func))

1    8.990 million people
2    1.130 million people
Name: Population (millions), dtype: object


In [None]:
df 

Unnamed: 0,Province,Population (millions)
1,HCMC,8.99
2,Danang,1.13


In [None]:
df['Province'].map({'Hanoi': 'Hà Nội',
                    'HCMC': 'Thành phố Hồ Chí Minh',
                    'Danang': 'Đà Nẵng'})


1    Thành phố Hồ Chí Minh
2                  Đà Nẵng
Name: Province, dtype: object

In [None]:
data = {'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
        'Score': [88, 76, 95, 65, 70],
        'Days_Off': [2, 5, 1, 8, 6]
}
df_lab1 = pd.DataFrame(data)
df_lab1

Unnamed: 0,Employee,Score,Days_Off
0,Alice,88,2
1,Bob,76,5
2,Charlie,95,1
3,David,65,8
4,Eva,70,6


In [None]:
np.median(df_lab1['Score'].values)

np.float64(76.0)

![image.png](attachment:image.png)
1. Define a function to evaluate performance based on Score and Days_Off

2. Apply the function to each row using .apply()


In [None]:
def evaluate(table):
    if table["Score"] >= 85 and table["Days_Off"] <= 3:
        return "Excellent"
    elif table["Score"] >= 70 and table["Days_Off"] <= 5:
        return "Good"
    elif table["Score"] >= 50:
        return "Average"
    return "Poor"

In [None]:
df_lab1["Performance"] = df_lab1.apply(evaluate, axis = 1)

In [None]:
df_lab1

Unnamed: 0,Employee,Score,Days_Off,Performance
0,Alice,88,2,Excellent
1,Bob,76,5,Good
2,Charlie,95,1,Excellent
3,David,65,8,Average
4,Eva,70,6,Average



3. Define another function that categorizes the Score as "High" or "Low“ (segmentation by median)

4. Apply the second function using .map() to the Score column


In [None]:
def evaluate_score(x):
    if x > np.median(df_lab1['Score'].values):
        return 'High'
    else:
        return 'Low'

In [None]:
df_lab1['Score'].map(evaluate_score)

0    High
1     Low
2    High
3     Low
4     Low
Name: Score, dtype: object

In [None]:
df_01.sort_index()

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
Evan Turner,Boston Celtics,11.0,SG,27.0
Jared Sullinger,Boston Celtics,7.0,C,
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0


In [None]:
df_01.sort_values(by='Age', ascending=False) 

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
John Holland,Boston Celtics,30.0,SG,27.0
Evan Turner,Boston Celtics,11.0,SG,27.0
Avery Bradley,Boston Celtics,0.0,PG,25.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jordan Mickey,Boston Celtics,,PF,21.0
Jared Sullinger,Boston Celtics,7.0,C,


In [None]:
df_01 

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jared Sullinger,Boston Celtics,7.0,C,
Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01.rank()

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,4.0,1.0,4.5,3.0
John Holland,4.0,6.0,6.5,4.5
Jonas Jerebko,4.0,3.0,2.5,6.0
Jordan Mickey,4.0,,2.5,1.0
Terry Rozier,4.0,5.0,4.5,2.0
Jared Sullinger,4.0,2.0,1.0,
Evan Turner,4.0,4.0,6.5,4.5


In [None]:
df_01

Unnamed: 0_level_0,Team,Number,Position,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Avery Bradley,Boston Celtics,0.0,PG,25.0
John Holland,Boston Celtics,30.0,SG,27.0
Jonas Jerebko,Boston Celtics,8.0,PF,29.0
Jordan Mickey,Boston Celtics,,PF,21.0
Terry Rozier,Boston Celtics,12.0,PG,22.0
Jared Sullinger,Boston Celtics,7.0,C,
Evan Turner,Boston Celtics,11.0,SG,27.0


In [None]:
df_01['Number'].rank(method='first')

Name
Avery Bradley      1.0
John Holland       6.0
Jonas Jerebko      3.0
Jordan Mickey      NaN
Terry Rozier       5.0
Jared Sullinger    2.0
Evan Turner        4.0
Name: Number, dtype: float64

: 

In [28]:
pd.Series([1,2,2,2,3])

0    1
1    2
2    2
3    2
4    3
dtype: int64

In [29]:
pd.Series([1,2,2,2,3]).rank()

0    1.0
1    3.0
2    3.0
3    3.0
4    5.0
dtype: float64

In [32]:
pd.Series([1,2,2,2,3]).rank(method='dense')

0    1.0
1    2.0
2    2.0
3    2.0
4    3.0
dtype: float64

In [27]:
pd.Series([1,2,2,2,3]).rank(method='first')

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [33]:
data_num = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])

print(data_num.sum()) # Sum by column

print(data_num.mean(axis=1)) # Mean by row

print(data_num.describe()) # A summary of all key statistics


a    0.466438
b    2.042656
c    0.175207
dtype: float64
0   -0.175336
1    0.405332
2   -0.755733
3    1.491465
4   -0.070962
dtype: float64
              a         b         c
count  5.000000  5.000000  5.000000
mean   0.093288  0.408531  0.035041
std    0.914148  1.175606  0.972250
min   -0.873998 -1.090183 -0.973721
25%   -0.689181 -0.205605 -0.308463
50%   -0.011941  0.338589 -0.303017
75%    0.842992  0.991151  0.137707
max    1.198565  2.008704  1.622701


In [34]:
data_num

Unnamed: 0,a,b,c
0,-0.011941,-0.205605,-0.308463
1,1.198565,0.991151,-0.973721
2,-0.873998,-1.090183,-0.303017
3,0.842992,2.008704,1.622701
4,-0.689181,0.338589,0.137707


In [36]:
data_num['a'].corr(data_num['b'])

np.float64(0.7938773455258838)

In [37]:
data_num.cov()

Unnamed: 0,a,b,c
a,0.835667,0.853162,0.089531
b,0.853162,1.382049,0.665809
c,0.089531,0.665809,0.945271


In [38]:
data_num.corr() 

Unnamed: 0,a,b,c
a,1.0,0.793877,0.100735
b,0.793877,1.0,0.582518
c,0.100735,0.582518,1.0


In [39]:
df_0

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
1,John Holland,Boston Celtics,30.0,SG,27.0
2,Jonas Jerebko,Boston Celtics,8.0,PF,29.0
3,Jordan Mickey,Boston Celtics,,PF,21.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0
5,Jared Sullinger,Boston Celtics,7.0,C,
6,Evan Turner,Boston Celtics,11.0,SG,27.0


In [42]:
df_0['Name'].unique()

array(['Avery Bradley', 'John Holland', 'Jonas Jerebko', 'Jordan Mickey',
       'Terry Rozier', 'Jared Sullinger', 'Evan Turner'], dtype=object)

In [43]:
df_0['Team'].unique()

array(['Boston Celtics'], dtype=object)

In [44]:
df_0['Position'].value_counts()

Position
PG    2
SG    2
PF    2
C     1
Name: count, dtype: int64

In [46]:
df_0[df_0['Position'].isin(['A','B',"PG"])]

Unnamed: 0,Name,Team,Number,Position,Age
0,Avery Bradley,Boston Celtics,0.0,PG,25.0
4,Terry Rozier,Boston Celtics,12.0,PG,22.0


In [49]:
import seaborn as sns

In [52]:
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [55]:
titanic.sort_values(by = "age", ascending= False).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [58]:
suv_rate = titanic["survived"].mean()
print(suv_rate)

0.3838383838383838


In [60]:
rate = titanic['survived'].sum() / titanic.shape[0]

In [61]:
suv_rate - rate

np.float64(0.0)

In [63]:
s3 = pd.Series([1,2,3,np.nan])
s3 

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [64]:
s3.mean()

np.float64(2.0)

In [65]:
s3.sum()/s3.shape[0]

np.float64(1.5)

In [79]:
def to_agegroup(age):
    if np.isnan(age):
        return "unknown"
    elif age <= 16:
        return "child"
    elif age <= 40:
        return "Adult"
    else:
        return "Senior"
    


In [80]:
titanic['age'].apply(to_agegroup)

0        Adult
1        Adult
2        Adult
3        Adult
4        Adult
        ...   
886      Adult
887      Adult
888    unknown
889      Adult
890      Adult
Name: age, Length: 891, dtype: object

In [82]:
nums_col = titanic.select_dtypes('number').columns
nums_col

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [89]:
corre = titanic[nums_col].corr()
corre

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [100]:
corre['survived'].sort_values()


pclass     -0.338481
age        -0.077221
sibsp      -0.035322
parch       0.081629
fare        0.257307
survived    1.000000
Name: survived, dtype: float64

In [104]:
titanic.embarked.value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [107]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [110]:
titanic['FamilySize'] = titanic['sibsp'] + titanic['parch']
titanic['Fare_per_person'] = titanic.apply(lambda x: x['fare']/(x['FamilySize']+1), axis=1)

In [111]:
titanic['Fare_per_person']

0       3.62500
1      35.64165
2       7.92500
3      26.55000
4       8.05000
         ...   
886    13.00000
887    30.00000
888     5.86250
889    30.00000
890     7.75000
Name: Fare_per_person, Length: 891, dtype: float64

In [112]:
titanic.FamilySize.value_counts()

FamilySize
0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: count, dtype: int64