# Data Modification

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random as rd

%matplotlib inline

## A column and an index edition

In [2]:
mammals = pd.read_csv(
    './course-files/course-sources/mammals.csv'
)
mammals.head()

Unnamed: 0,name,body,brain
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


In [3]:
mammals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    62 non-null     object 
 1   body    62 non-null     float64
 2   brain   62 non-null     float64
dtypes: float64(2), object(1)
memory usage: 1.6+ KB


### Rename column by list and dict

In [4]:
mammals.columns

Index(['name', 'body', 'brain'], dtype='object')

In [12]:
mammals.columns = ['name', 'bodyKg', 'brainKg']  # by list
mammals.head()

Unnamed: 0,name,bodyKg,brainKg
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


In [13]:
new_column_names = {'bodyKg': 'body[kg]', 'brainKg': 'brain[kg]'}
new_column_names

{'bodyKg': 'body[kg]', 'brainKg': 'brain[kg]'}

In [14]:
mammals.rename(columns=new_column_names, inplace=True)  # by dict
mammals.head()

Unnamed: 0,name,body[kg],brain[kg]
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


### Rename index

In [15]:
mammals = pd.read_csv(
    './course-files/course-sources/mammals.csv',
    index_col='name'
)
mammals.head()

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Cow,465.0,423.0
Grey wolf,36.33,119.5


In [16]:
mammals.rename(index={'Cow': 'Land Cow'}, inplace=True)
mammals.head()

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Land Cow,465.0,423.0
Grey wolf,36.33,119.5


### Frame copy

In [17]:
mammals_2 = mammals  # IT IS NOT A COPY, it is only an next indicator [wskaźnik] to mammals

In [19]:
mammals_2.rename(index={'Grey wolf': 'Land Cow'}, inplace=True)
mammals_2.head()

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Land Cow,465.0,423.0
Land Cow,36.33,119.5


In [21]:
mammals.head()  # 2 data frames but 1 object of a data frame

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Land Cow,465.0,423.0
Land Cow,36.33,119.5


In [23]:
mammals.rename(index={'Land Cow': 'Cow'}, inplace=True)
mammals_2.head()

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Cow,465.0,423.0
Cow,36.33,119.5


In [42]:
mammals.iloc[4] = mammals.iloc[4].rename(index='Gray wolf')
mammals.iloc[4]

body      36.33
brain    119.50
Name: Cow, dtype: float64

## Data modification

In [None]:
# insurance:
#  * holders: clients [ubezpieczeni]
#  * claims: cases [odszkodowania]

In [43]:
insurance = pd.read_csv(
    './course-files/course-sources/Insurance.csv'
)
insurance.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,197,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,20
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,284,63


In [44]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        64 non-null     int64 
 1   District  64 non-null     int64 
 2   Group     64 non-null     object
 3   Age       64 non-null     object
 4   Holders   64 non-null     int64 
 5   Claims    64 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 3.1+ KB


In [45]:
insurance_copy = insurance.copy()
insurance_copy.loc[2, 'Claims']

20

In [46]:
insurance_copy.loc[2, 'Claims'] = 19
insurance_copy.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,197,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,19
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,284,63


In [47]:
insurance_copy.loc[2, 'Claims'] = insurance_copy.loc[2, 'Claims'] + 1
insurance_copy.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,197,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,20
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,284,63


In [48]:
insurance_copy.loc[2, 'Claims'] += 1
insurance_copy.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,197,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,21
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,284,63


In [56]:
is_younger_25 = insurance_copy['Age'] == '<25'
is_younger_25[is_younger_25 == True].size, is_younger_25.size

(16, 64)

In [57]:
insurance_copy.where(is_younger_25).dropna()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1.0,1.0,<1l,<25,197.0,38.0
4,5.0,1.0,1-1.5l,<25,284.0,63.0
8,9.0,1.0,1.5-2l,<25,133.0,19.0
12,13.0,1.0,>2l,<25,24.0,4.0
16,17.0,2.0,<1l,<25,85.0,22.0
20,21.0,2.0,1-1.5l,<25,149.0,25.0
24,25.0,2.0,1.5-2l,<25,66.0,14.0
28,29.0,2.0,>2l,<25,9.0,4.0
32,33.0,3.0,<1l,<25,35.0,5.0
36,37.0,3.0,1-1.5l,<25,53.0,10.0


In [58]:
insurance_copy[is_younger_25]

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,197,38
4,5,1,1-1.5l,<25,284,63
8,9,1,1.5-2l,<25,133,19
12,13,1,>2l,<25,24,4
16,17,2,<1l,<25,85,22
20,21,2,1-1.5l,<25,149,25
24,25,2,1.5-2l,<25,66,14
28,29,2,>2l,<25,9,4
32,33,3,<1l,<25,35,5
36,37,3,1-1.5l,<25,53,10


### filtering more than 1 rows

In [60]:
insurance_copy.loc[is_younger_25, 'Holders']

0     197
4     284
8     133
12     24
16     85
20    149
24     66
28      9
32     35
36     53
40     24
44      7
48     20
52     31
56     18
60      3
Name: Holders, dtype: int64

In [61]:
insurance_copy.loc[is_younger_25, 'Holders'] + 100

0     297
4     384
8     233
12    124
16    185
20    249
24    166
28    109
32    135
36    153
40    124
44    107
48    120
52    131
56    118
60    103
Name: Holders, dtype: int64

In [62]:
insurance_copy.loc[is_younger_25, 'Holders'] = insurance_copy.loc[is_younger_25, 'Holders'] + 100
insurance_copy.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,297,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,21
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,384,63


### Shorter way to edit

In [65]:
insurance_copy.loc[is_younger_25, 'Holders'] = insurance_copy['Holders'] + 1000
insurance_copy.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,2197,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,21
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,2284,63


### The shortest wat to edit

In [63]:
insurance_copy.loc[is_younger_25, 'Holders'] += 1000
insurance_copy.head()

Unnamed: 0,ID,District,Group,Age,Holders,Claims
0,1,1,<1l,<25,1297,38
1,2,1,<1l,25-29,264,35
2,3,1,<1l,30-35,246,21
3,4,1,<1l,>35,1680,156
4,5,1,1-1.5l,<25,1384,63


## Adding and removing rows

In [67]:
sleep_time = pd.read_csv(
    './course-files/course-sources/sleep_time.csv',
    index_col='ID'
)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,,50.0
2,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,,1.35
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333,9.1,0.00029,0.019
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0


In [68]:
sleep_time.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83 entries, 1 to 83
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          83 non-null     object 
 1   genus         83 non-null     object 
 2   vore          76 non-null     object 
 3   order         83 non-null     object 
 4   conservation  54 non-null     object 
 5   sleep_total   83 non-null     float64
 6   sleep_rem     61 non-null     float64
 7   sleep_cycle   32 non-null     float64
 8   awake         83 non-null     float64
 9   brainwt       56 non-null     float64
 10  bodywt        83 non-null     float64
dtypes: float64(6), object(5)
memory usage: 7.8+ KB


### Column deleting

In [69]:
del sleep_time['bodywt']
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,
2,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333,9.1,0.00029
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423


In [70]:
sleep_time.drop(axis=1, columns=['awake', 'brainwt'], inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,
2,Owl monkey,Aotus,omni,Primates,,17.0,1.8,
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.133333
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667


In [71]:
sleep_time.drop(axis='columns', columns='sleep_cycle', inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total,sleep_rem
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,
2,Owl monkey,Aotus,omni,Primates,,17.0,1.8
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7


In [74]:
# Delault `axis=0`, but if I send only parameter `column=...`, pandas knows that `axis=1`
sleep_time.drop(columns='sleep_rem', inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Cheetah,Acinonyx,carni,Carnivora,lc,12.1
2,Owl monkey,Aotus,omni,Primates,,17.0
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0


### Deleting rows

In [75]:
sleep_time.drop(axis=0, labels=[1,2], inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0
6,Three-toed sloth,Bradypus,herbi,Pilosa,,14.4
7,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7


In [76]:
sleep_time.drop(axis='rows', labels=3, inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0
6,Three-toed sloth,Bradypus,herbi,Pilosa,,14.4
7,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7
8,Vesper mouse,Calomys,,Rodentia,,7.0


In [77]:
sleep_time.drop(labels=4, inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0
6,Three-toed sloth,Bradypus,herbi,Pilosa,,14.4
7,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7
8,Vesper mouse,Calomys,,Rodentia,,7.0
9,Dog,Canis,carni,Carnivora,domesticated,10.1


#### The shortest

In [78]:
sleep_time.drop(5, inplace=True)
sleep_time.head()

Unnamed: 0_level_0,name,genus,vore,order,conservation,sleep_total
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,Three-toed sloth,Bradypus,herbi,Pilosa,,14.4
7,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7
8,Vesper mouse,Calomys,,Rodentia,,7.0
9,Dog,Canis,carni,Carnivora,domesticated,10.1
10,Roe deer,Capreolus,herbi,Artiodactyla,lc,3.0


### Adding rows

In [82]:
# append: a dict with row, have to `ignore_index=True` if dict, doesn't have `inplace=True`
sleep_time = sleep_time.append(
    dict(name='Pola', genus='psiur', vore='omni', order=np.NaN, sleep_total=15),
    ignore_index=True
)

sleep_time.tail()

Unnamed: 0,name,genus,vore,order,conservation,sleep_total
74,Bottle-nosed dolphin,Tursiops,carni,Cetacea,,5.2
75,Genet,Genetta,carni,Carnivora,,6.3
76,Arctic fox,Vulpes,carni,Carnivora,,12.5
77,Red fox,Vulpes,carni,Carnivora,,9.8
78,Pola,psiur,omni,,,15.0


In [85]:
last_3 = sleep_time.iloc[-3:]
last_3

Unnamed: 0,name,genus,vore,order,conservation,sleep_total
76,Arctic fox,Vulpes,carni,Carnivora,,12.5
77,Red fox,Vulpes,carni,Carnivora,,9.8
78,Pola,psiur,omni,,,15.0


In [86]:
sleep_time.append(last_3).tail(10)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total
72,Tenrec,Tenrec,omni,Afrosoricida,,15.6
73,Tree shrew,Tupaia,omni,Scandentia,,8.9
74,Bottle-nosed dolphin,Tursiops,carni,Cetacea,,5.2
75,Genet,Genetta,carni,Carnivora,,6.3
76,Arctic fox,Vulpes,carni,Carnivora,,12.5
77,Red fox,Vulpes,carni,Carnivora,,9.8
78,Pola,psiur,omni,,,15.0
76,Arctic fox,Vulpes,carni,Carnivora,,12.5
77,Red fox,Vulpes,carni,Carnivora,,9.8
78,Pola,psiur,omni,,,15.0


## Index rebuild
set_index(), reset_index()

In [2]:
mammals = pd.read_csv(
    './course-files/course-sources/mammals.csv'
)
mammals.head()

Unnamed: 0,name,body,brain
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


In [3]:
mammals = pd.read_csv(
    './course-files/course-sources/mammals.csv',
    index_col='name'
)
mammals.head()

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Cow,465.0,423.0
Grey wolf,36.33,119.5


In [4]:
mammals = pd.read_csv(
    './course-files/course-sources/mammals.csv',
#     index_col='name'
)
mammals.head()

Unnamed: 0,name,body,brain
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


### Choosing an index from an columns

In [5]:
mammals.set_index('name', inplace=True)
mammals.head()

Unnamed: 0_level_0,body,brain
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,3.385,44.5
Owl monkey,0.48,15.5
Mountain beaver,1.35,8.1
Cow,465.0,423.0
Grey wolf,36.33,119.5


### Reset an index to default

In [7]:
mammals.reset_index(inplace=True)
mammals.head()

Unnamed: 0,name,body,brain
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


### Lost data during changing index

In [8]:
mammals.set_index('brain', inplace=True)
mammals.head()

Unnamed: 0_level_0,name,body
brain,Unnamed: 1_level_1,Unnamed: 2_level_1
44.5,Arctic fox,3.385
15.5,Owl monkey,0.48
8.1,Mountain beaver,1.35
423.0,Cow,465.0
119.5,Grey wolf,36.33


In [9]:
mammals.set_index('name', inplace=True)
mammals.head()

Unnamed: 0_level_0,body
name,Unnamed: 1_level_1
Arctic fox,3.385
Owl monkey,0.48
Mountain beaver,1.35
Cow,465.0
Grey wolf,36.33


#### How to manage it?

In [10]:
mammals = pd.read_csv(
    './course-files/course-sources/mammals.csv',
#     index_col='name'
)
mammals.head()

Unnamed: 0,name,body,brain
0,Arctic fox,3.385,44.5
1,Owl monkey,0.48,15.5
2,Mountain beaver,1.35,8.1
3,Cow,465.0,423.0
4,Grey wolf,36.33,119.5


In [11]:
mammals.set_index('brain', inplace=True)
mammals.head()

Unnamed: 0_level_0,name,body
brain,Unnamed: 1_level_1,Unnamed: 2_level_1
44.5,Arctic fox,3.385
15.5,Owl monkey,0.48
8.1,Mountain beaver,1.35
423.0,Cow,465.0
119.5,Grey wolf,36.33


In [12]:
mammals.reset_index(inplace=True)
mammals.head()

Unnamed: 0,brain,name,body
0,44.5,Arctic fox,3.385
1,15.5,Owl monkey,0.48
2,8.1,Mountain beaver,1.35
3,423.0,Cow,465.0
4,119.5,Grey wolf,36.33


In [13]:
mammals.reset_index().head()  # `reset_index()` add current index to data as a column

Unnamed: 0,index,brain,name,body
0,0,44.5,Arctic fox,3.385
1,1,15.5,Owl monkey,0.480
2,2,8.1,Mountain beaver,1.350
3,3,423.0,Cow,465.000
4,4,119.5,Grey wolf,36.330
...,...,...,...,...
57,57,169.0,Brazilian tapir,160.000
58,58,2.6,Tenrec,0.900
59,59,11.4,Phalanger,1.620
60,60,2.5,Tree shrew,0.104


In [14]:
mammals.set_index('name', inplace=True)
mammals.head()

Unnamed: 0_level_0,brain,body
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Arctic fox,44.5,3.385
Owl monkey,15.5,0.48
Mountain beaver,8.1,1.35
Cow,423.0,465.0
Grey wolf,119.5,36.33


In [15]:
mammals.loc['Cow']

brain    423.0
body     465.0
Name: Cow, dtype: float64

`loc[..]` return row by index value. If we sort the index `loc` works faster.

In [19]:
mammals.sort_index(inplace=True)
mammals.iloc[10:15]

Unnamed: 0_level_0,brain,body
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chinchilla,6.4,0.425
Cow,423.0,465.0
Desert hedgehog,2.4,0.55
Donkey,419.0,187.1
E. American mole,1.2,0.075


In [20]:
mammals['Cat':'Donkey']  # slice if index is sorted

Unnamed: 0_level_0,brain,body
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Cat,25.6,3.3
Chimpanzee,440.0,52.16
Chinchilla,6.4,0.425
Cow,423.0,465.0
Desert hedgehog,2.4,0.55
Donkey,419.0,187.1


In [22]:
mammals.sort_values('brain')['Cat':'Donkey']  # slice if index is not sorted

Unnamed: 0_level_0,brain,body
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Cat,25.6,3.3
Raccoon,39.2,4.288
Arctic fox,44.5,3.385
Red fox,50.4,4.235
Kangaroo,56.0,35.0
Verbet,58.0,4.19
Giant armadillo,81.0,60.0
Roe deer,98.2,14.83
Goat,115.0,27.66
Patas monkey,115.0,10.0


In [28]:
mammals['A':'Dz']  # filtering by index by start_with

Unnamed: 0_level_0,brain,body
name,Unnamed: 1_level_1,Unnamed: 2_level_1
African elephant,5712.0,6654.0
African giant pouched rat,6.6,1.0
Arctic fox,44.5,3.385
Arctic ground squirrel,5.7,0.92
Asian elephant,4603.0,2547.0
Baboon,179.5,10.55
Big brown bat,0.3,0.023
Brazilian tapir,169.0,160.0
Cat,25.6,3.3
Chimpanzee,440.0,52.16


## Operation on text columns
Before work with text an good option is convert whole text to lower or upper case only in your data set.

In [3]:
ptrans = pd.read_csv(
    './course-files/course-sources/PublicTransitExpenses.csv',
    usecols=['Agency', 'Reporter Type', 'Total Operating Expenses']
)
ptrans.head()

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,$122524.00
1,Washington County Commissioners,Reduced Reporter,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,$345789.00


In [4]:
ptrans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Agency                    17844 non-null  object
 1   Reporter Type             17844 non-null  object
 2   Total Operating Expenses  17844 non-null  object
dtypes: object(3)
memory usage: 418.3+ KB


In [5]:
ptrans['Agency'].str.contains('washington')

0        False
1        False
2        False
3        False
4        False
         ...  
17839    False
17840    False
17841    False
17842    False
17843    False
Name: Agency, Length: 17844, dtype: bool

In [6]:
ptrans[ptrans['Agency'].str.contains('washington')]

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses


In [7]:
contains_washington = ptrans['Agency'].str.lower().str.contains('washington')
contains_washington.head()

0     True
1     True
2    False
3    False
4    False
Name: Agency, dtype: bool

In [8]:
ptrans[contains_washington].head()

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,$122524.00
1,Washington County Commissioners,Reduced Reporter,$272715.00
199,Washington State Ferries,Full Reporter,$16215777.00
213,Washington State Ferries,Full Reporter,$11132493.00
364,Washington State Ferries,Full Reporter,$11750411.00


In [42]:
ends_with_ferries = ptrans['Agency'].str.lower().str.rstrip().str.endswith('ferries')
ends_with_ferries.head()

0    False
1    False
2    False
3    False
4    False
Name: Agency, dtype: bool

In [43]:
ptrans[ends_with_ferries].head()

Unnamed: 0,Agency,Reporter Type,Total Operating Expenses
199,Washington State Ferries,Full Reporter,$16215777.00
213,Washington State Ferries,Full Reporter,$11132493.00
364,Washington State Ferries,Full Reporter,$11750411.00
397,Washington State Ferries,Full Reporter,$22400732.00
604,Washington State Ferries,Full Reporter,$17715787.00


In [44]:
ptrans.set_index('Agency', inplace=True)
ptrans.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1
Washington County Commissioners,Reduced Reporter,$122524.00
Washington County Commissioners,Reduced Reporter,$272715.00
"Texoma Area Paratransit System, Inc",Full Reporter,$7295.00
Kalispel Tribe of Indians,Reduced Reporter,$37416.00
Kalispel Tribe of Indians,Reduced Reporter,$345789.00


In [45]:
ptrans.index = ptrans.index.str.strip().str.upper()
ptrans.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00


### Split data to an new column

In [46]:
ptrans['Reporter Type'].str.split(' ').head()

Agency
WASHINGTON COUNTY COMMISSIONERS        [Reduced, Reporter]
WASHINGTON COUNTY COMMISSIONERS        [Reduced, Reporter]
TEXOMA AREA PARATRANSIT SYSTEM, INC       [Full, Reporter]
KALISPEL TRIBE OF INDIANS              [Reduced, Reporter]
KALISPEL TRIBE OF INDIANS              [Reduced, Reporter]
Name: Reporter Type, dtype: object

In [48]:
ptrans['Reporter Type'].str.split(' ').str[0].head()

Agency
WASHINGTON COUNTY COMMISSIONERS        Reduced
WASHINGTON COUNTY COMMISSIONERS        Reduced
TEXOMA AREA PARATRANSIT SYSTEM, INC       Full
KALISPEL TRIBE OF INDIANS              Reduced
KALISPEL TRIBE OF INDIANS              Reduced
Name: Reporter Type, dtype: object

In [49]:
ptrans['Reporter Type'].str.split(' ', expand=True).head()

Unnamed: 0_level_0,0,1
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced,Reporter
WASHINGTON COUNTY COMMISSIONERS,Reduced,Reporter
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full,Reporter
KALISPEL TRIBE OF INDIANS,Reduced,Reporter
KALISPEL TRIBE OF INDIANS,Reduced,Reporter


In [50]:
ptrans[['ReportType1', 'ReportType2']] = ptrans['Reporter Type'].str.split(' ', expand=True)
ptrans.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses,ReportType1,ReportType2
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00,Reduced,Reporter
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00,Reduced,Reporter
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00,Full,Reporter
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00,Reduced,Reporter
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00,Reduced,Reporter


#### How to handle how many splits do?

In [51]:
ptrans['Agency2'] = ptrans.index
ptrans.head()

Unnamed: 0_level_0,Reporter Type,Total Operating Expenses,ReportType1,ReportType2,Agency2
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$122524.00,Reduced,Reporter,WASHINGTON COUNTY COMMISSIONERS
WASHINGTON COUNTY COMMISSIONERS,Reduced Reporter,$272715.00,Reduced,Reporter,WASHINGTON COUNTY COMMISSIONERS
"TEXOMA AREA PARATRANSIT SYSTEM, INC",Full Reporter,$7295.00,Full,Reporter,"TEXOMA AREA PARATRANSIT SYSTEM, INC"
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$37416.00,Reduced,Reporter,KALISPEL TRIBE OF INDIANS
KALISPEL TRIBE OF INDIANS,Reduced Reporter,$345789.00,Reduced,Reporter,KALISPEL TRIBE OF INDIANS


In [52]:
ptrans['Agency2'].str.split(' ', expand=True, n=5).head()  # n0-5 = 6 splits

Unnamed: 0_level_0,0,1,2,3,4,5
Agency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
WASHINGTON COUNTY COMMISSIONERS,WASHINGTON,COUNTY,COMMISSIONERS,,,
WASHINGTON COUNTY COMMISSIONERS,WASHINGTON,COUNTY,COMMISSIONERS,,,
"TEXOMA AREA PARATRANSIT SYSTEM, INC",TEXOMA,AREA,PARATRANSIT,"SYSTEM,",INC,
KALISPEL TRIBE OF INDIANS,KALISPEL,TRIBE,OF,INDIANS,,
KALISPEL TRIBE OF INDIANS,KALISPEL,TRIBE,OF,INDIANS,,


### Custom text base on other column

In [53]:
def get_comment(row):
    reporter_type = row['Reporter Type']
    cost = float(row['Total Operating Expenses'].replace('$',''))
    
    if cost > 200000:
        comment = 'CLASS A'
    else:
        comment = 'CLASS B'
    
    return (reporter_type + '/' + comment)


`apply()`:
* `axis=`:
** 0 or 'index': apply function to each column
** 1 or 'column': apply function to each row

In [54]:
ptrans.apply(get_comment, axis=1).head()

Agency
WASHINGTON COUNTY COMMISSIONERS        Reduced Reporter/CLASS B
WASHINGTON COUNTY COMMISSIONERS        Reduced Reporter/CLASS A
TEXOMA AREA PARATRANSIT SYSTEM, INC       Full Reporter/CLASS B
KALISPEL TRIBE OF INDIANS              Reduced Reporter/CLASS B
KALISPEL TRIBE OF INDIANS              Reduced Reporter/CLASS A
dtype: object

## Preparing data after import

### Step 0: import file, see head, and info about types and memory usage

In [55]:
# For a big file pandas cannot rapidly read small part of file to choose types.
# `low_memody=False` allow for get more memory to check better types.
ptrans = pd.read_csv('./course-files/course-sources/PublicTransitExpenses.csv')
ptrans.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,5 digit NTD ID,4 digit NTD ID,Agency,Reporter Type,Subrecipient Type,Organization Type,2015 Total Mode Vehicles (VOMS),Mode,Type of Service,Primary UZA Code,...,Casualty and Liability Costs,Taxes,PT Funds In Report,PT Funds Reported Separately,Miscellaneous Expenses,Reduced Reporter Total OE,Total Operating Expenses,Total Operating Expenses (No Funds Reported Separately),ADA Related Expenses,Location 1
0,30098,3098,Washington County Commissioners,Reduced Reporter,,,,DR,PT,,...,$0.00,$0.00,$0.00,$0.00,$0.00,$122524.00,$122524.00,$122524.00,$0.00,
1,30098,3098,Washington County Commissioners,Reduced Reporter,,,,MB,PT,,...,$0.00,$0.00,$0.00,$0.00,$0.00,$272715.00,$272715.00,$272715.00,$0.00,
2,60107,6107,"Texoma Area Paratransit System, Inc",Full Reporter,,,,CB,PT,,...,,,$3398.00,,,,$7295.00,$7295.00,,
3,9,0T09,Kalispel Tribe of Indians,Reduced Reporter,,Tribe,4.0,DR,DO,0.0,...,,,,,,$37416.00,$37416.00,$37416.00,,"Usk, WA\n"
4,9,0T09,Kalispel Tribe of Indians,Reduced Reporter,,Tribe,4.0,MB,DO,0.0,...,,,,,,$345789.00,$345789.00,$345789.00,,"Usk, WA\n"


In [56]:
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 39 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   5 digit NTD ID                                           17844 non-null  object 
 1   4 digit NTD ID                                           17719 non-null  object 
 2   Agency                                                   17844 non-null  object 
 3   Reporter Type                                            17844 non-null  object 
 4   Subrecipient Type                                        3072 non-null   object 
 5   Organization Type                                        17759 non-null  object 
 6   2015 Total Mode Vehicles (VOMS)                          17522 non-null  float64
 7   Mode                                                     17844 non-null  object 
 8   Type of Service           

### Step 1: choose important columns, use low_memory if file needs

In [92]:
ptrans = pd.read_csv(
    './course-files/course-sources/PublicTransitExpenses.csv',
    usecols=['Agency', 'Reporter Type', 'Organization Type', 'Rail (Y/N)', 'Fixed Route (Y/N)', 'Service Costs', 'Tires and Tubes', 'Total Operating Expenses', 'Service Area Population'],
    low_memory=False
)
ptrans.head()

Unnamed: 0,Agency,Reporter Type,Organization Type,Service Area Population,Rail (Y/N),Fixed Route (Y/N),Service Costs,Tires and Tubes,Total Operating Expenses
0,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,N,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,Y,,,$345789.00


In [93]:
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Agency                    17844 non-null  object 
 1   Reporter Type             17844 non-null  object 
 2   Organization Type         17759 non-null  object 
 3   Service Area Population   14418 non-null  float64
 4   Rail (Y/N)                17318 non-null  object 
 5   Fixed Route (Y/N)         17318 non-null  object 
 6   Service Costs             10262 non-null  object 
 7   Tires and Tubes           5615 non-null   object 
 8   Total Operating Expenses  17844 non-null  object 
dtypes: float64(1), object(8)
memory usage: 9.5 MB


### Step 2: change columns' names to better for work with

In [94]:
new_columns_names = {
#     'Agency',
    'Reporter Type'          : 'ReporterType',
    'Organization Type'      : 'OrgType',
    'Rail (Y/N)'             : 'isRail',
    'Fixed Route (Y/N)'      : 'isFixedRoute',
    'Service Costs'          : 'ServiceCosts',
    'Tires and Tubes'        : 'TiresTubesCost',
    'Total Operating Expenses':'TotalExpenses',
    'Service Area Population' : 'Population'
}
ptrans.rename(columns=new_columns_names, inplace=True)
ptrans.head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
0,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,N,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,,N,Y,,,$345789.00


### Step 3: columns types conversion

#### category

In [95]:
ptrans['ReporterType'].nunique()

4

In [96]:
ptrans['ReporterType'].value_counts()

Full Reporter       13345
Rural Reporter       3116
Reduced Reporter     1313
Separate Service       70
Name: ReporterType, dtype: int64

In [97]:
ptrans['OrgType'].nunique()

14

In [98]:
ptrans['Agency'].nunique()

2226

In [99]:
len(ptrans)

17844

In [100]:
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Agency          17844 non-null  object 
 1   ReporterType    17844 non-null  object 
 2   OrgType         17759 non-null  object 
 3   Population      14418 non-null  float64
 4   isRail          17318 non-null  object 
 5   isFixedRoute    17318 non-null  object 
 6   ServiceCosts    10262 non-null  object 
 7   TiresTubesCost  5615 non-null   object 
 8   TotalExpenses   17844 non-null  object 
dtypes: float64(1), object(8)
memory usage: 9.5 MB


In [101]:
ptrans['ReporterType'] = ptrans['ReporterType'].astype('category')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  object  
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  object  
 3   Population      14418 non-null  float64 
 4   isRail          17318 non-null  object  
 5   isFixedRoute    17318 non-null  object  
 6   ServiceCosts    10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(1), float64(1), object(7)
memory usage: 8.3 MB


In [102]:
ptrans['Agency'] = ptrans['Agency'].astype('category')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  object  
 3   Population      14418 non-null  float64 
 4   isRail          17318 non-null  object  
 5   isFixedRoute    17318 non-null  object  
 6   ServiceCosts    10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(2), float64(1), object(6)
memory usage: 7.1 MB


In [103]:
ptrans['OrgType'] = ptrans['OrgType'].astype('category')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      14418 non-null  float64 
 4   isRail          17318 non-null  object  
 5   isFixedRoute    17318 non-null  object  
 6   ServiceCosts    10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(3), float64(1), object(5)
memory usage: 5.1 MB


#### an convertion to number types

In [104]:
ptrans['Population'].fillna(0, inplace=True)
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  float64 
 4   isRail          17318 non-null  object  
 5   isFixedRoute    17318 non-null  object  
 6   ServiceCosts    10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(3), float64(1), object(5)
memory usage: 5.1 MB


In [105]:
ptrans['Population'] = ptrans['Population'].astype('int')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int32   
 4   isRail          17318 non-null  object  
 5   isFixedRoute    17318 non-null  object  
 6   ServiceCosts    10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: category(3), int32(1), object(5)
memory usage: 5.1 MB


In [106]:
ptrans.head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
0,Washington County Commissioners,Reduced Reporter,,0,,,$0.00,$0.00,$122524.00
1,Washington County Commissioners,Reduced Reporter,,0,,,$0.00,$0.00,$272715.00
2,"Texoma Area Paratransit System, Inc",Full Reporter,,0,,,,,$7295.00
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0,N,N,,,$37416.00
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0,N,Y,,,$345789.00


In [107]:
ptrans[ptrans['Population']>0].head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
71,Reno-Sparks Indian Colony,Reduced Reporter,Tribe,1127,N,Y,,,$174810.00
92,City of Pocatello,Reduced Reporter,"City, County or Local Government Unit or Depar...",81730,N,Y,,,$1119899.00
93,Lee-Russell Council of Governments,Reduced Reporter,"MPO, COG or Other Planning Agency",193194,N,N,,,$1540633.00
94,"Aiken Area Council on Aging, Inc.",Reduced Reporter,Area Agency on Aging,160099,N,Y,,,$525325.00
101,City of Lawrence,Full Reporter,Consolidated Reporter,87643,N,Y,,,$937407.00


#### an conversion to boolean

In [116]:
ptrans['isRail'].replace(('N', 'Y'), (False, True))[165:170]

165    False
166    False
167      NaN
168    False
169     True
Name: isRail, dtype: object

In [117]:
ptrans['isRail'].replace(('N', 'Y'), (False, True), inplace=True)
ptrans['isFixedRoute'].replace(('N', 'Y'), (False, True), inplace=True)
ptrans.loc[165:170]

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
165,Intercity Transit,Full Reporter,Independent Public Agency or Authority of Tran...,171850,False,True,$96219.00,,$560088.00
166,Skagit Transit,Full Reporter,Independent Public Agency or Authority of Tran...,109198,False,True,$82206.00,$43737.00,$3310336.00
167,Pierce County Transportation Benefit Area Auth...,Full Reporter,Independent Public Agency or Authority of Tran...,557069,,,,,$789253.00
168,Lane Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,302200,False,False,,,$129456.00
169,Central Puget Sound Regional Transit Authority,Full Reporter,Independent Public Agency or Authority of Tran...,2919000,True,True,$3283413.00,,$8361375.00
170,Snohomish County Public Transportation Benefit...,Full Reporter,Independent Public Agency or Authority of Tran...,722268,False,True,$332557.00,,$16037469.00


Decision to change Nan -> False should decide the business

In [118]:
ptrans['isRail'].fillna(False, inplace=True)
ptrans['isFixedRoute'].fillna(False, inplace=True)
ptrans.loc[165:170]

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
165,Intercity Transit,Full Reporter,Independent Public Agency or Authority of Tran...,171850,False,True,$96219.00,,$560088.00
166,Skagit Transit,Full Reporter,Independent Public Agency or Authority of Tran...,109198,False,True,$82206.00,$43737.00,$3310336.00
167,Pierce County Transportation Benefit Area Auth...,Full Reporter,Independent Public Agency or Authority of Tran...,557069,False,False,,,$789253.00
168,Lane Transit District,Full Reporter,Independent Public Agency or Authority of Tran...,302200,False,False,,,$129456.00
169,Central Puget Sound Regional Transit Authority,Full Reporter,Independent Public Agency or Authority of Tran...,2919000,True,True,$3283413.00,,$8361375.00
170,Snohomish County Public Transportation Benefit...,Full Reporter,Independent Public Agency or Authority of Tran...,722268,False,True,$332557.00,,$16037469.00


In [122]:
ptrans['isRail'] = ptrans['isRail'].astype('bool')
ptrans['isFixedRoute'] = ptrans['isRail'].astype('bool')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int32   
 4   isRail          17844 non-null  bool    
 5   isFixedRoute    17844 non-null  bool    
 6   ServiceCosts    10262 non-null  object  
 7   TiresTubesCost  5615 non-null   object  
 8   TotalExpenses   17844 non-null  object  
dtypes: bool(2), category(3), int32(1), object(3)
memory usage: 3.2 MB


In [125]:
ptrans[ptrans['isRail']==True].head()

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
169,Central Puget Sound Regional Transit Authority,Full Reporter,Independent Public Agency or Authority of Tran...,2919000,True,True,$3283413.00,,$8361375.00
176,King County Department of Transportation - Met...,Full Reporter,"City, County or Local Government Unit or Depar...",2117125,True,True,$14764.00,,$1638751.00
182,King County Department of Transportation - Met...,Full Reporter,"City, County or Local Government Unit or Depar...",2117125,True,True,$3402.00,,$168178.00
183,Central Puget Sound Regional Transit Authority,Full Reporter,Independent Public Agency or Authority of Tran...,2919000,True,True,$914064.00,$0.00,$4143179.00
204,City of Seattle - Seattle Center Monorail Transit,Full Reporter,"City, County or Local Government Unit or Depar...",495500,True,True,,,$141966.00


#### Money conversion

In [129]:
# FutureWarning? '$' cannot be replaced ?
ptrans['ServiceCosts'] = ptrans['ServiceCosts'].str.replace('$','')
ptrans['TiresTubesCost'] = ptrans['TiresTubesCost'].str.replace('$','')
ptrans['TotalExpenses'] = ptrans['TotalExpenses'].str.replace('$','')
ptrans.head()

  ptrans['ServiceCosts'] = ptrans['ServiceCosts'].str.replace('$','')
  ptrans['TiresTubesCost'] = ptrans['TiresTubesCost'].str.replace('$','')
  ptrans['TotalExpenses'] = ptrans['TotalExpenses'].str.replace('$','')


Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
0,Washington County Commissioners,Reduced Reporter,,0,False,False,0.0,0.0,122524.0
1,Washington County Commissioners,Reduced Reporter,,0,False,False,0.0,0.0,272715.0
2,"Texoma Area Paratransit System, Inc",Full Reporter,,0,False,False,,,7295.0
3,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0,False,False,,,37416.0
4,Kalispel Tribe of Indians,Reduced Reporter,Tribe,0,False,False,,,345789.0


In [130]:
ptrans['ServiceCosts'] = ptrans['ServiceCosts'].astype('float')
ptrans['TiresTubesCost'] = ptrans['TiresTubesCost'].astype('float')
ptrans['TotalExpenses'] = ptrans['TotalExpenses'].astype('float')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int32   
 4   isRail          17844 non-null  bool    
 5   isFixedRoute    17844 non-null  bool    
 6   ServiceCosts    10262 non-null  float64 
 7   TiresTubesCost  5615 non-null   float64 
 8   TotalExpenses   17844 non-null  float64 
dtypes: bool(2), category(3), float64(3), int32(1)
memory usage: 846.4 KB


### [Optional] Step 4: change case type for literal columns
UPPER, lower, TitleCase

In [131]:
# after change case type change data type again
ptrans['Agency'] = ptrans['Agency'].str.title()  # changes data type to object
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  object  
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int32   
 4   isRail          17844 non-null  bool    
 5   isFixedRoute    17844 non-null  bool    
 6   ServiceCosts    10262 non-null  float64 
 7   TiresTubesCost  5615 non-null   float64 
 8   TotalExpenses   17844 non-null  float64 
dtypes: bool(2), category(2), float64(3), int32(1), object(1)
memory usage: 2.1 MB


In [132]:
ptrans['Agency'] = ptrans['Agency'].str.title().astype('category')
ptrans['ReporterType'] = ptrans['ReporterType'].str.upper().astype('category')
ptrans['OrgType'] = ptrans['OrgType'].str.upper().astype('category')
ptrans.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17844 entries, 0 to 17843
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Agency          17844 non-null  category
 1   ReporterType    17844 non-null  category
 2   OrgType         17759 non-null  category
 3   Population      17844 non-null  int32   
 4   isRail          17844 non-null  bool    
 5   isFixedRoute    17844 non-null  bool    
 6   ServiceCosts    10262 non-null  float64 
 7   TiresTubesCost  5615 non-null   float64 
 8   TotalExpenses   17844 non-null  float64 
dtypes: bool(2), category(3), float64(3), int32(1)
memory usage: 846.1 KB


In [134]:
ptrans.iloc[5:10]

Unnamed: 0,Agency,ReporterType,OrgType,Population,isRail,isFixedRoute,ServiceCosts,TiresTubesCost,TotalExpenses
5,Kalispel Tribe Of Indians,REDUCED REPORTER,TRIBE,0,False,False,,,367998.0
6,"Texoma Area Paratransit System, Inc",FULL REPORTER,,0,False,False,12546.0,0.0,276856.0
7,"Texoma Area Paratransit System, Inc",FULL REPORTER,,0,False,False,,,30584.0
8,"Texoma Area Paratransit System, Inc",FULL REPORTER,,0,False,False,,,124364.0
9,"Texoma Area Paratransit System, Inc",FULL REPORTER,,0,False,False,416441.0,,2237328.0
