# Practicing with pandas

Based on __Chen D.Y. (2018) Pandas for Everyone. Python Data Analysis.__

Note: Abalone dataset, describing physical characteristics of abalone shells, downloaded from the UCI Machine Learning Library at http://archive.ics.uci.edu/ml/datasets/Abalone 

### Chapter 1 Practice

In [1]:
#for operating system interface - working on a Windows 10 machine
import os

In [2]:
#Does the data file exist?
os.path.isfile('gapminder.tsv')

True

In [3]:
#load pandas library
import pandas as pd

In [4]:
#load the data (tab-separated file) into a panda dataframe
df = pd.read_csv('gapminder.tsv', sep='\t')

#### Looking at the initial data frame

In [5]:
#check load results by looking at the first five rows
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [6]:
#check the df object type 
type(df)

pandas.core.frame.DataFrame

In [7]:
#check the number of columns and rows 
df.shape

(1704, 6)

In [8]:
#display the column names
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [9]:
#display the data types for each column
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [10]:
#display detailed information about the df data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


#### Subsetting the data

In [11]:
# Selecting a single column ('country') and saving it as a series
country_df = df['country']
#Display the first five rows
country_df.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [12]:
# Select a subset of columns
subset = df[['country', 'continent', 'year']]
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [13]:
#Alternative way - pass a list of columns
#same result
sub_columns = ['country', 'continent', 'year']
sub_df = df[sub_columns]
sub_df.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [14]:
#Select a row by index label
df.loc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

In [15]:
#Select a row by row number
df.iloc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

In [16]:
#copy the data frame and assign it a different index - year
year_df = df.copy()
year_df.set_index('year', inplace=True)

In [17]:
#Check results
year_df.head()

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,Afghanistan,Asia,28.801,8425333,779.445314
1957,Afghanistan,Asia,30.332,9240934,820.85303
1962,Afghanistan,Asia,31.997,10267083,853.10071
1967,Afghanistan,Asia,34.02,11537966,836.197138
1972,Afghanistan,Asia,36.088,13079460,739.981106


In [18]:
#Chose rows using the row label and 
#display the first five results
year_df.loc[1952].head()

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,Afghanistan,Asia,28.801,8425333,779.445314
1952,Albania,Europe,55.23,1282697,1601.056136
1952,Algeria,Africa,43.077,9279525,2449.008185
1952,Angola,Africa,30.015,4232095,3520.610273
1952,Argentina,Americas,62.485,17876956,5911.315053


In [19]:
#Choose a row using a row index
year_df.iloc[0]

country      Afghanistan
continent           Asia
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 1952, dtype: object

In [20]:
#Selecting multiple rows
year_df.iloc[[0, 10, 100]]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,Afghanistan,Asia,28.801,8425333,779.445314
2002,Afghanistan,Asia,42.129,25268405,726.734055
1972,Bangladesh,Asia,45.252,70759295,630.233627


In [21]:
#subsetting rows using a range
index_range = list(range(0, 10))
year_df.iloc[index_range]

Unnamed: 0_level_0,country,continent,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,Afghanistan,Asia,28.801,8425333,779.445314
1957,Afghanistan,Asia,30.332,9240934,820.85303
1962,Afghanistan,Asia,31.997,10267083,853.10071
1967,Afghanistan,Asia,34.02,11537966,836.197138
1972,Afghanistan,Asia,36.088,13079460,739.981106
1977,Afghanistan,Asia,38.438,14880372,786.11336
1982,Afghanistan,Asia,39.854,12881816,978.011439
1987,Afghanistan,Asia,40.822,13867957,852.395945
1992,Afghanistan,Asia,41.674,16317921,649.341395
1997,Afghanistan,Asia,41.763,22227415,635.341351


__Slicing columns__

In [23]:
#display index_range content
index_range

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [24]:
#first five elements
index_range[0:5]

[0, 1, 2, 3, 4]

In [25]:
#all elements
index_range[:]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [26]:
# Slicing two-dimentional objects
# display all columns for the labels 0 through 4
df.loc[0:4, :]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [27]:
# Select all rows for the columns year and pop
# and display first 5
df.loc[:, ['year', 'pop']].head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


#### Filtering and aggregation

In [28]:
#Find average life expectancy for eeach year
df.groupby('year')['lifeExp'].mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [29]:
#Group by year and then by continent
#calculate means for lifeExp and gdpPercap and 
#display first 20 results
df.groupby(['year', 'continent']) \
[['lifeExp', 'gdpPercap']]. \
mean().head(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


In [30]:
#Flatten out the results
df.groupby(['year', 'continent']) \
[['lifeExp', 'gdpPercap']]. \
mean(). \
reset_index(). \
head()

Unnamed: 0,year,continent,lifeExp,gdpPercap
0,1952,Africa,39.1355,1252.572466
1,1952,Americas,53.27984,4079.062552
2,1952,Asia,46.314394,5195.484004
3,1952,Europe,64.4085,5661.057435
4,1952,Oceania,69.255,10298.08565


In [31]:
#Count unique records - number of countries on each continent
df.groupby('continent')['country'].nunique()

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

### Chapter 2 Practice

In [32]:
#Creating a list
names=["Bob", "Cheryl", "Dave", "Jenny", "Waldo"]
names[1]

'Cheryl'

In [33]:
#Creating a dictionary
staff = {'QA':"Bob", 'Manager':"Cheryl", 'Dev':"Dave", 'Marketing':"Jenny", 'HR':"Waldo"}
staff['HR']

'Waldo'

#### Pandas series and dataframes

In [34]:
#creating a pandas Series object
name_series = pd.Series(["Bob", "Cheryl", "Dave", "Jenny", "Waldo"])
name_series

0       Bob
1    Cheryl
2      Dave
3     Jenny
4     Waldo
dtype: object

In [35]:
name_series[1]

'Cheryl'

In [36]:
#Creating a pandas dataframe object

staff_df = pd.DataFrame({'QA':["Bob"], 'Manager':["Cheryl"], "Dev":["Dave"], "Marketing":["Jenny"], 'HR':["Waldo"]})
staff_df

Unnamed: 0,QA,Manager,Dev,Marketing,HR
0,Bob,Cheryl,Dave,Jenny,Waldo


In [37]:
#passing multiple values
staff_df = pd.DataFrame({'QA':["Bob", "Gina"], 'Manager':["Cheryl", "Tom"], "Dev":["Dave", "Susan"], "Marketing":["Jenny", "Richard"], 'HR':["Waldo", "Mario"]})
staff_df

Unnamed: 0,QA,Manager,Dev,Marketing,HR
0,Bob,Cheryl,Dave,Jenny,Waldo
1,Gina,Tom,Susan,Richard,Mario


#### Boolean operations

In [38]:
#Load the data file previousely downloaded from https://github.com/chendaniely/pandas_for_everyone/archive/master.zip
scientists = pd.read_csv("scientists.csv")
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [39]:
#extract Age column and save it as a series
ages = scientists['Age']
ages.head()

0    37
1    61
2    90
3    66
4    56
Name: Age, dtype: int64

In [40]:
#check type
type(ages)

pandas.core.series.Series

In [41]:
#calculate mean
ages.mean()

59.125

In [42]:
#apply a boolean operator
# compare each value to the mean age
ages > ages.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [43]:
ages[ages > ages.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [44]:
#use boolean output as an input for a list constructor, store result in a variable
golden_years=list(ages>ages.mean())
golden_years #check results

[False, True, True, True, False, False, False, True]

In [45]:
#Display only ages > mean using boolean as a filter
ages[golden_years]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [46]:
# performing operation on the entire series
ages +10

0     47
1     71
2    100
3     76
4     66
5     55
6     51
7     87
Name: Age, dtype: int64

In [47]:
ages*2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [48]:
# Operation with Dataframes
scientists['Age'] > scientists['Age'].mean()


0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [49]:
#filter records for the scientist whose age > mean
scientists[scientists['Age'] > scientists['Age'].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [50]:
#chcek type
type(scientists['Age'] > scientists['Age'].mean())

pandas.core.series.Series

In [51]:
lucky_years = scientists['Age'] > scientists['Age'].mean()
lucky_years

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [52]:
scientists[lucky_years]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [53]:
#Scalar operations with dataframes
#Strings get repeated!
#numeric values multiplied
scientists *2

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


#### Changing data

In [54]:
#check data types for all columns
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
Name          8 non-null object
Born          8 non-null object
Died          8 non-null object
Age           8 non-null int64
Occupation    8 non-null object
dtypes: int64(1), object(4)
memory usage: 400.0+ bytes


In [55]:
#create a series born_date for the DOB in dat/time format, not as a string (initial Born column)
born_date = pd.to_datetime(scientists['Born'], format ='%Y-%m-%d')
born_date

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]

In [56]:
#similarly, create a series for the died_date
died_date = pd.to_datetime(scientists['Died'], format = '%Y-%m-%d')
type(died_date)

pandas.core.series.Series

In [57]:
#add these two new series to the data frame as columns using multiple assignment
scientists['born_dt'], scientists['died_dt'] = born_date, died_date
#display resulting data frame
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23


In [58]:
#Check data types for all columns
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
Name          8 non-null object
Born          8 non-null object
Died          8 non-null object
Age           8 non-null int64
Occupation    8 non-null object
born_dt       8 non-null datetime64[ns]
died_dt       8 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 528.0+ bytes


#### Changing data in place

In [59]:
#Temporary change - sort by the date of death
scientists.sort_values(by='died_dt')

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14


In [60]:
# make sure that the data frame itself was not altered
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23


In [61]:
# Change in place - permanently alters the data frame
scientists.sort_values(by='died_dt', inplace=True)
#check results
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14


#### Dropping columns and values

In [62]:
#copy the Age column
scientists['Age copy'] = scientists['Age'].copy()
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,Age copy
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,77
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,45
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,90
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,66
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,61
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,41
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,37
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,56


In [63]:
#Drop the 'Age copy' column, axis = 1 for column
scientists.drop(['Age copy'], inplace=True, axis=1)
scientists #check results

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14


In [64]:
#set the ages above the mean to null (to be dropped later)
import numpy as np
scientists.loc[scientists['Age'] > scientists['Age'].mean(), 'Age']= np.nan
scientists


Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
7,Johann Gauss,1777-04-30,1855-02-23,,Mathematician,1777-04-30,1855-02-23
5,John Snow,1813-03-15,1858-06-16,45.0,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,,Chemist,1867-11-07,1934-07-04
1,William Gosset,1876-06-13,1937-10-16,,Statistician,1876-06-13,1937-10-16
6,Alan Turing,1912-06-23,1954-06-07,41.0,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37.0,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56.0,Biologist,1907-05-27,1964-04-14


In [65]:
#temporarily fdrop rowas concaining null values
scientists.dropna() 

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
5,John Snow,1813-03-15,1858-06-16,45.0,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41.0,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37.0,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56.0,Biologist,1907-05-27,1964-04-14


In [66]:
#Recreate missing ages using dataes of birth and death([Y] use year only)
scientists['Age'].fillna(value=(scientists['died_dt']-scientists['born_dt']).astype('timedelta64[Y]'), inplace=True)
#Check results
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
7,Johann Gauss,1777-04-30,1855-02-23,77.0,Mathematician,1777-04-30,1855-02-23
5,John Snow,1813-03-15,1858-06-16,45.0,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,90.0,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66.0,Chemist,1867-11-07,1934-07-04
1,William Gosset,1876-06-13,1937-10-16,61.0,Statistician,1876-06-13,1937-10-16
6,Alan Turing,1912-06-23,1954-06-07,41.0,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37.0,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56.0,Biologist,1907-05-27,1964-04-14


#### Saving and retrieving

In [67]:
#Save the data frame in HDF (hierarchila data format)
scientists.to_hdf("scientists.h5", key='scientists', mode='w')
#read the hdf file back into a different data frame to check the results
new_sci = pd.read_hdf('scientists.h5', key='scientists')
new_sci

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
7,Johann Gauss,1777-04-30,1855-02-23,77.0,Mathematician,1777-04-30,1855-02-23
5,John Snow,1813-03-15,1858-06-16,45.0,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,90.0,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66.0,Chemist,1867-11-07,1934-07-04
1,William Gosset,1876-06-13,1937-10-16,61.0,Statistician,1876-06-13,1937-10-16
6,Alan Turing,1912-06-23,1954-06-07,41.0,Computer Scientist,1912-06-23,1954-06-07
0,Rosaline Franklin,1920-07-25,1958-04-16,37.0,Chemist,1920-07-25,1958-04-16
4,Rachel Carson,1907-05-27,1964-04-14,56.0,Biologist,1907-05-27,1964-04-14


In [68]:
import feather
path = 'scientists.feather'
feather.write_dataframe(scientists, path)

In [69]:
new_sci_2 = feather.read_dataframe(path)
new_sci_2

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Johann Gauss,1777-04-30,1855-02-23,77.0,Mathematician,1777-04-30,1855-02-23
1,John Snow,1813-03-15,1858-06-16,45.0,Physician,1813-03-15,1858-06-16
2,Florence Nightingale,1820-05-12,1910-08-13,90.0,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66.0,Chemist,1867-11-07,1934-07-04
4,William Gosset,1876-06-13,1937-10-16,61.0,Statistician,1876-06-13,1937-10-16
5,Alan Turing,1912-06-23,1954-06-07,41.0,Computer Scientist,1912-06-23,1954-06-07
6,Rosaline Franklin,1920-07-25,1958-04-16,37.0,Chemist,1920-07-25,1958-04-16
7,Rachel Carson,1907-05-27,1964-04-14,56.0,Biologist,1907-05-27,1964-04-14


### Practice using Abalone dataset

In [70]:
#load the data (tab-separated file) into a panda dataframe
df2 = pd.read_csv('abalone.data', header=None)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [71]:
#check object type
type(df2)

pandas.core.frame.DataFrame

In [72]:
#Data frame dimensions (# of rows and columns)
df2.shape

(4177, 9)

In [73]:
#data types for all columns
df2.dtypes

0     object
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8      int64
dtype: object

In [74]:
#info about the data frame
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
0    4177 non-null object
1    4177 non-null float64
2    4177 non-null float64
3    4177 non-null float64
4    4177 non-null float64
5    4177 non-null float64
6    4177 non-null float64
7    4177 non-null float64
8    4177 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [75]:
#add column names
df2.columns = ['sex', 'length', 'diameter', 'height', 'wholeWeight', 'shuckedWeight', 'visceraWeight', 'sehllWeight', 'rings']

In [76]:
#check results
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
sex              4177 non-null object
length           4177 non-null float64
diameter         4177 non-null float64
height           4177 non-null float64
wholeWeight      4177 non-null float64
shuckedWeight    4177 non-null float64
visceraWeight    4177 non-null float64
sehllWeight      4177 non-null float64
rings            4177 non-null int64
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [77]:
# Selecting a single column ('sex') and saving it as a series
sex_df2 = df2['sex']
#Display the first five rows
sex_df2.head()

0    M
1    M
2    F
3    M
4    I
Name: sex, dtype: object

In [78]:
# Select a subset of columns
subset2 = df2[['sex', 'length', 'rings']]
subset2.head()

Unnamed: 0,sex,length,rings
0,M,0.455,15
1,M,0.35,7
2,F,0.53,9
3,M,0.44,10
4,I,0.33,7


In [79]:
#Selecting a subset by passing a list of columns
subset_columns = ['sex', 'length', 'rings']
subset2_2=df2[subset_columns]
subset2_2.head()

Unnamed: 0,sex,length,rings
0,M,0.455,15
1,M,0.35,7
2,F,0.53,9
3,M,0.44,10
4,I,0.33,7


In [80]:
#Select a row by index label
df2.loc[0]

sex                   M
length            0.455
diameter          0.365
height            0.095
wholeWeight       0.514
shuckedWeight    0.2245
visceraWeight     0.101
sehllWeight        0.15
rings                15
Name: 0, dtype: object

In [81]:
#Select a row by row number
df2.iloc[0]

sex                   M
length            0.455
diameter          0.365
height            0.095
wholeWeight       0.514
shuckedWeight    0.2245
visceraWeight     0.101
sehllWeight        0.15
rings                15
Name: 0, dtype: object

In [82]:
#copy the data frame and assign it a different index - rings
df2_rings = df2.copy()
df2_rings.set_index('rings', inplace=True)
#check results
df2_rings.head()

Unnamed: 0_level_0,sex,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,sehllWeight
rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
7,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
9,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
10,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
7,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [83]:
#Choose rows using the row label and 
#display the first five results
df2_rings.loc[10].head()


Unnamed: 0_level_0,sex,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,sehllWeight
rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
10,M,0.43,0.35,0.11,0.406,0.1675,0.081,0.135
10,F,0.535,0.405,0.145,0.6845,0.2725,0.171,0.205
10,F,0.47,0.355,0.1,0.4755,0.1675,0.0805,0.185
10,F,0.44,0.34,0.1,0.451,0.188,0.087,0.13


In [84]:
#Chose a row using a row index
df2_rings.iloc[3]


sex                   M
length             0.44
diameter          0.365
height            0.125
wholeWeight       0.516
shuckedWeight    0.2155
visceraWeight     0.114
sehllWeight       0.155
Name: 10, dtype: object

In [85]:
#Selecting multiple rows
df2_rings.iloc[[0, 10, 100]]


Unnamed: 0_level_0,sex,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,sehllWeight
rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
14,F,0.525,0.38,0.14,0.6065,0.194,0.1475,0.21
7,I,0.36,0.265,0.095,0.2315,0.105,0.046,0.075


In [86]:
#subsetting rows using a range (between 0 and 100 with a step=10)
index_range2 = list(range(0, 100, 10))
df2_rings.iloc[index_range2]


Unnamed: 0_level_0,sex,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,sehllWeight
rings,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
14,F,0.525,0.38,0.14,0.6065,0.194,0.1475,0.21
11,M,0.355,0.28,0.095,0.2455,0.0955,0.062,0.075
10,M,0.58,0.47,0.165,0.9975,0.3935,0.242,0.33
9,F,0.45,0.335,0.105,0.425,0.1865,0.091,0.115
8,I,0.52,0.41,0.12,0.595,0.2385,0.111,0.19
7,M,0.45,0.345,0.105,0.4115,0.18,0.1125,0.135
13,M,0.555,0.425,0.13,0.7665,0.264,0.168,0.275
9,F,0.575,0.445,0.14,0.941,0.3845,0.252,0.285
15,M,0.565,0.425,0.135,0.8115,0.341,0.1675,0.255


In [87]:
#display content of the index_range2
index_range2

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [88]:
#display the first five elements
index_range2[0:5]

[0, 10, 20, 30, 40]

In [89]:
#all elements
index_range2[:]

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [90]:
#slicing two-dimensional object
#display all columns for the labels 15 to 19 (included)
df2[15:20]

Unnamed: 0,sex,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,sehllWeight,rings
15,M,0.5,0.4,0.13,0.6645,0.258,0.133,0.24,12
16,I,0.355,0.28,0.085,0.2905,0.095,0.0395,0.115,7
17,F,0.44,0.34,0.1,0.451,0.188,0.087,0.13,10
18,M,0.365,0.295,0.08,0.2555,0.097,0.043,0.1,7
19,M,0.45,0.32,0.1,0.381,0.1705,0.075,0.115,9


In [91]:
#Select all rows for the columns sex, length and rings
#and display the first five
df2.loc[:, ['sex', 'length', 'rings']].head()

Unnamed: 0,sex,length,rings
0,M,0.455,15
1,M,0.35,7
2,F,0.53,9
3,M,0.44,10
4,I,0.33,7


In [92]:
#display columns sex, length and rings for the labels 15 through 19 
df2.loc[15:19,['sex', 'length', 'rings']]

Unnamed: 0,sex,length,rings
15,M,0.5,12
16,I,0.355,7
17,F,0.44,10
18,M,0.365,7
19,M,0.45,9


In [93]:
#Find average shell length for shells with  different number of rings
df2.groupby('rings')['length'].mean()

rings
1     0.075000
2     0.150000
3     0.176000
4     0.221491
5     0.285739
6     0.369363
7     0.422033
8     0.498776
9     0.546865
10    0.574629
11    0.599374
12    0.589457
13    0.578892
14    0.580198
15    0.575728
16    0.587537
17    0.601034
18    0.596071
19    0.595625
20    0.603654
21    0.618214
22    0.595000
23    0.587222
24    0.695000
25    0.645000
26    0.600000
27    0.607500
29    0.700000
Name: length, dtype: float64

In [94]:
#group by sex, then by the number of rings and calculate means for the length, diameter and height
#Note: abalone shells are classified into male, female and infant
df2.groupby(['sex', 'rings']) \
[['length', 'diameter', 'height']]. \
mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,length,diameter,height
sex,rings,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,5,0.323750,0.241250,0.080000
F,6,0.462812,0.349062,0.116875
F,7,0.467841,0.363864,0.123068
F,8,0.538033,0.418238,0.148279
F,9,0.574601,0.447899,0.151807
F,10,0.582278,0.456512,0.156855
F,11,0.613700,0.482650,0.166050
F,12,0.594922,0.467813,0.163242
F,13,0.581477,0.460682,0.161307
F,14,0.596071,0.475268,0.171250


In [95]:
#flatten the results
df2.groupby(['sex', 'rings']) \
[['length', 'diameter', 'height']]. \
mean().reset_index()

Unnamed: 0,sex,rings,length,diameter,height
0,F,5,0.323750,0.241250,0.080000
1,F,6,0.462812,0.349062,0.116875
2,F,7,0.467841,0.363864,0.123068
3,F,8,0.538033,0.418238,0.148279
4,F,9,0.574601,0.447899,0.151807
5,F,10,0.582278,0.456512,0.156855
6,F,11,0.613700,0.482650,0.166050
7,F,12,0.594922,0.467813,0.163242
8,F,13,0.581477,0.460682,0.161307
9,F,14,0.596071,0.475268,0.171250


In [96]:
#Count a possible number of groups based on different number of rings for each gender (M, F, I)
df2.groupby('sex')['rings'].nunique()

sex
F    23
I    21
M    24
Name: rings, dtype: int64