References:

https://pandas.pydata.org/docs/getting_started/intro_tutorials/01_table_oriented.html#min-tut-01-tableoriented

https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

DataFrame
- is a 2-Dimensional data structure that can store data of different types
- organizes data in rows and columns (much like a table in a database or a sheet in Excel)

In [1]:
# import pandas and give it an alias pd
import pandas as pd

In [2]:
# Build a DataFrame from scratch:
# each dictionary key becomes a column label and each list
# holds that column's values (one entry per row)
passenger_columns = {
    'name': [
        "Braund, Mr. Owen Harris",
        "Allen, Mr. Willian Henry",
        "Bonnel, Miss. Elizabeth",
    ],
    'age': [23, 30, 45],
    'sex': ["male", "male", "female"],
}
df = pd.DataFrame(passenger_columns)

# a bare expression on the last line of a cell displays the DataFrame
df

Unnamed: 0,name,age,sex
0,"Braund, Mr. Owen Harris",23,male
1,"Allen, Mr. Willian Henry",30,male
2,"Bonnel, Miss. Elizabeth",45,female


Series
- each column in a DataFrame is a Series
- a Series has no column label
- a Series is just a single column
- a Series has row labels (its index)
- is a 1-dimensional array
- Series.shape returns only the number of rows, e.g. (3,)

In [34]:
# Select a single column from a DataFrame with
# DataFrame['column_name'].
# Selecting one column by its label returns a Series object.
series_name = df['name']
series_name

0     Braund, Mr. Owen Harris
1    Allen, Mr. Willian Henry
2     Bonnel, Miss. Elizabeth
Name: name, dtype: object

In [35]:
# confirm that a single selected column is a pandas Series
type(series_name)

pandas.core.series.Series

In [36]:
# Series.shape is a 1-tuple: it holds only the number of rows,
# because a Series has a single dimension
series_name.shape

(3,)

In [31]:
# Create a Series from scratch: pass the values as a list
# and label the Series through the name argument
letters = ["A", "B", "C", "D", "E"]
series_from_scratch = pd.Series(letters, name="alphabet")

# display the Series
series_from_scratch

0    A
1    B
2    C
3    D
4    E
Name: alphabet, dtype: object

In [40]:
# Select several columns at once by passing
# a list of column labels to the DataFrame
columns_to_keep = ['name', 'age']
name_age = df[columns_to_keep]

# One selected column is a Series, but a selection of
# multiple columns is itself a DataFrame
type(name_age)

pandas.core.frame.DataFrame

In [38]:
# Inspect the Series that was built from scratch:
# t holds its Python type and s holds its shape tuple
t = type(series_from_scratch)
s = series_from_scratch.shape

# the shape tuple has a single entry because a Series is 1-dimensional
print(f'Type {t} - shape {s}')

Type <class 'pandas.core.series.Series'> - shape (5,)


In [23]:
# Write df to an Excel file named titanic_data.xlsx
# and set the sheet name to passengers.
# NOTE: by default the row index is written as the first column;
# pass index=False to leave it out.
# NOTE(review): assumes the data/ directory already exists — confirm.
df.to_excel('data/titanic_data.xlsx', sheet_name='passengers')

In [22]:
# Write df to a CSV file named titanic_data.csv
# (sep=',' is the default separator, spelled out here for clarity).
# NOTE: by default the row index is written as the first column;
# pass index=False to leave it out.
df.to_csv('data/titanic_data.csv', sep=',')

# Reading CSV file using pandas

In [73]:
# Read the Titanic passenger CSV file into a DataFrame
titanic_passenger_df = pd.read_csv('data/titanic-passenger-data.csv')

# Display the DataFrame (pandas truncates long frames,
# showing only the first and last rows)
titanic_passenger_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Selecting Rows

Using conditional expression

In [45]:
# Boolean mask: True for passengers aged 35 or older
# (rows with a missing Age compare as False)
is_35_or_older = titanic_passenger_df['Age'] >= 35

# Keep only the rows where the mask is True
age_above_35 = titanic_passenger_df[is_35_or_older]
age_above_35

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0000,,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
873,874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0000,,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C


In [48]:
# Build one boolean mask per condition ...
is_35_or_older = titanic_passenger_df['Age'] >= 35
is_male = titanic_passenger_df['Sex'] == 'male'

# ... and combine them with & (element-wise AND) to get
# all male passengers aged 35 and above
age_above_35_male = titanic_passenger_df[is_35_or_older & is_male]
age_above_35_male

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S
20,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0000,,S
30,31,0,1,"Uruchurtu, Don. Manuel E",male,40.0,0,0,PC 17601,27.7208,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.5500,E17,S
860,861,0,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S


In [51]:
# Select all female passengers with isin(list).
# isin() takes a list of accepted values as its argument and
# returns True for every row whose value appears in that list.
is_female = titanic_passenger_df['Sex'].isin(['female'])
all_female_passengers = titanic_passenger_df[is_female]
all_female_passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [55]:
# Select female passengers whose cabin is C85 or B42:
# build one isin() mask per column, then combine them with &
is_female = titanic_passenger_df['Sex'].isin(['female'])
in_cabin_c85_or_b42 = titanic_passenger_df['Cabin'].isin(['C85', 'B42'])

all_female_passengers_c85_b42 = titanic_passenger_df[is_female & in_cabin_c85_or_b42]
all_female_passengers_c85_b42

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S


In [59]:
# passengers with unspecified (na) age
titanic_passenger_df[titanic_passenger_df['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [60]:
# passengers with unspecified (na) cabin
titanic_passenger_df[titanic_passenger_df['Cabin'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


# Selecting Rows and Columns

DataFrame.loc[r, c]
- selects rows (r) and columns (c) of a DataFrame in a single call

In [64]:
# Get the names (and ages) of passengers younger than 20.
# .loc takes the row selector first and the column selector second.
is_under_20 = titanic_passenger_df['Age'] < 20
name_and_age = ['Name', 'Age']

twenty_below_passengers = titanic_passenger_df.loc[is_under_20, name_and_age]
twenty_below_passengers

Unnamed: 0,Name,Age
7,"Palsson, Master. Gosta Leonard",2.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0
10,"Sandstrom, Miss. Marguerite Rut",4.0
14,"Vestrom, Miss. Hulda Amanda Adolfina",14.0
16,"Rice, Master. Eugene",2.0
...,...,...
855,"Aks, Mrs. Sam (Leah Rosen)",18.0
869,"Johnson, Master. Harold Theodor",4.0
875,"Najib, Miss. Adele Kiamie ""Jane""",15.0
877,"Petroff, Mr. Nedelio",19.0


# Using DataFrame.query

In [84]:
# DataFrame.query takes the filter as a string expression in which
# column names are used directly; here it selects female passengers
# aged 18 or younger.
# NOTE(review): the variable name says "twenty below", but the filter
# is Age <= 18 and is restricted to female passengers.
q_twenty_below_passengers = titanic_passenger_df.query('Age <= 18 & Sex =="female"')
q_twenty_below_passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
22,23,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
830,831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15.0,1,0,2659,14.4542,,C
852,853,0,3,"Boulos, Miss. Nourelain",female,9.0,1,1,2678,15.2458,,C
853,854,1,1,"Lines, Miss. Mary Conover",female,16.0,0,1,PC 17592,39.4000,D28,S
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.3500,,S


# Updating Rows

In [93]:
# Boolean mask: True for every passenger whose Cabin value is missing (NaN)
cabin_is_nan = titanic_passenger_df['Cabin'].isna()

# Placeholder written into the missing Cabin cells
# (presumably short for "No Cabin" — confirm).
# The original chained assignment also left this name bound to the plain
# string 'NC', so the name is kept with the same value.
cabin_is_na_to_nc = 'NC'

# Update rows in place: .loc[row_mask, column_label] = value assigns the
# value to the selected column for every row where the mask is True.
# This mutates titanic_passenger_df; running the cell again changes
# nothing further because no missing Cabin values remain.
titanic_passenger_df.loc[cabin_is_nan, 'Cabin'] = cabin_is_na_to_nc

'NC'

In [97]:
titanic_passenger_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,NC,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,NC,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,NC,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,NC,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,NC,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
