In [1]:
import pandas as pd
import numpy as np

In [2]:
# Five evenly spaced integers wrapped in a Series with the default 0..n-1 index.
nums = list(range(10, 60, 10))
ser1 = pd.Series(data=nums)
ser1

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
type(ser1)

pandas.core.series.Series

In [7]:
row_labels = list('ABCDE')  # one single-letter label per row

In [8]:
ser1 = pd.Series(nums, index=row_labels)
ser1

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [10]:
ser1 = pd.Series(nums, index='A B C C C'.split())
ser1

A    10
B    20
C    30
C    40
C    50
dtype: int64

In [11]:
ser1['A']   # indexing works like in a normal dict.

10

In [13]:
ser1['C']

C    30
C    40
C    50
dtype: int64

In [16]:
# A Series may hold mixed Python objects (strings, floats, inf, NaN) and
# its index may contain repeated labels; the resulting dtype is `object`.
dept = ['HR', 'HR', 'Tech', 'Tech', 'Sales']
names = ['Rashmi', 'Sameer', np.pi, np.inf, np.nan]
emp = pd.Series(data=names, index=dept)
emp

HR        Rashmi
HR        Sameer
Tech     3.14159
Tech         inf
Sales        NaN
dtype: object

In [17]:
emp['HR']

HR    Rashmi
HR    Sameer
dtype: object

In [19]:
emp['Tech']

Tech    3.14159
Tech        inf
dtype: object

In [21]:
emp['Sales'] = 9501 # you can also set the values using the indexing!

In [22]:
emp

HR        Rashmi
HR        Sameer
Tech     3.14159
Tech         inf
Sales       9501
dtype: object

In [24]:
emp == 'Rashmi'

HR        True
HR       False
Tech     False
Tech     False
Sales    False
dtype: bool

In [25]:
emp[emp == 'Rashmi']

HR    Rashmi
dtype: object

In [28]:
emp[emp == 'Rashmi'].index[0]

'HR'

In [32]:
emp['Tech'] = [5000, 6000]
emp

HR       Rashmi
HR       Sameer
Tech       5000
Tech       6000
Sales      9501
dtype: object

In [33]:
ser1

A    10
B    20
C    30
C    40
C    50
dtype: int64

In [34]:
ser1.describe()   # quick statistical summary of your Col

count     5.000000
mean     30.000000
std      15.811388
min      10.000000
25%      20.000000
50%      30.000000
75%      40.000000
max      50.000000
dtype: float64

In [38]:
ser1.describe().round(2)

count     5.00
mean     30.00
std      15.81
min      10.00
25%      20.00
50%      30.00
75%      40.00
max      50.00
dtype: float64

In [39]:
ser1.iloc[0]   # positional access: the index holds string labels, so use .iloc instead of ser1[0]

10

In [40]:
ser1.iloc[-1]   # .iloc accepts negative positions, just like numpy arrays and lists

50

# DataFrames

In [50]:
# Seed the legacy global RNG so the 4x5 demo matrix is reproducible, then
# scale the uniform [0, 1) samples to 0..99 and truncate them to int32.
np.random.seed(100)
data = (100 * np.random.rand(4, 5)).astype('int32')
data

array([[54, 27, 42, 84,  0],
       [12, 67, 82, 13, 57],
       [89, 20, 18, 10, 21],
       [97, 81, 17, 81, 27]])

In [57]:
# Wrap the numpy matrix in a DataFrame: one row per country, one column per letter.
col_labels = list('ABCDE')
row_labels = ['India', 'US', 'UK', 'Japan']
df = pd.DataFrame(data=data, index=row_labels, columns=col_labels)
df

Unnamed: 0,A,B,C,D,E
India,54,27,42,84,0
US,12,67,82,13,57
UK,89,20,18,10,21
Japan,97,81,17,81,27


In [58]:
df.shape

(4, 5)

In [48]:
df.size  # number of elements

20

In [49]:
df.to_numpy()   # retrieve all the values as a numpy array (preferred over the older .values attribute)

array([[54, 27, 42, 84,  0],
       [12, 67, 82, 13, 57],
       [89, 20, 18, 10, 21],
       [97, 81, 17, 81, 27]])

In [51]:
df

Unnamed: 0,A,B,C,D,E
India,54,27,42,84,0
US,12,67,82,13,57
UK,89,20,18,10,21
Japan,97,81,17,81,27


In [53]:
df['B']   # fetch the values in the col 'B'

India    27
US       67
UK       20
Japan    81
Name: B, dtype: int32

In [55]:
type(df['B'])

pandas.core.series.Series

In [60]:
df[['D', 'B']]  # provide a list of col labels if you want to fetch multiple columns

Unnamed: 0,D,B
India,84,27
US,13,67
UK,10,20
Japan,81,81


In [61]:
# df[...] looks up COLUMN labels, so passing a row label raises KeyError
# (label lookup with plain [] only works that way on a Series).
# The error is caught and printed so the notebook still runs top to bottom.
try:
    df['India']
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: 'India'

In [62]:
df.loc['India']  # fetch a single row by providing its label (use .loc)

A    54
B    27
C    42
D    84
E     0
Name: India, dtype: int32

In [64]:
type(df.loc['India'])

pandas.core.series.Series

In [65]:
df.loc[['US', 'India']] # list of row labels to be used for fetching multiple rows !!

Unnamed: 0,A,B,C,D,E
US,12,67,82,13,57
India,54,27,42,84,0


In [67]:
df

Unnamed: 0,A,B,C,D,E
India,54,27,42,84,0
US,12,67,82,13,57
UK,89,20,18,10,21
Japan,97,81,17,81,27


In [66]:
df.iloc[0, 2]  # .iloc takes integer (row, col) positions: first row, third column

42

In [68]:
df.iloc[0, 2] = 4200
df

Unnamed: 0,A,B,C,D,E
India,54,27,4200,84,0
US,12,67,82,13,57
UK,89,20,18,10,21
Japan,97,81,17,81,27


In [69]:
df.iloc[:, 2]  # fetch all the rows of the 3rd col

India    4200
US         82
UK         18
Japan      17
Name: C, dtype: int32

In [71]:
df.iloc[-1, 0:3]  # 1st 3 columns of the last row

A    97
B    81
C    17
Name: Japan, dtype: int32

In [73]:
df.drop('E', axis=1)

Unnamed: 0,A,B,C,D
India,54,27,4200,84
US,12,67,82,13
UK,89,20,18,10
Japan,97,81,17,81


In [74]:
df

Unnamed: 0,A,B,C,D,E
India,54,27,4200,84,0
US,12,67,82,13,57
UK,89,20,18,10,21
Japan,97,81,17,81,27


In [75]:
# drop() returns a new frame, so rebind `df` to make the removal stick
# (this replaces the in-place form `inplace=True`).
# errors='ignore' lets the cell be re-run after 'E' is already gone.
df = df.drop(columns='E', errors='ignore')
df

Unnamed: 0,A,B,C,D
India,54,27,4200,84
US,12,67,82,13
UK,89,20,18,10
Japan,97,81,17,81


In [84]:
# Swap the row and column LABELS of the (square, 4x4) frame.
# WARNING: only the labels move. The values are NOT transposed, so each
# number ends up under a different label pair than before. Use df.T when the
# data should move with its labels.
# Re-running this cell swaps the labels back again.
row_names = df.index.values
col_names = df.columns.values
df = pd.DataFrame(df.values, index=col_names, columns=row_names)
df

Unnamed: 0,India,US,UK,Japan
A,54,27,4200,84
B,12,67,82,13
C,89,20,18,10
D,97,81,17,81


In [85]:
# df.rename() or df.set_axis() can also be used to relabel rows & columns (df.rename_axis() only sets the name of the axis itself)

In [86]:
df.T

Unnamed: 0,A,B,C,D
India,54,12,89,97
US,27,67,20,81
UK,4200,82,18,17
Japan,84,13,10,81


In [89]:
df.describe().round(2).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
India,4.0,63.0,38.79,12.0,43.5,71.5,91.0,97.0
US,4.0,48.75,29.85,20.0,25.25,47.0,70.5,81.0
UK,4.0,1079.25,2080.72,17.0,17.75,50.0,1111.5,4200.0
Japan,4.0,47.0,41.03,10.0,12.25,47.0,81.75,84.0


In [90]:
df.info()  # quick info about your dataframe

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   India   4 non-null      int32
 1   US      4 non-null      int32
 2   UK      4 non-null      int32
 3   Japan   4 non-null      int32
dtypes: int32(4)
memory usage: 256.0+ bytes


In [91]:
# NaN cannot be stored in an int32 column. Cast the target column (position 2)
# to float64 explicitly instead of relying on silent upcasting during
# assignment, which newer pandas versions deprecate.
df = df.astype({df.columns[2]: 'float64'})
df.iloc[2, 2] = np.nan
df

Unnamed: 0,India,US,UK,Japan
A,54,27,4200.0,84
B,12,67,82.0,13
C,89,20,,10
D,97,81,17.0,81


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   India   4 non-null      int32  
 1   US      4 non-null      int32  
 2   UK      3 non-null      float64
 3   Japan   4 non-null      int32  
dtypes: float64(1), int32(3)
memory usage: 272.0+ bytes


## Titanic Data Analyses

In [1]:
import pandas as pd

In [2]:
pwd

'G:\\My Drive\\Training\\SimpliLearn\\PG DS - Data Science with Python\\Batch-3'

In [3]:
titanic = pd.read_csv('titanic_train.csv')
titanic.head()   # look into the 1st 5 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [102]:
path + r'\titanic_train.csv'

'C:\\Users\\Prashant\\Desktop\\titanic_train.csv'

In [100]:
# NOTE(review): machine-specific absolute Windows path. This cell must run
# before any cell that reads `path`.
path = r'C:\Users\Prashant\Desktop'
titanic = pd.read_csv(path + r'\titanic_train.csv')  # (re)loads the CSV and rebinds `titanic`

In [96]:
titanic.head(8)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [97]:
titanic.tail()  # last 5 rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [103]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [104]:
titanic.shape

(891, 12)

In [106]:
titanic.isna().sum()  # number of missing values in EACH Column!!!

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [109]:
# Per-column missing-value counts expressed as a percentage of the number of rows
titanic.isna().sum().mul(100).div(len(titanic))

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [111]:
titanic.describe().round(2)  # by default you get the statistical summary of only the numeric columns

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [112]:
# Summary (count / unique / top / freq) of the object-dtype columns.
# There is nothing numeric to round here, so no .round() call is needed.
titanic.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Frolicher-Stehli, Mr. Maxmillian",male,1601,B96 B98,S
freq,1,577,7,4,644


In [114]:
titanic['Survived'].dtype

dtype('int64')

In [4]:
titanic['Survived'] = titanic['Survived'].astype('category')

In [119]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    object  
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    object  
dtypes: category(1), float64(2), int64(4), object(5)
memory usage: 77.7+ KB


In [5]:
# Store the low-cardinality columns as categoricals, then re-inspect dtypes and memory.
for col in ('Pclass', 'Sex', 'Embarked'):
    titanic[col] = titanic[col].astype('category')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    category
dtypes: category(4), float64(2), int64(3), object(3)
memory usage: 59.7+ KB


In [123]:
titanic.describe(include='category')

Unnamed: 0,Survived,Pclass,Sex,Embarked
count,891,891,891,889
unique,2,3,2,3
top,0,3,male,S
freq,549,491,577,644


In [6]:
# Remove the three listed columns from the frame.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps the cell
# re-runnable: a second run is a no-op rather than a KeyError.
titanic = titanic.drop(columns=['PassengerId', 'Ticket', 'Cabin'], errors='ignore')
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [125]:
# What are unique values in the 'Sex' col
titanic['Sex'].unique()

['male', 'female']
Categories (2, object): ['male', 'female']

In [126]:
# How many  unique values are present in the 'Sex' col
titanic['Sex'].nunique()

2

In [127]:
# What is the distribution of males & females in the 'Sex' col
titanic['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [129]:
# same as above, but as fractions
titanic['Sex'].value_counts(normalize=True)

male      0.647587
female    0.352413
Name: Sex, dtype: float64

In [132]:
titanic['Embarked'].unique()

['S', 'C', 'Q', NaN]
Categories (3, object): ['S', 'C', 'Q']

In [134]:
titanic['Embarked'].nunique()   # nan is not counted as a unique

3

In [139]:
titanic['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [140]:
# Which port did the largest number of passengers embark from?
titanic['Embarked'].value_counts().idxmax()

'S'

In [143]:
# How many passengers embarked from Queenstown ?
titanic[titanic['Embarked'] == 'Q']

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
5,0,3,"Moran, Mr. James",male,,0,0,8.4583,Q
16,0,3,"Rice, Master. Eugene",male,2.0,4,1,29.1250,Q
22,1,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,8.0292,Q
28,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,7.8792,Q
32,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,7.7500,Q
44,1,3,"Devaney, Miss. Margaret Delia",female,19.0,0,0,7.8792,Q
46,0,3,"Lennon, Mr. Denis",male,,1,0,15.5000,Q
47,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,7.7500,Q
82,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,7.7875,Q
109,1,3,"Moran, Miss. Bertha",female,,1,0,24.1500,Q


In [152]:
(titanic['Embarked'] == 'Q').sum()  # vectorized count of True values in the boolean mask

77

In [145]:
len(titanic[titanic['Embarked'] == 'Q'])

77

In [148]:
titanic[titanic['Embarked'] == 'Q'].shape[0]

77

In [151]:
titanic['Embarked'].value_counts()['Q']

77

In [157]:
# Gimme the names & PClass of all the passengers who have age < 1 year
titanic['Age'] < 1

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [159]:
titanic[titanic['Age'] < 1]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
78,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,29.0,S
305,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,151.55,S
469,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,19.2583,C
644,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,19.2583,C
755,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,14.5,S
803,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,8.5167,C
831,1,2,"Richards, Master. George Sibley",male,0.83,1,1,18.75,S


In [161]:
# How many passengers have age < 1 year?
# (rows where Age is NaN compare as False and are not counted)
(titanic['Age'] < 1).sum()

7

In [165]:
titanic[titanic['Age'] < 1][['Name', 'Pclass', 'Age']]

Unnamed: 0,Name,Pclass,Age
78,"Caldwell, Master. Alden Gates",2,0.83
305,"Allison, Master. Hudson Trevor",1,0.92
469,"Baclini, Miss. Helene Barbara",3,0.75
644,"Baclini, Miss. Eugenie",3,0.75
755,"Hamalainen, Master. Viljo",2,0.67
803,"Thomas, Master. Assad Alexander",3,0.42
831,"Richards, Master. George Sibley",2,0.83


In [168]:
# How many passengers have age < 1 year OR > 70 years?
# Each comparison needs its own parentheses because | binds tighter than < and >.
((titanic['Age'] < 1) | (titanic['Age'] > 70)).sum()

12

In [170]:
# What is the age of George Sibley?
# Deliberately wrong: == compares against the WHOLE Name string, and the stored
# names have the form 'Surname, Title. Given names', so nothing matches.
sum(titanic['Name'] == 'George Sibley')   # NOT THE RIGHT WAY !!!

0

In [8]:
# Find the row POSITION `k` of the first passenger whose name contains
# 'George Sibley' (plain substring match, no regex).
# Raise if nobody matches, so `k` never silently points at an unrelated row.
name_hits = titanic['Name'].str.contains('George Sibley', regex=False)
if not name_hits.any():
    raise LookupError("no passenger name contains 'George Sibley'")
k = int(name_hits.to_numpy().argmax())  # argmax of a boolean array = position of the first True

In [9]:
titanic.iloc[k]

Survived                                  1
Pclass                                    2
Name        Richards, Master. George Sibley
Sex                                    male
Age                                    0.83
SibSp                                     1
Parch                                     1
Fare                                  18.75
Embarked                                  S
Name: 831, dtype: object

In [10]:
titanic.iloc[k]['Age']

0.83

In [None]:
# Homework:

In [16]:
# How many passengers have the title "Master" in their name?
# str.contains builds a boolean mask over 'Name'; .loc keeps the matching rows.
titanic.loc[titanic['Name'].str.contains("Master")]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,S
16,0,3,"Rice, Master. Eugene",male,2.0,4,1,29.125,Q
50,0,3,"Panula, Master. Juha Niilo",male,7.0,4,1,39.6875,S
59,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,46.9,S
63,0,3,"Skoog, Master. Harald",male,4.0,3,2,27.9,S
65,1,3,"Moubarek, Master. Gerios",male,,1,1,15.2458,C
78,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,29.0,S
125,1,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,11.2417,C
159,0,3,"Sage, Master. Thomas Henry",male,,8,2,69.55,S
164,0,3,"Panula, Master. Eino Viljami",male,1.0,4,1,39.6875,S


In [14]:
len(titanic.loc[titanic['Name'].str.contains("Master")])

40

In [22]:
titanic['Name'].str.contains('master', case=False).sum()  # case-insensitive match, vectorized count

40

In [29]:
# Same count via a per-name Python predicate; Series.sum() adds up the resulting booleans.
titanic['Name'].apply(lambda name: 'master' in name.lower()).sum()

40

In [33]:
# How many passengers travelling in 1st class actually survived ???
titanic[(titanic['Pclass']==1) & (titanic['Survived']==1)]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,71.2833,C
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
11,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,26.5500,S
23,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,35.5000,S
31,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,146.5208,C
52,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,76.7292,C
55,1,1,"Woolner, Mr. Hugh",male,,0,0,35.5000,S
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0000,
88,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,263.0000,S
97,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,63.3583,C


In [35]:
titanic[(titanic['Pclass']==1) & (titanic['Survived']==1)].shape[0]

136

In [36]:
len(titanic[(titanic['Pclass']==1) & (titanic['Survived']==1)])

136

In [41]:
# How many passengers are travelling alone ??
titanic['Group_size'] = titanic['SibSp'] + titanic['Parch']
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Group_size
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,71.2833,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [44]:
(titanic['Group_size'] == 0).sum()  # passengers with no siblings/spouse/parents/children aboard

537

In [43]:
# Flag solo travellers with a vectorized comparison (no row-wise apply needed).
titanic['Alone'] = titanic['Group_size'] == 0
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Group_size,Alone
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1,False
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,71.2833,C,1,False
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,True
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1,False
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0,True


In [45]:
titanic['Alone'].value_counts()

True     537
False    354
Name: Alone, dtype: int64

In [47]:
# !pip install pandas-profiling

In [48]:
# Build an automated profiling report for the whole frame and save it to the named .html file.
# NOTE(review): the pandas-profiling project has been renamed to ydata-profiling
# upstream — confirm which package this environment provides.
import pandas_profiling
profile = pandas_profiling.ProfileReport(titanic)
profile.to_file(output_file = 'Titanic Data Analysis.html')

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

  cmap.set_bad(cmap_bad)





HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# !pip install sweetviz

In [49]:
import sweetviz as sv
sweet_report = sv.analyze(titanic)
sweet_report.show_html('Titanic Data Sweet Report.html')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=12.0), HTML(value='')), l…


Report Titanic Data Sweet Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Save the data

In [51]:
titanic.to_pickle('titanic_clean1.pkl') # serialization (saving the Python object as a binary file)

In [52]:
titanic.to_csv('titanic_clean1.csv', index=False)

In [53]:
titanic.to_excel('titanic_clean1.xlsx', index=False, sheet_name='Titanic_clean')