In [1]:
import pandas as pd
import numpy as np

In [2]:
# create a pd datafarme as below
np.random.seed(10)
df = pd.DataFrame(np.random.randint(18, 100, size=(7,5)),
                 columns=['age', 'conversations', 'friends', 'unk1', 'unk2'])
df

Unnamed: 0,age,conversations,friends,unk1,unk2
0,27,33,82,46,47
1,26,91,18,58,54
2,34,29,72,80,51
3,90,96,67,69,72
4,95,87,31,43,31
5,48,48,30,83,49
6,75,54,45,36,95


In [3]:
# note that we're just using default indices for index here
#   - we can still use loc to get at the rows, but we'll just use the default indices now

# print the conversations column
print (df['conversations'])

# print the age and conversations column
print (df[['age', 'conversations']])

# print row 2
print (df.loc[2])

# print rows 0 and 2
print (df.loc[[0,2]])

# print element at row 2 and conversations column - using chained indexing 
print (df['conversations'][2])

# print element at row 2 and conversations column - using multi-dimensional indexing
print (df.loc[2, 'conversations'])

# print elements at row 0 and row 2 and age column and conversation column - using chained indexing 
print (df[['age', 'conversations']].loc[[0,2]])
# print (df[['age', 'conversations']][[0,2]]) # this will not work!

# print elements at row 0 and row 2 and age column and conversation column - using multi-dimensional indexing
print (df.loc[[0,2], ['age', 'conversations']])

0    33
1    91
2    29
3    96
4    87
5    48
6    54
Name: conversations, dtype: int32
   age  conversations
0   27             33
1   26             91
2   34             29
3   90             96
4   95             87
5   48             48
6   75             54
age              34
conversations    29
friends          72
unk1             80
unk2             51
Name: 2, dtype: int32
   age  conversations  friends  unk1  unk2
0   27             33       82    46    47
2   34             29       72    80    51
29
29
   age  conversations
0   27             33
2   34             29
   age  conversations
0   27             33
2   34             29


In [4]:
# drop columns friends, unk1 and unk2 (so df no longer has these columns)
# how would you do this with using inplace
# how would you do this without using inplace
df.drop(['friends', 'unk1','unk2'], axis=1, inplace=True)
# df = df.drop([''friends', unk1','unk2'], axis=1)
df

Unnamed: 0,age,conversations
0,27,33
1,26,91
2,34,29
3,90,96
4,95,87
5,48,48
6,75,54


In [5]:
# create a new column called 'age-group' and set it to age//10

# without apply and lambda
df['age-group'] = df['age']//10

# with apply and lambda
# df['age-group'] = df['age'].apply(lambda x: x//10)

# with apply and custom function
# def foo(x):
#     return x//10
# df['age-group'] = df['age'].apply(foo)

df

Unnamed: 0,age,conversations,age-group
0,27,33,2
1,26,91,2
2,34,29,3
3,90,96,9
4,95,87,9
5,48,48,4
6,75,54,7


In [6]:
# create a new column called 'gender' in df with the following values: 
# ['male', 'male', 'female', 'female', 'female', 'male', 'male']
df['gender'] = ['male', 'male', 'female', 'female', 'female', 'male', 'male']
df

Unnamed: 0,age,conversations,age-group,gender
0,27,33,2,male
1,26,91,2,male
2,34,29,3,female
3,90,96,9,female
4,95,87,9,female
5,48,48,4,male
6,75,54,7,male


In [7]:
# return all samples that are male and have > 50 conversations
df[(df['conversations'] > 50) & (df['gender']=='male')]

Unnamed: 0,age,conversations,age-group,gender
1,26,91,2,male
6,75,54,7,male


In [8]:
# return all samples that are female or have age > 40
df[(df['gender']=='female') | (df['age'] > 40)]

Unnamed: 0,age,conversations,age-group,gender
2,34,29,3,female
3,90,96,9,female
4,95,87,9,female
5,48,48,4,male
6,75,54,7,male


In [9]:
# Create new dataframe called dfnew which has the same data as df. 
# Then set the element at index 3 and column gender to MALE.
# while ensuring df doesn't change
# and while ensuring you don't get the SettingWithCopyWarning 

# using copy to ensure dfnew is not a view of df
# dfnew = df # this would give view
dfnew = df.copy()

# using multi-dimensional indexing to avoid the SettingWithCopyWarning
# dfnew['gender'].loc[3] = 'MALE' # this would give SettingWithCopyWarning
dfnew.loc[3,'gender'] = 'MALE' 

# df will not be updated with dfnew.loc[3,'gender'] = 'MALE'
df

# dfnew will be updated dfnew.loc[3,'gender'] = 'MALE'
dfnew

Unnamed: 0,age,conversations,age-group,gender
0,27,33,2,male
1,26,91,2,male
2,34,29,3,female
3,90,96,9,MALE
4,95,87,9,female
5,48,48,4,male
6,75,54,7,male


In [10]:
# create a pd datafarme as below
# then create a new column with the squared error (true - pred)^2
# then find mean squared error between true and predicted values
np.random.seed(0)
ser1 = np.random.randint(-100, 100, 5)
ser2 = ser1 + np.random.random(5)
df = pd.DataFrame({'true':ser1, 'pred':ser2})
df

Unnamed: 0,true,pred
0,72,72.857946
1,-53,-52.152748
2,17,17.623564
3,92,92.384382
4,-33,-32.702465


In [11]:
df['sqerror'] = (df['true'] - df['pred'])**2
df

Unnamed: 0,true,pred,sqerror
0,72,72.857946,0.736071
1,-53,-52.152748,0.717836
2,17,17.623564,0.388832
3,92,92.384382,0.147749
4,-33,-32.702465,0.088527


In [12]:
mse = df['sqerror'].mean()
mse

0.4158028029226076

In [13]:
# read kaggle train.csv data into dataframe called titanic
titanic = pd.read_csv('train.csv')
titanic.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
251,252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29.0,1,1,347054,10.4625,G6,S
460,461,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S
496,497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54.0,1,0,36947,78.2667,D20,C
485,486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S
144,145,0,2,"Andrew, Mr. Edgardo Samuel",male,18.0,0,0,231945,11.5,,S


In [14]:
# create a copy of titanic and put it in a dataframe called df
df = titanic.copy()

In [15]:
# get the column names of df (columns)
df.columns

# get the #rows, #cols of df (shape)
df.shape

# get info on df (info())
df.info()

# get basic stats on df (describe())
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [16]:
# the Cabin column on it's own isn't useful, but the Deck (First Letter in Cabin) might be; create a Deck column
def getDeck(cabin):
    if pd.notna(cabin):
        return cabin[0]
    else:
        return np.nan
    
df['Deck'] = df['Cabin'].apply(getDeck)

# then look at a sample of rows where cabin was not nan, and make sure the transformation went through as expected
df[df['Deck'].notna()].sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
215,216,1,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.275,D36,C,D
867,868,0,1,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,PC 17590,50.4958,A24,S,A
527,528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S,C
737,738,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,B
515,516,0,1,"Walker, Mr. William Anderson",male,47.0,0,0,36967,34.0208,D46,S,D


In [17]:
# the name column on its own isn't useful, but the title might be; create a title column
def getTitle(name):
    if pd.notna(name):
        title = name.split()[1]
        return title
    else:
        return np.nan

df['Title'] = df['Name'].apply(getTitle)

df[df['Title'].notna()].sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck,Title
627,628,1,1,"Longley, Miss. Gretchen Fiske",female,21.0,0,0,13502,77.9583,D9,S,D,Miss.
767,768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q,,Miss.
699,700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42.0,0,0,348121,7.65,F G63,S,F,Mr.
213,214,0,2,"Givard, Mr. Hans Kristensen",male,30.0,0,0,250646,13.0,,S,,Mr.
236,237,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S,,Mr.


In [18]:
# get the frequency distribution of the various titles
df['Title'].value_counts()

Mr.             502
Miss.           179
Mrs.            121
Master.          40
Dr.               7
Rev.              6
y                 4
Impe,             3
Planke,           3
Major.            2
Mlle.             2
Gordon,           2
Col.              2
Don.              1
Carlo,            1
Capt.             1
Mulder,           1
Billiard,         1
Steen,            1
Messemaeker,      1
Pelsmaeker,       1
Melkebeke,        1
Walle,            1
Shawah,           1
the               1
Ms.               1
Mme.              1
der               1
Velde,            1
Jonkheer.         1
Cruyssen,         1
Name: Title, dtype: int64

In [19]:
# drop the following columns from df
# 'Name', 'Ticket', 'Cabin'
df.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.sample(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title
412,413,1,1,female,33.0,1,0,90.0,Q,C,Miss.
810,811,0,3,male,26.0,0,0,7.8875,S,,Mr.
243,244,0,3,male,22.0,0,0,7.125,S,,Mr.
886,887,0,2,male,27.0,0,0,13.0,S,,Rev.
446,447,1,2,female,13.0,0,1,19.5,S,,Miss.


In [20]:
# get the amount of missing data for each column (count) 
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
Deck           687
Title            0
dtype: int64

In [21]:
# get the amount of missing data for each column (percentage) 
df.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Fare           0.000000
Embarked       0.002245
Deck           0.771044
Title          0.000000
dtype: float64

In [22]:
# let's take care of missing Age data.
# let's create a new imputed column called impAge, which replaces missing Age values with mean of Age.
df['impAge'] = df['Age'].fillna(value=df['Age'].mean())
df.sample(5)
# df[df['Age'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,impAge
705,706,0,2,male,39.0,0,0,26.0,S,,Mr.,39.0
611,612,0,3,male,,0,0,7.05,S,,Mr.,29.699118
70,71,0,2,male,32.0,0,0,10.5,S,,Mr.,32.0
194,195,1,1,female,44.0,0,0,27.7208,C,B,Mrs.,44.0
762,763,1,3,male,20.0,0,0,7.2292,C,,Mr.,20.0


In [23]:
# let's take care of missing Embarked data

# get the overall distribution of Embarked (what values does it take, with what frequency?)
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [24]:
# use masking with conditional selection to examine the rows in df which have Embarked=Nan
df[df['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,impAge
61,62,1,1,female,38.0,0,0,80.0,,B,Miss.,38.0
829,830,1,1,female,62.0,0,0,80.0,,B,Mrs.,62.0


In [25]:
# we could  just drop the rows with missing Embarked data - how would you do this?
# df = df.dropna(subset=['Embarked'])

# but instead, let's create a new imputed column called impEmbarked,
# which replaces missing Embarked values with 'X'
df['impEmbarked'] = df['Embarked'].fillna(value='X')
df[df['Embarked'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,impAge,impEmbarked
61,62,1,1,female,38.0,0,0,80.0,,B,Miss.,38.0,X
829,830,1,1,female,62.0,0,0,80.0,,B,Mrs.,62.0,X


In [26]:
# similarly, let's create a new imputed column called impDeck,
# which replaces missing Deck values with 'X'
df['impDeck'] = df['Deck'].fillna(value='X')
df.sample(5)
# df[df['Deck'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Title,impAge,impEmbarked,impDeck
370,371,1,1,male,25.0,1,0,55.4417,C,E,Mr.,25.0,C,E
495,496,0,3,male,,0,0,14.4583,C,,Mr.,29.699118,C,X
275,276,1,1,female,63.0,1,0,77.9583,S,D,Miss.,63.0,S,D
143,144,0,3,male,19.0,0,0,6.75,Q,,Mr.,19.0,Q,X
708,709,1,1,female,22.0,0,0,151.55,S,,Miss.,22.0,S,X
