## Import modules

In [19]:
import pandas as pd
import numpy as np

## Read in the dataset

In [20]:
data = pd.read_csv('data-titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Apply functions using apply()

In [21]:
func_lower = lambda x: x.lower()

In [22]:
data.Name.apply(func_lower)

0                                braund, mr. owen harris
1      cumings, mrs. john bradley (florence briggs th...
2                                 heikkinen, miss. laina
3           futrelle, mrs. jacques heath (lily may peel)
4                               allen, mr. william henry
5                                       moran, mr. james
6                                mccarthy, mr. timothy j
7                         palsson, master. gosta leonard
8      johnson, mrs. oscar w (elisabeth vilhelmina berg)
9                    nasser, mrs. nicholas (adele achem)
10                       sandstrom, miss. marguerite rut
11                              bonnell, miss. elizabeth
12                        saundercock, mr. william henry
13                           andersson, mr. anders johan
14                  vestrom, miss. hulda amanda adolfina
15                      hewlett, mrs. (mary d kingcome) 
16                                  rice, master. eugene
17                          wil

## Apply functions using applymap()

In [23]:
data[['Age', 'Pclass']].applymap(np.square)

Unnamed: 0,Age,Pclass
0,484.0,9
1,1444.0,1
2,676.0,9
3,1225.0,1
4,1225.0,9
5,,9
6,2916.0,1
7,4.0,9
8,729.0,9
9,196.0,4


### Applying our own functions 

In [24]:
def my_func(i):
    return i + 20

In [25]:
data[['Age', 'Pclass']].applymap(my_func)

Unnamed: 0,Age,Pclass
0,42.0,23
1,58.0,21
2,46.0,23
3,55.0,21
4,55.0,23
5,,23
6,74.0,21
7,22.0,23
8,47.0,23
9,34.0,22


## A `SettingWithCopyWarning` scenario

In [36]:
data[data.Age.isnull()].Age = data.Age.mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


## Handling the `SettingWithCopyWarning`

In [39]:
data[data.Age.isnull()].Age.head()

5    NaN
17   NaN
19   NaN
26   NaN
28   NaN
Name: Age, dtype: float64

In [41]:
data.loc[data.Age.isnull(), 'Age'] = data.Age.mean

In [42]:
data[data.Age.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [7]:
data = pd.read_csv('data-titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Missing Records

### Find out total records in the dataset

In [8]:
data.shape

(891, 12)

### Number of valid records per column

In [9]:
data.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

## Dropping missing records

### Drop all records that have  one or more missing values

In [10]:
data_missing_dropped = data.dropna()
data_missing_dropped.shape

(183, 12)

### Drop only those rows that have all records missing

In [11]:
data_all_missing_dropped = data.dropna(how="all")
data_all_missing_dropped.shape

(891, 12)

## Fill in missing data

### Fill in missing data with zeros

In [10]:
data_filled_zeros =  data.fillna(0)
data_filled_zeros.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            891
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          891
Embarked       891
dtype: int64

### Fill in missing data with a mean of the values from other rows

In [12]:
data_filled_in_mean = data.copy()
data_filled_in_mean.Age.fillna(data.Age.mean(), inplace=True)
data_filled_in_mean.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            891
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

## Default Index

In [5]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Set an Index post reading of data

In [6]:
data.set_index('Name')

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1000,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
"McCarthy, Mr. Timothy J",7,0,1,male,54.0,0,0,17463,51.8625,E46,S
"Palsson, Master. Gosta Leonard",8,0,3,male,2.0,3,1,349909,21.0750,,S
"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",9,1,3,female,27.0,0,2,347742,11.1333,,S
"Nasser, Mrs. Nicholas (Adele Achem)",10,1,2,female,14.0,1,0,237736,30.0708,,C


## Set an Index while reading data

In [1]:
data = pd.read_csv('data-titanic.csv', index_col=3)
data.head()

NameError: name 'pd' is not defined

## Selection using Index

In [8]:
data.loc['Braund, Mr. Owen Harris',:]

PassengerId            1
Survived               0
Pclass                 3
Sex                 male
Age                   22
SibSp                  1
Parch                  0
Ticket         A/5 21171
Fare                7.25
Cabin                NaN
Embarked               S
Name: Braund, Mr. Owen Harris, dtype: object

## Reset Index

In [9]:
data.reset_index(inplace=True)

In [10]:
data.head()

Unnamed: 0,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S


## Remove column(s)

### Remove one column

In [5]:
data.drop('Ticket', axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,8.05,,S


### Remove more than one column

In [7]:
data.drop(['Parch', 'Fare'], axis=1, inplace=True)

In [8]:
data.head()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,,S


## Remove row(s)

In [9]:
data.drop(['Braund, Mr. Owen Harris', 'Heikkinen, Miss. Laina'], inplace=True)

In [10]:
data.head()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,C85,C
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,,S
"Moran, Mr. James",6,0,3,male,,0,,Q
"McCarthy, Mr. Timothy J",7,0,1,male,54.0,0,E46,S


## Renaming Columns

## Rename columns while reading the data

In [5]:
list_columns = ['Date', 'Region ID', 'Region Name', 'State',
             'City', 'County', 'Size Rank','Price']
data = pd.read_csv('data-zillow.csv', names = list_columns)
data.head()

Unnamed: 0,Date,Region ID,Region Name,State,City,County,Size Rank,Price
0,Date,RegionID,RegionName,State,Metro,County,SizeRank,Price
1,2017-05-31,6181,New York,NY,New York,Queens,0,672400
2,2017-05-31,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,1,629900
3,2017-05-31,17426,Chicago,IL,Chicago,Cook,2,222700
4,2017-05-31,13271,Philadelphia,PA,Philadelphia,Philadelphia,3,137300


## Rename columns using rename method

### Read in the dataset again

In [4]:
data = pd.read_csv('data-zillow.csv')
data.head()

Unnamed: 0,Date,RegionID,RegionName,State,Metro,County,SizeRank,Price
0,2017-05-31,6181,New York,NY,New York,Queens,0,672400
1,2017-05-31,12447,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles,1,629900
2,2017-05-31,17426,Chicago,IL,Chicago,Cook,2,222700
3,2017-05-31,13271,Philadelphia,PA,Philadelphia,Philadelphia,3,137300
4,2017-05-31,40326,Phoenix,AZ,Phoenix,Maricopa,4,211300


### Rename 

In [7]:
data.columns

Index(['Date', 'Region ID', 'Region Name', 'State', 'City', 'County',
       'Size Rank', 'Price'],
      dtype='object')

In [8]:
data.rename(columns={'RegionName':'Region', 'Metro':'City'}, inplace=True)

In [9]:
data.columns

Index(['Date', 'Region ID', 'Region Name', 'State', 'City', 'County',
       'Size Rank', 'Price'],
      dtype='object')

## Rename all columns

In [12]:
data.columns = ['Date', 'Region ID', 'Region Name', 'State',
             'City', 'County', 'Size Rank','Price']