# Data Wrangling

###### Creating a Data Frame

In [1]:
import pandas as pd
dataframe = pd.DataFrame()

###### Describing the Data

###### Navigating DataFrames

In [31]:
# select the first row
# print(df.iloc[0])

# select three rows
# print(df.iloc[1:4])

# all rows up to and including the fourth row
print(df.iloc[:])

                                               Name PClass    Age     Sex  \
0                      Allen, Miss Elisabeth Walton    1st  29.00  female   
1                       Allison, Miss Helen Loraine    1st   2.00  female   
2               Allison, Mr Hudson Joshua Creighton    1st  30.00    male   
3     Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st  25.00  female   
4                     Allison, Master Hudson Trevor    1st   0.92    male   
...                                             ...    ...    ...     ...   
1308                             Zakarian, Mr Artun    3rd  27.00    male   
1309                         Zakarian, Mr Maprieder    3rd  26.00    male   
1310                               Zenni, Mr Philip    3rd  22.00    male   
1311                               Lievens, Mr Rene    3rd  24.00    male   
1312                                 Zimmerman, Leo    3rd  29.00    male   

      Survived  SexCode  
0            1        1  
1            0        1

In [4]:
# select the first row
print(df.iloc[0])

# select three rows
print(df.iloc[1:4])

# all rows up to and including the fourth row
print(df.iloc[:4])

Name        Allen, Miss Elisabeth Walton
PClass                               1st
Age                                   29
Sex                               female
Survived                               1
SexCode                                1
Name: 0, dtype: object
                                            Name PClass   Age     Sex  \
1                    Allison, Miss Helen Loraine    1st   2.0  female   
2            Allison, Mr Hudson Joshua Creighton    1st  30.0    male   
3  Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st  25.0  female   

   Survived  SexCode  
1         0        1  
2         0        0  
3         0        1  
                                            Name PClass   Age     Sex  \
0                   Allen, Miss Elisabeth Walton    1st  29.0  female   
1                    Allison, Miss Helen Loraine    1st   2.0  female   
2            Allison, Mr Hudson Joshua Creighton    1st  30.0    male   
3  Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1s

###### Discussion
To select individual rows and slices of rows, pandas provides two methods:

loc is useful when the index of the DataFrame is a label (a string)<br>
iloc works by looking for the position in the DataFrame. For exmaple, iloc[0] will return the first row regardless of whether the index is an integer or a label

###### Selecting Rows Based on Conditionals

In [5]:
# select top two rows where column 'sex' is 'female'
df[df['Sex'] == 'female'].head(3)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1


In [6]:
# multiple conditions
df[(df['Sex'] == 'female') & (df['Age'] >= 65)]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
73,"Crosby, Mrs Edward Gifford (Catherine Elizabet...",1st,69.0,female,1,1


###### Replacing Values

In [7]:
# replace any instance of 'female' with Woman
df['Sex'].replace('female', 'Woman').head(5)

0    Woman
1    Woman
2     male
3    Woman
4     male
Name: Sex, dtype: object

In [8]:
# replace any instance of 'female' with Woman
df['Sex'].replace(['female', 'male'], ['Woman', 'Man']).head(10)


0    Woman
1    Woman
2      Man
3    Woman
4      Man
5      Man
6    Woman
7      Man
8    Woman
9      Man
Name: Sex, dtype: object

In [9]:
df.replace(1, "One").head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29,female,One,One
1,"Allison, Miss Helen Loraine",1st,2,female,0,One


###### Renaming Columns

In [10]:
df.rename(columns={'PClass': 'Passenger Class'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [11]:
df.rename(columns={'PClass': 'Passenger Class', 'Sex': 'Gender'}).head(2)

Unnamed: 0,Name,Passenger Class,Age,Gender,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


###### Finding the Min, Max, Sum, Average, and Count

In [12]:
print('Maximum: {}'.format(df['Age'].max()))
print('Minimum: {}'.format(df['Age'].min()))
print('Mean: {}'.format(df['Age'].mean()))
print('Sum: {}'.format(df['Age'].sum()))
print('Count: {}'.format(df['Age'].count()))

Maximum: 71.0
Minimum: 0.17
Mean: 30.397989417989415
Sum: 22980.88
Count: 756


In [13]:
print("Variance: {}".format(df.var()))
print("Standard Deviation: {}".format(df.std()))
print("Kurtosis: {}".format(df.kurt()))
print("Skewness: {}".format(df.skew()))

Variance: Age         203.320470
Survived      0.225437
SexCode       0.228230
dtype: float64
Standard Deviation: Age         14.259049
Survived     0.474802
SexCode      0.477734
dtype: float64
Kurtosis: Age        -0.036536
Survived   -1.562162
SexCode    -1.616702
dtype: float64
Skewness: Age         0.368511
Survived    0.663491
SexCode     0.621098
dtype: float64


![image.png](attachment:image.png)

![image.png](attachment:image.png)

###### Finding Unique Values

In [14]:

# unique will return an array of all unique values in a column
df['PClass'].unique()

array(['1st', '2nd', '*', '3rd'], dtype=object)

In [15]:
# value_counts will display all unique values with the number of times each value appears
df['Sex'].value_counts()

male      851
female    462
Name: Sex, dtype: int64

###### Handling Missing Values
NaN : Not a number

In [16]:
# select missing values, show 2 rows
df[df['Age'].isnull()].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0


###### Deleting a Column

In [17]:
# axis=1 means the column axis
df.drop('Age', axis=1).head(2)

Unnamed: 0,Name,PClass,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,female,1,1
1,"Allison, Miss Helen Loraine",1st,female,0,1


###### Deleting a Row

In [18]:

# create new dataframe excluding the rows you want to delete
df[df['Sex'] != 'male'].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1


In [19]:
# delete a row by matching a unique value
df[df['Name'] != 'Allison, Miss Helen Loraine'].head(20)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
5,"Anderson, Mr Harry",1st,47.0,male,1,0
6,"Andrews, Miss Kornelia Theodosia",1st,63.0,female,1,1
7,"Andrews, Mr Thomas, jr",1st,39.0,male,0,0
8,"Appleton, Mrs Edward Dale (Charlotte Lamson)",1st,58.0,female,1,1
9,"Artagaveytia, Mr Ramon",1st,71.0,male,0,0
10,"Astor, Colonel John Jacob",1st,47.0,male,0,0


In [20]:
# delete a row by index
df[df.index != 0].head(2)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0


###### Grouping Rows by Values

In [21]:
df.groupby('Sex').mean()

Unnamed: 0_level_0,Age,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,29.396424,0.666667,1.0
male,31.014338,0.166863,0.0


In [22]:
df.groupby('Survived')['Name'].count()

Survived
0    863
1    450
Name: Name, dtype: int64

In [23]:
df.groupby(['Sex', 'Survived'])['Age'].mean()

Sex     Survived
female  0           24.901408
        1           30.867143
male    0           32.320780
        1           25.951875
Name: Age, dtype: float64

###### Grouping Rows by Time

###### Looping Over a Column

In [24]:
# for .. in .. loop
for name in df['Name'][0:20]:
    print(name.upper())

ALLEN, MISS ELISABETH WALTON
ALLISON, MISS HELEN LORAINE
ALLISON, MR HUDSON JOSHUA CREIGHTON
ALLISON, MRS HUDSON JC (BESSIE WALDO DANIELS)
ALLISON, MASTER HUDSON TREVOR
ANDERSON, MR HARRY
ANDREWS, MISS KORNELIA THEODOSIA
ANDREWS, MR THOMAS, JR
APPLETON, MRS EDWARD DALE (CHARLOTTE LAMSON)
ARTAGAVEYTIA, MR RAMON
ASTOR, COLONEL JOHN JACOB
ASTOR, MRS JOHN JACOB (MADELEINE TALMADGE FORCE)
AUBERT, MRS LEONTINE PAULINE
BARKWORTH, MR ALGERNON H
BAUMANN, MR JOHN D
BAXTER, MRS JAMES (HELENE DELAUDENIERE CHAPUT)
BAXTER, MR QUIGG EDMOND
BEATTIE, MR THOMSON
BECKWITH, MR RICHARD LEONARD
BECKWITH, MRS RICHARD LEONARD (SALLIE MONYPENY)


In [25]:

# list comprehension (more "pythonic")
[name.upper() for name in df['Name'][0:2]]

['ALLEN, MISS ELISABETH WALTON', 'ALLISON, MISS HELEN LORAINE']

###### Applying a Function Over All Elements in a Column

In [26]:
def uppercase(x):
    return x.upper()

df['Name'].apply(uppercase)[0:2]

0    ALLEN, MISS ELISABETH WALTON
1     ALLISON, MISS HELEN LORAINE
Name: Name, dtype: object

###### Applying a Function to Groups

In [27]:
df.groupby('Sex').apply(lambda x: x.count())

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,462,462,288,462,462,462
male,851,851,468,851,851,851


###### Concatenating DataFrames

Homework: Find it??