# Wrangle Data with Pandas II
[Pandas](https://pandas.pydata.org/pandas-docs/stable/index.html) dataframe demonstration. 

In [1]:
# Location of the example CSV (raw file served from GitHub)
example_path = (
    "https://raw.githubusercontent.com/emmanueliarussi/DataScienceCapstone/"
    "master/4_DataWrangling/data/pandas_example_II.csv"
)

In [2]:
# Read data from a CSV file
import pandas as pd

# Load the soccer player statistics into a dataframe
soccer = pd.read_csv(filepath_or_buffer=example_path)
soccer

Unnamed: 0,player,salary,GP,G,A,SOT,PPG,P
0,Sergio Agüero\nForward — Manchester City,$19.2m,16.0,14,3.0,34,13.12,209.98
1,Eden Hazard\nMidfield — Chelsea,$18.9m,21.0,8,4.0,17,13.05,274.04
2,Alexis Sánchez\nForward — Arsenal,$17.6m,,12,7.0,29,11.19,223.86
3,Yaya Touré\nMidfield — Manchester City,$16.6m,18.0,7,1.0,19,10.99,197.91
4,Ángel Di María\nMidfield — Manchester United,$15.0m,13.0,3,,13,10.17,132.23
5,Santiago Cazorla\nMidfield — Arsenal,$14.8m,20.0,4,,20,9.97,
6,David Silva\nMidfield — Manchester City,$14.3m,15.0,6,2.0,11,10.35,155.26
7,Cesc Fàbregas\nMidfield — Chelsea,$14.0m,20.0,2,14.0,10,10.47,209.49
8,Saido Berahino\nForward — West Brom,$13.8m,21.0,9,0.0,20,7.02,147.43
9,Steven Gerrard\nMidfield — Liverpool,$13.8m,20.0,5,1.0,11,7.5,150.01


###  Apply computations row-wise

In [3]:
# Strip the "$" and "m" characters from both ends of each salary string
# (e.g. "$19.2m" -> "19.2"). The values remain strings; no numeric
# conversion happens here.
# The vectorized .str accessor passes missing values through as NaN, whereas
# calling x.strip() inside apply() raises AttributeError on a NaN entry.
soccer['salary'] = soccer['salary'].str.strip('$m')
soccer

Unnamed: 0,player,salary,GP,G,A,SOT,PPG,P
0,Sergio Agüero\nForward — Manchester City,19.2,16.0,14,3.0,34,13.12,209.98
1,Eden Hazard\nMidfield — Chelsea,18.9,21.0,8,4.0,17,13.05,274.04
2,Alexis Sánchez\nForward — Arsenal,17.6,,12,7.0,29,11.19,223.86
3,Yaya Touré\nMidfield — Manchester City,16.6,18.0,7,1.0,19,10.99,197.91
4,Ángel Di María\nMidfield — Manchester United,15.0,13.0,3,,13,10.17,132.23
5,Santiago Cazorla\nMidfield — Arsenal,14.8,20.0,4,,20,9.97,
6,David Silva\nMidfield — Manchester City,14.3,15.0,6,2.0,11,10.35,155.26
7,Cesc Fàbregas\nMidfield — Chelsea,14.0,20.0,2,14.0,10,10.47,209.49
8,Saido Berahino\nForward — West Brom,13.8,21.0,9,0.0,20,7.02,147.43
9,Steven Gerrard\nMidfield — Liverpool,13.8,20.0,5,1.0,11,7.5,150.01


In [4]:
# The player strings still need to be parsed into new columns.
# Before that, swap the abbreviated stat headers for descriptive labels.
nicer_labels = {
    'GP': 'games',
    'G': 'goals',
    'A': 'assists',
    'SOT': 'shots_on_target',
    'PPG': 'points_per_game',
    'P': 'points',
}
soccer_rn = soccer.rename(columns=nicer_labels)

soccer_rn

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points
0,Sergio Agüero\nForward — Manchester City,19.2,16.0,14,3.0,34,13.12,209.98
1,Eden Hazard\nMidfield — Chelsea,18.9,21.0,8,4.0,17,13.05,274.04
2,Alexis Sánchez\nForward — Arsenal,17.6,,12,7.0,29,11.19,223.86
3,Yaya Touré\nMidfield — Manchester City,16.6,18.0,7,1.0,19,10.99,197.91
4,Ángel Di María\nMidfield — Manchester United,15.0,13.0,3,,13,10.17,132.23
5,Santiago Cazorla\nMidfield — Arsenal,14.8,20.0,4,,20,9.97,
6,David Silva\nMidfield — Manchester City,14.3,15.0,6,2.0,11,10.35,155.26
7,Cesc Fàbregas\nMidfield — Chelsea,14.0,20.0,2,14.0,10,10.47,209.49
8,Saido Berahino\nForward — West Brom,13.8,21.0,9,0.0,20,7.02,147.43
9,Steven Gerrard\nMidfield — Liverpool,13.8,20.0,5,1.0,11,7.5,150.01


In [5]:
# Processing player info
def process_player_col(text):
    """Split a raw player cell into name, team and position.

    Parameters
    ----------
    text : str
        Two-line string: the player name, a newline, then
        "<position> — <team>".

    Returns
    -------
    pd.Series
        Three values in the order [name, team, position]. Note that team
        comes before position, the reverse of their order in ``text``.

    Raises
    ------
    ValueError
        If ``text`` does not contain exactly one newline, or its second
        line does not contain exactly one ' — ' separator.
    """
    name, rest = text.split('\n')
    # Strip every piece, the name included, so stray whitespace around the
    # separators does not leak into the parsed values.
    position, team = [x.strip() for x in rest.split(' — ')]
    return pd.Series([name.strip(), team, position])

# Create empty placeholder columns for the parsed team and position
for new_col in ('team', 'position'):
    soccer_rn[new_col] = pd.Series('', index=soccer_rn.index)

# Parse every player string and write the three parts back to the frame
parsed = soccer_rn['player'].apply(process_player_col)
soccer_rn[['player', 'team', 'position']] = parsed
soccer_rn

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,team,position
0,Sergio Agüero,19.2,16.0,14,3.0,34,13.12,209.98,Manchester City,Forward
1,Eden Hazard,18.9,21.0,8,4.0,17,13.05,274.04,Chelsea,Midfield
2,Alexis Sánchez,17.6,,12,7.0,29,11.19,223.86,Arsenal,Forward
3,Yaya Touré,16.6,18.0,7,1.0,19,10.99,197.91,Manchester City,Midfield
4,Ángel Di María,15.0,13.0,3,,13,10.17,132.23,Manchester United,Midfield
5,Santiago Cazorla,14.8,20.0,4,,20,9.97,,Arsenal,Midfield
6,David Silva,14.3,15.0,6,2.0,11,10.35,155.26,Manchester City,Midfield
7,Cesc Fàbregas,14.0,20.0,2,14.0,10,10.47,209.49,Chelsea,Midfield
8,Saido Berahino,13.8,21.0,9,0.0,20,7.02,147.43,West Brom,Forward
9,Steven Gerrard,13.8,20.0,5,1.0,11,7.5,150.01,Liverpool,Midfield


### Applying computations element-wise to multiple columns

In [6]:
# Lowercase the player, position, and team columns.
# DataFrame.applymap is deprecated since pandas 2.1, so each column is
# lowercased with the vectorized .str accessor instead; this also leaves
# missing values as NaN rather than raising on them.
cols = ['player', 'position', 'team']
soccer_rn[cols] = soccer_rn[cols].apply(lambda col: col.str.lower())
soccer_rn

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,team,position
0,sergio agüero,19.2,16.0,14,3.0,34,13.12,209.98,manchester city,forward
1,eden hazard,18.9,21.0,8,4.0,17,13.05,274.04,chelsea,midfield
2,alexis sánchez,17.6,,12,7.0,29,11.19,223.86,arsenal,forward
3,yaya touré,16.6,18.0,7,1.0,19,10.99,197.91,manchester city,midfield
4,ángel di maría,15.0,13.0,3,,13,10.17,132.23,manchester united,midfield
5,santiago cazorla,14.8,20.0,4,,20,9.97,,arsenal,midfield
6,david silva,14.3,15.0,6,2.0,11,10.35,155.26,manchester city,midfield
7,cesc fàbregas,14.0,20.0,2,14.0,10,10.47,209.49,chelsea,midfield
8,saido berahino,13.8,21.0,9,0.0,20,7.02,147.43,west brom,forward
9,steven gerrard,13.8,20.0,5,1.0,11,7.5,150.01,liverpool,midfield


### Counting rows with NaNs

In [7]:
# Number of rows that contain at least one missing value
has_nan = soccer_rn.isnull().any(axis=1)
nans = int(has_nan.sum())
print('{} rows have missing values'.format(nans))

3 rows have missing values


In [8]:
# Keep only the rows whose assists value is missing
missing_assists = soccer_rn['assists'].isnull()
soccer_rn[missing_assists]

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,team,position
4,ángel di maría,15.0,13.0,3,,13,10.17,132.23,manchester united,midfield
5,santiago cazorla,14.8,20.0,4,,20,9.97,,arsenal,midfield


In [9]:
# Keep only the rows whose assists value is present
has_assists = soccer_rn['assists'].notnull()
soccer_rn[has_assists]

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,team,position
0,sergio agüero,19.2,16.0,14,3.0,34,13.12,209.98,manchester city,forward
1,eden hazard,18.9,21.0,8,4.0,17,13.05,274.04,chelsea,midfield
2,alexis sánchez,17.6,,12,7.0,29,11.19,223.86,arsenal,forward
3,yaya touré,16.6,18.0,7,1.0,19,10.99,197.91,manchester city,midfield
6,david silva,14.3,15.0,6,2.0,11,10.35,155.26,manchester city,midfield
7,cesc fàbregas,14.0,20.0,2,14.0,10,10.47,209.49,chelsea,midfield
8,saido berahino,13.8,21.0,9,0.0,20,7.02,147.43,west brom,forward
9,steven gerrard,13.8,20.0,5,1.0,11,7.5,150.01,liverpool,midfield


In [10]:
# Replace every missing value in the frame with 0 (soccer_rn is modified in place)
soccer_rn.fillna(0, inplace=True)
soccer_rn

Unnamed: 0,player,salary,games,goals,assists,shots_on_target,points_per_game,points,team,position
0,sergio agüero,19.2,16.0,14,3.0,34,13.12,209.98,manchester city,forward
1,eden hazard,18.9,21.0,8,4.0,17,13.05,274.04,chelsea,midfield
2,alexis sánchez,17.6,0.0,12,7.0,29,11.19,223.86,arsenal,forward
3,yaya touré,16.6,18.0,7,1.0,19,10.99,197.91,manchester city,midfield
4,ángel di maría,15.0,13.0,3,0.0,13,10.17,132.23,manchester united,midfield
5,santiago cazorla,14.8,20.0,4,0.0,20,9.97,0.0,arsenal,midfield
6,david silva,14.3,15.0,6,2.0,11,10.35,155.26,manchester city,midfield
7,cesc fàbregas,14.0,20.0,2,14.0,10,10.47,209.49,chelsea,midfield
8,saido berahino,13.8,21.0,9,0.0,20,7.02,147.43,west brom,forward
9,steven gerrard,13.8,20.0,5,1.0,11,7.5,150.01,liverpool,midfield
