In [1]:
import os 
import pandas as pd
import numpy as np


In [2]:
csvPathFile = os.path.join(os.getcwd(), 'roster.csv')
print(csvPathFile)

/Users/milesstevens/Documents/GitHub/helloworld/roster.csv


### Read CSV to DataFrame

In [3]:
roster = pd.read_csv(csvPathFile)
print(type(roster))

<class 'pandas.core.frame.DataFrame'>


#### Viewing the data

In [4]:
roster.head()

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V


In [5]:
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


### Modifying the Data

In [6]:
d = {'name': ['Wally']}
tmp_df = pd.DataFrame(data=d)
tmp_df

Unnamed: 0,name
0,Wally


In [7]:
d = {'name': ['Wally']}
tmp_df = pd.DataFrame(data=d)
roster = pd.concat([roster, tmp_df], ignore_index=True)
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


#### Assign Grades


In [8]:
roster['grade'] = 70
roster

Unnamed: 0,name,grade
0,Joe,70
1,Jihuan,70
2,Ali,70
3,Frances,70
4,Daniela V,70
5,Mostafa,70
6,Daniela P,70
7,Cesar,70
8,Jarrod,70
9,Austin,70


In [9]:
import random
roster['grade'] = random.randint(0,100)
roster

Unnamed: 0,name,grade
0,Joe,79
1,Jihuan,79
2,Ali,79
3,Frances,79
4,Daniela V,79
5,Mostafa,79
6,Daniela P,79
7,Cesar,79
8,Jarrod,79
9,Austin,79


In [10]:
np.random.seed(1)
np.random.randint(0,100, size =len(roster))

array([37, 12, 72,  9, 75,  5, 79, 64, 16,  1, 76, 71,  6, 25, 50, 20, 18,
       84, 11, 28, 29, 14, 50])

In [11]:
np.random.seed(1)
roster['grade']= np.random.randint(0,100, size =len(roster))
roster

Unnamed: 0,name,grade
0,Joe,37
1,Jihuan,12
2,Ali,72
3,Frances,9
4,Daniela V,75
5,Mostafa,5
6,Daniela P,79
7,Cesar,64
8,Jarrod,16
9,Austin,1


.loc can be used with a boolean array (i.e. of 1s and 0s)

In [12]:
roster['name'] == "Daniela P"

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
Name: name, dtype: bool

In [13]:
roster.loc[roster['name'] == 'Daniela P', 'grade'] = 100
roster

Unnamed: 0,name,grade
0,Joe,37
1,Jihuan,12
2,Ali,72
3,Frances,9
4,Daniela V,75
5,Mostafa,5
6,Daniela P,100
7,Cesar,64
8,Jarrod,16
9,Austin,1


### Check the class average

Each column pandas dataframe is a series object -- which have dozens of built-in methods

In [14]:
roster['grade'].mean()

37.95652173913044

In [15]:
roster.loc[roster['grade'] < 50, 'grade']= roster['grade'] + 40
roster

Unnamed: 0,name,grade
0,Joe,77
1,Jihuan,52
2,Ali,72
3,Frances,49
4,Daniela V,75
5,Mostafa,45
6,Daniela P,100
7,Cesar,64
8,Jarrod,56
9,Austin,41


In [16]:
roster['grade'].mean()

62.30434782608695

In [17]:
roster['grade'] = roster['grade'] + 13

In [18]:
roster['grade'].mean()

75.30434782608695

### Write to CSV

In [19]:
outFilePath = os.path.join(os.getcwd(), 'roster_pandas.csv')
print(outFilePath)

/Users/milesstevens/Documents/GitHub/helloworld/roster_pandas.csv


In [20]:
roster.to_csv(outFilePath, index=False)

### More Aggregation and Manipulation

In [21]:
np.random.seed(2)
np.random.choice(['red', 'blue'], size=len(roster))

array(['red', 'blue', 'blue', 'red', 'red', 'blue', 'red', 'blue', 'red',
       'blue', 'red', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue',
       'blue', 'red', 'red', 'red', 'red', 'blue'], dtype='<U4')

In [22]:
roster['group'] = np.random.choice(['red','blue'], size = len(roster))
roster

Unnamed: 0,name,grade,group
0,Joe,90,blue
1,Jihuan,65,blue
2,Ali,85,red
3,Frances,62,red
4,Daniela V,88,red
5,Mostafa,58,blue
6,Daniela P,113,blue
7,Cesar,77,blue
8,Jarrod,69,red
9,Austin,54,red


In [23]:
group_means = roster.groupby(by=['group']).mean()
group_means

Unnamed: 0_level_0,grade
group,Unnamed: 1_level_1
blue,76.5
red,74.0


In [24]:
group_means.rename(columns={'grade': 'group_avg'}, inplace =True)
group_means

Unnamed: 0_level_0,group_avg
group,Unnamed: 1_level_1
blue,76.5
red,74.0


### Merging DataFrames

In [25]:
print(roster.shape)
print(group_means.shape)

(23, 3)
(2, 1)


In [26]:
roster = roster.merge(group_means, on=['group'])
roster.shape

(23, 4)

In [27]:
roster

Unnamed: 0,name,grade,group,group_avg
0,Joe,90,blue,76.5
1,Jihuan,65,blue,76.5
2,Mostafa,58,blue,76.5
3,Daniela P,113,blue,76.5
4,Cesar,77,blue,76.5
5,Jack,89,blue,76.5
6,Miles,78,blue,76.5
7,Hyeyun,63,blue,76.5
8,Volodymyr,73,blue,76.5
9,Joshua,82,blue,76.5


### Creating new columns from custom functions

In [28]:
def is_top50(col):
    return col > col.median()


In [29]:
roster['top50'] = roster[['grade']].apply(is_top50)

In [30]:
type(roster[['grade']])

pandas.core.frame.DataFrame

In [31]:
roster

Unnamed: 0,name,grade,group,group_avg,top50
0,Joe,90,blue,76.5,True
1,Jihuan,65,blue,76.5,False
2,Mostafa,58,blue,76.5,False
3,Daniela P,113,blue,76.5,True
4,Cesar,77,blue,76.5,True
5,Jack,89,blue,76.5,True
6,Miles,78,blue,76.5,True
7,Hyeyun,63,blue,76.5,False
8,Volodymyr,73,blue,76.5,False
9,Joshua,82,blue,76.5,True


### Creating new columns from custom functions

By group

In [49]:
roster['top50_group'] = roster.groupby(by=['group'])[['grade']].apply(is_top50)
roster

Unnamed: 0,name,grade,group,group_avg,top50,top50_group
0,Joe,90,blue,76.5,True,True
1,Jihuan,65,blue,76.5,False,False
2,Mostafa,58,blue,76.5,False,False
3,Daniela P,113,blue,76.5,True,True
4,Cesar,77,blue,76.5,True,True
5,Jack,89,blue,76.5,True,True
6,Miles,78,blue,76.5,True,True
7,Hyeyun,63,blue,76.5,False,False
8,Volodymyr,73,blue,76.5,False,False
9,Joshua,82,blue,76.5,True,True


## Apply

Apply can be used to invoke a function on

1. each value of a Series object
2. each column or index in a DataFrame object
3. each DataFrame in a GroupBy object

In [34]:
def print_arg(x):
    print(x)
    
def print_type(x):
    print(type(x))

### pandas.Series.apply

In [32]:
grade_series = roster['grade']
print(type(grade_series))


<class 'pandas.core.series.Series'>


In [33]:
grade_series = roster['grade']
print(type(grade_series))
grade_series

<class 'pandas.core.series.Series'>


0      90
1      65
2      58
3     113
4      77
5      89
6      78
7      63
8      73
9      82
10     67
11     63
12     85
13     62
14     88
15     69
16     54
17     84
18     59
19     71
20     97
21     64
22     81
Name: grade, dtype: int64

In [36]:
grade_series.apply(print_arg)

90
65
58
113
77
89
78
63
73
82
67
63
85
62
88
69
54
84
59
71
97
64
81


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
Name: grade, dtype: object

In [37]:
grade_series.apply(print_type)

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
Name: grade, dtype: object

## pandas.DataFrame.apply

Apply a function to each row of the DataFrame (axis=1) or apply a function to each column of the data (axis).

In [38]:
grade_df = roster[['grade']]
print(type(grade_df))

<class 'pandas.core.frame.DataFrame'>


In [39]:
grade_df.apply(print_arg, axis=0)

0      90
1      65
2      58
3     113
4      77
5      89
6      78
7      63
8      73
9      82
10     67
11     63
12     85
13     62
14     88
15     69
16     54
17     84
18     59
19     71
20     97
21     64
22     81
Name: grade, dtype: int64


grade    None
dtype: object

In [40]:
grade_df.apply(print_type, axis=0)

<class 'pandas.core.series.Series'>


grade    None
dtype: object

In [41]:
grade_df.apply(print_type, axis=1)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
dtype: object

In [42]:
grade_df.apply(print_arg, axis=1)

grade    90
Name: 0, dtype: int64
grade    65
Name: 1, dtype: int64
grade    58
Name: 2, dtype: int64
grade    113
Name: 3, dtype: int64
grade    77
Name: 4, dtype: int64
grade    89
Name: 5, dtype: int64
grade    78
Name: 6, dtype: int64
grade    63
Name: 7, dtype: int64
grade    73
Name: 8, dtype: int64
grade    82
Name: 9, dtype: int64
grade    67
Name: 10, dtype: int64
grade    63
Name: 11, dtype: int64
grade    85
Name: 12, dtype: int64
grade    62
Name: 13, dtype: int64
grade    88
Name: 14, dtype: int64
grade    69
Name: 15, dtype: int64
grade    54
Name: 16, dtype: int64
grade    84
Name: 17, dtype: int64
grade    59
Name: 18, dtype: int64
grade    71
Name: 19, dtype: int64
grade    97
Name: 20, dtype: int64
grade    64
Name: 21, dtype: int64
grade    81
Name: 22, dtype: int64


0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
dtype: object

## DataFrameGroupBy

In [43]:
groups = roster.groupby(by=['group'])
print(type(groups))
groups

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ffcad8c95e0>

In [44]:
roster

Unnamed: 0,name,grade,group,group_avg,top50
0,Joe,90,blue,76.5,True
1,Jihuan,65,blue,76.5,False
2,Mostafa,58,blue,76.5,False
3,Daniela P,113,blue,76.5,True
4,Cesar,77,blue,76.5,True
5,Jack,89,blue,76.5,True
6,Miles,78,blue,76.5,True
7,Hyeyun,63,blue,76.5,False
8,Volodymyr,73,blue,76.5,False
9,Joshua,82,blue,76.5,True


In [45]:
groups.apply(print_arg)

         name  grade group  group_avg  top50
0         Joe     90  blue       76.5   True
1      Jihuan     65  blue       76.5  False
2     Mostafa     58  blue       76.5  False
3   Daniela P    113  blue       76.5   True
4       Cesar     77  blue       76.5   True
5        Jack     89  blue       76.5   True
6       Miles     78  blue       76.5   True
7      Hyeyun     63  blue       76.5  False
8   Volodymyr     73  blue       76.5  False
9      Joshua     82  blue       76.5   True
10      David     67  blue       76.5  False
11      Wally     63  blue       76.5  False
         name  grade group  group_avg  top50
12        Ali     85   red       74.0   True
13    Frances     62   red       74.0  False
14  Daniela V     88   red       74.0   True
15     Jarrod     69   red       74.0  False
16     Austin     54   red       74.0  False
17        Ala     84   red       74.0   True
18     Sergii     59   red       74.0  False
19      Yijia     71   red       74.0  False
20   Hsin-

In [46]:
groups.apply(print_type)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
