### Saving and Loading Data

In [2]:
import numpy as np
import pandas as pd

In [6]:
# Sample data: the integers 0..9.
arr = np.arange(10)

# Persist the array in NumPy's binary formats.
# np.save writes a single array and appends the '.npy' extension.
np.save(file='my_array', arr=arr)
# np.savez bundles arrays into a '.npz' zip archive; arrays passed
# positionally are stored under the keys 'arr_0', 'arr_1', ...
np.savez('my_array_archive', arr)

In [12]:
# load back in
# A .npy file holds one array, so np.load returns the ndarray directly.
arr2 = np.load('my_array.npy')

# A .npz file is an archive: np.load returns a lazy NpzFile that keeps the
# underlying file open. Use it as a context manager so the handle is closed,
# and pull the array out while the archive is still open. Arrays that were
# passed positionally to np.savez are stored under 'arr_0', 'arr_1', ...
with np.load('my_array_archive.npz') as arch:
    arch_arr = arch['arr_0']

# Both round-trips should give back the same data.
assert np.array_equal(arr2, arch_arr)

# Only the last expression of a cell is rendered: show the (now closed)
# NpzFile object to illustrate what np.load returns for an archive.
arch

<numpy.lib.npyio.NpzFile at 0x1ae1e2272b0>

In [19]:
# Build some shuffled data that we can write out as delimited text.
# Fixing the global seed first makes the shuffle below reproducible.
np.random.seed(seed=20)
r = np.arange(10)
np.random.shuffle(r)  # shuffles r in place
r

array([7, 1, 8, 5, 0, 2, 6, 9, 4, 3])

In [21]:
# Write r out as delimited text (a csv file).
# savetxt's default format is '%.18e', so the integers are stored as floats
# in scientific notation.
np.savetxt('r.csv', r, delimiter=',')

# Read the file back using the same delimiter it was written with.
# loadtxt splits on whitespace by default, which only happens to work here
# because r is 1-D and savetxt therefore writes one value per line.
d = np.loadtxt('r.csv', delimiter=',')
d  # loadtxt returns float64 unless a dtype is given

array([7., 1., 8., 5., 0., 2., 6., 9., 4., 3.])

### Loading Existing Data

In [40]:
# we will load 'salaries.csv' into a DataFrame
df = pd.read_csv('data/salaries.csv')

# NOTE: a notebook cell only renders its LAST expression. The expressions
# below are evaluated but not shown; move one to its own cell to inspect it.
df.describe()              # summary statistics of the numeric columns
df.columns                 # the column labels
df.size                    # total number of cells (rows * columns), not the row count
df.sum(numeric_only=True)  # column totals; min(), max(), count() etc. work the same way
# Use bracket access for the 'rank' column: df.rank is the DataFrame.rank()
# *method*, which shadows attribute access to a column of the same name.
df['rank']
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [46]:
# grouping: split the rows by every (rank, sex) combination
df_rank = df.groupby(['rank', 'sex'])
# we then call statistical methods on our group (mean(), max(), min() etc.)
# numeric_only=True restricts the aggregate to the numeric columns;
# pandas >= 2.0 raises a TypeError when mean() meets a text column instead of
# silently dropping it as older versions did.
df_rank.mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,phd,service,salary
rank,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AssocProf,Female,15.5,11.5,88512.8
AssocProf,Male,13.666667,10.666667,102697.666667
AsstProf,Female,5.636364,2.545455,78049.909091
AsstProf,Male,4.25,1.75,85918.0
Prof,Female,23.722222,17.111111,121967.611111
Prof,Male,29.214286,24.178571,124690.142857


In [50]:
# Combine grouping with column selection.
l = ['salary', 'service']  # the columns we want to aggregate
by_rank = df.groupby('rank')  # one group per rank
by_rank[l].mean()  # mean of just the selected columns, per rank

Unnamed: 0_level_0,salary,service
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
AssocProf,91786.230769,11.307692
AsstProf,81362.789474,2.210526
Prof,123624.804348,21.413043


In [56]:
# challenge: calculate the mean salary for each rank
# where salary <=120000
# First keep only the rows whose salary is at most 120000 ...
df_sub = df[df['salary'] <= 120000]
# ... then average per rank. numeric_only=True skips the text columns, which
# would make mean() raise a TypeError on pandas >= 2.0. The 'salary' column of
# the result is the answer; the other numeric columns are shown alongside it.
df_sub.groupby(['rank']).mean(numeric_only=True)


Unnamed: 0_level_0,phd,service,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AssocProf,15.076923,11.307692,91786.230769
AsstProf,5.052632,2.210526,81362.789474
Prof,25.0,17.619048,102080.047619
