# Saving and Loading data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Persist a small integer array in NumPy's binary formats.
arr = np.arange(10)
np.save('some_array', arr)  # single array; NumPy appends the '.npy' extension
# np.savez only bundles arrays into an *uncompressed* .npz archive, so use
# np.savez_compressed to actually get the size benefit on large files.
# An array passed positionally is stored under the default key 'arr_0'.
np.savez_compressed('some_array_archive', arr)

In [3]:
# Read the two saved files back into memory.
arch = np.load('some_array_archive.npz')  # the .npz archive
arr2 = np.load('some_array.npy')          # the single-array .npy file


In [71]:
# Build a reproducibly shuffled copy of 0..9 to use as sample data.
np.random.seed(20)  # pin the global NumPy RNG so the shuffle below is repeatable
r = np.arange(0, 10)
np.random.shuffle(r)  # reorders r in place
r

array([7, 1, 8, 5, 0, 2, 6, 9, 4, 3])

In [43]:
# Round-trip the shuffled array through a plain-text file.
np.savetxt(fname='r.csv', X=r, delimiter=',')  # writes r out as text
d = np.loadtxt(fname='r.csv')                  # values come back as floats
d

array([7., 1., 8., 5., 0., 2., 6., 9., 4., 3.])

In [44]:
# Show the original and shuffled arrays, then the array reloaded from text.
for shown in (arr, r):
    print(shown)
d

[0 1 2 3 4 5 6 7 8 9]
[7 1 8 5 0 2 6 9 4 3]


array([7., 1., 8., 5., 0., 2., 6., 9., 4., 3.])

## Loading External data

In [40]:
# Load the salaries data set (pandas must already be imported as pd).
df = pd.read_csv('pydata/salaries.csv')  # read_csv splits on commas by default, so the file must be comma-delimited

# Quick exploration. A notebook cell renders only its final expression, so
# every line below is evaluated but only describe() is displayed; move a line
# into its own cell to see its result.
df.tail(30)                # last 30 rows
df.head(40)                # first 40 rows
df['salary'].dtype         # dtype of a single column
df['rank']                 # the 'rank' column -- df.rank is the DataFrame.rank() method, not the column
df.columns                 # column labels
df.size                    # total number of cells (rows * columns)
df.sum(numeric_only=True)  # column totals; numeric-only so text columns are not concatenated
df.describe()              # summary statistics of the numeric columns

Unnamed: 0,phd,service,salary
count,78.0,78.0,78.0
mean,19.705128,15.051282,108023.782051
std,12.498425,12.139768,28293.661022
min,1.0,0.0,57800.0
25%,10.25,5.25,88612.5
50%,18.5,14.5,104671.0
75%,27.75,20.75,126774.75
max,56.0,51.0,186960.0


In [41]:
# Pull a single column out of the frame as a Series
# (attribute access, df.service, is the equivalent shorthand).
df['service']

0     49
1      6
2     20
3     31
4     18
      ..
73    10
74     6
75    17
76    14
77    15
Name: service, Length: 78, dtype: int64

In [78]:
# Group the rows by academic rank and average every numeric column per group.
df_rank = df.groupby(['rank'])
# numeric_only=True skips the text columns (discipline, sex); since pandas 2.0
# the default is False and mean() raises TypeError on non-numeric columns.
df_rank.mean(numeric_only=True)

Unnamed: 0_level_0,phd,service,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AssocProf,15.076923,11.307692,91786.230769
AsstProf,5.052632,2.210526,81362.789474
Prof,27.065217,21.413043,123624.804348


In [80]:
# Average salary and years of service within each rank.
by_rank = df.groupby('rank')
by_rank[['salary', 'service']].mean()

Unnamed: 0_level_0,salary,service
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
AssocProf,91786.230769,11.307692
AsstProf,81362.789474,2.210526
Prof,123624.804348,21.413043


In [82]:
# Keep only the rows whose salary is at most 120,000 and summarise that subset.
at_most_120k = df['salary'] <= 120000  # boolean mask, inclusive of exactly 120000
df_sub = df[at_most_120k]
df_sub.describe()

Unnamed: 0,phd,service,salary
count,53.0,53.0,53.0
mean,15.415094,10.54717,92128.207547
std,10.90066,10.10439,16171.079437
min,1.0,0.0,57800.0
25%,7.0,3.0,77500.0
50%,12.0,8.0,92000.0
75%,23.0,17.0,104800.0
max,51.0,51.0,120000.0


In [85]:
# Summary statistics for the rows where sex is 'Female'.
is_female = df['sex'] == 'Female'
df_f = df[is_female]
df_f.describe()

Unnamed: 0,phd,service,salary
count,39.0,39.0,39.0
mean,16.512821,11.564103,101002.410256
std,9.784176,8.813252,25952.127317
min,2.0,0.0,62884.0
25%,10.0,4.0,77250.0
50%,17.0,10.0,103750.0
75%,23.5,17.5,117002.5
max,39.0,36.0,161101.0


In [88]:
# Summary statistics for the rows where sex is 'Male'.
is_male = df['sex'] == 'Male'
df_m = df[is_male]
df_m.describe()

Unnamed: 0,phd,service,salary
count,39.0,39.0,39.0
mean,22.897436,18.538462,115045.153846
std,14.138032,13.999711,29110.516397
min,1.0,0.0,57800.0
25%,12.0,6.5,92000.0
50%,21.0,19.0,107300.0
75%,31.5,25.0,132991.5
max,56.0,51.0,186960.0


In [96]:
# Row slice with a step: every third row from 10 up to (not including) 20.
# Evaluated but not displayed, since only the cell's last expression renders.
df[10:20:3]
# Label-based selection with .loc: index labels 0 through 11 (the end label
# is included) restricted to two named columns.
wanted_columns = ['rank', 'salary']
df.loc[0:11, wanted_columns]

Unnamed: 0,rank,salary
0,Prof,186960
1,Prof,93000
2,Prof,110515
3,Prof,131205
4,Prof,104800
5,Prof,122400
6,AssocProf,81285
7,Prof,126300
8,Prof,94350
9,Prof,57800


In [100]:
# .iloc selects by integer position: rows 10 up to (not including) 21,
# and the columns at positions 0 and 4.
row_positions = slice(10, 21)
column_positions = [0, 4]
df.iloc[row_positions, column_positions]

Unnamed: 0,rank,sex
10,Prof,Male
11,Prof,Male
12,AsstProf,Male
13,Prof,Male
14,Prof,Male
15,Prof,Male
16,AsstProf,Male
17,AsstProf,Male
18,Prof,Male
19,Prof,Male


### Sorting Data

In [103]:
# Order the rows by ascending years of service and show the first 60.
by_service = df.sort_values(by='service')
by_service.head(60)

Unnamed: 0,rank,discipline,phd,service,sex,salary
55,AsstProf,A,2,0,Female,72500
23,AsstProf,A,2,0,Male,85000
43,AsstProf,B,5,0,Female,77000
17,AsstProf,B,4,0,Male,92000
12,AsstProf,B,1,0,Male,88000
52,Prof,A,12,0,Female,105000
57,AsstProf,A,3,1,Female,72500
28,AsstProf,B,7,2,Male,91300
42,AsstProf,B,4,2,Female,80225
22,AsstProf,A,4,2,Male,73000
