# ASDRP Data Science Introduction

In [1]:
#Import Python Libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Pandas is a python package that deals mostly with :

    Series (1d homogeneous array)
    DataFrame (2d labeled heterogeneous array)
    Panel (general 3d array)



In [3]:
# Example of creating Pandas series :
myseries = pd.Series( np.random.randn(5) )
print(myseries)

0   -0.953803
1    0.668505
2    0.454627
3   -0.000543
4    0.267883
dtype: float64


We did not pass any index, so by default, it assigned the indexes ranging from 0 to len(data)-1

In [4]:
# View index values
print(myseries.index)

RangeIndex(start=0, stop=5, step=1)


In [5]:
# Creating Pandas series with index:
iseries = pd.Series( np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'] )
print(iseries)

a   -0.389120
b    1.349371
c    1.000169
d   -0.479201
e   -0.644107
dtype: float64


In [6]:
# View index values
print(iseries.index)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


## Reading Salaries data to DataFrame

In [7]:
df = pd.read_csv("https://raw.githubusercontent.com/philmui/algorithmic-bias-2019/master/data/salaries/Salaries.csv");

In [9]:
df.dtypes

rank          object
discipline    object
phd            int64
service        int64
sex           object
salary         int64
dtype: object

In [10]:
df.count()

rank          78
discipline    78
phd           78
service       78
sex           78
salary        78
dtype: int64

In [11]:
df.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800


In [12]:
df.describe()

Unnamed: 0,phd,service,salary
count,78.0,78.0,78.0
mean,19.705128,15.051282,108023.782051
std,12.498425,12.139768,28293.661022
min,1.0,0.0,57800.0
25%,10.25,5.25,88612.5
50%,18.5,14.5,104671.0
75%,27.75,20.75,126774.75
max,56.0,51.0,186960.0


## GroupBy

In [13]:
df_rank = df.groupby(["rank"])

In [14]:
df_rank.mean()

Unnamed: 0_level_0,phd,service,salary
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AssocProf,15.076923,11.307692,91786.230769
AsstProf,5.052632,2.210526,81362.789474
Prof,27.065217,21.413043,123624.804348


In [17]:
#Calculate mean salary for each professor rank:
df.groupby('rank')[['salary']].mean()

Unnamed: 0_level_0,salary
rank,Unnamed: 1_level_1
AssocProf,91786.230769
AsstProf,81362.789474
Prof,123624.804348


## Filtering

In [19]:
#Calculate mean salary for each professor rank:
df_sub = df[ df['salary'] > 120000 ]
df_sub.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
3,Prof,A,40,31,Male,131205
5,Prof,A,20,20,Male,122400
7,Prof,A,18,18,Male,126300
10,Prof,B,39,33,Male,128250


In [20]:
#Select only those rows that contain female professors:
df_f = df[ df['sex'] == 'Female' ]
df_f.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
39,Prof,B,18,18,Female,129000
40,Prof,A,39,36,Female,137000
41,AssocProf,A,13,8,Female,74830
42,AsstProf,B,4,2,Female,80225
43,AsstProf,B,5,0,Female,77000


## Slicing

In [21]:
#Select column salary:
df['salary']

0     186960
1      93000
2     110515
3     131205
4     104800
5     122400
6      81285
7     126300
8      94350
9      57800
10    128250
11    134778
12     88000
13    162200
14    153750
15    150480
16     75044
17     92000
18    107300
19    150500
20     92000
21    103106
22     73000
23     85000
24     91100
25     99418
26    148750
27    155865
28     91300
29    123683
       ...  
48    111512
49    122960
50     97032
51    127512
52    105000
53     73500
54     62884
55     72500
56     77500
57     72500
58    144651
59    103994
60     92000
61    103750
62    109650
63     91000
64     73300
65    117555
66     63100
67     90450
68     77500
69    116450
70     78500
71     71065
72    161101
73    105450
74    104542
75    124312
76    109954
77    109646
Name: salary, Length: 78, dtype: int64

In [22]:
#Select column salary:
df[['rank','salary']]

Unnamed: 0,rank,salary
0,Prof,186960
1,Prof,93000
2,Prof,110515
3,Prof,131205
4,Prof,104800
5,Prof,122400
6,AssocProf,81285
7,Prof,126300
8,Prof,94350
9,Prof,57800


## Row Selection

In [23]:
#Select rows by their position:
df[10:20]

Unnamed: 0,rank,discipline,phd,service,sex,salary
10,Prof,B,39,33,Male,128250
11,Prof,B,23,23,Male,134778
12,AsstProf,B,1,0,Male,88000
13,Prof,B,35,33,Male,162200
14,Prof,B,25,19,Male,153750
15,Prof,B,17,3,Male,150480
16,AsstProf,B,8,3,Male,75044
17,AsstProf,B,4,0,Male,92000
18,Prof,A,19,7,Male,107300
19,Prof,A,29,27,Male,150500


## Sorting

In [28]:
# Create a new data frame from the original sorted by the column Salary
    df_sorted = df.sort_values( by ='salary', ascending=1)
df_sorted.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
9,Prof,A,51,51,Male,57800
54,AssocProf,A,25,22,Female,62884
66,AsstProf,A,7,6,Female,63100
71,AssocProf,B,12,9,Female,71065
57,AsstProf,A,3,1,Female,72500


In [31]:
df_sorted = df.sort_values( by =['service', 'salary'], ascending = [True, False])
df_sorted.head(10)

Unnamed: 0,rank,discipline,phd,service,sex,salary
52,Prof,A,12,0,Female,105000
17,AsstProf,B,4,0,Male,92000
12,AsstProf,B,1,0,Male,88000
23,AsstProf,A,2,0,Male,85000
43,AsstProf,B,5,0,Female,77000
55,AsstProf,A,2,0,Female,72500
57,AsstProf,A,3,1,Female,72500
28,AsstProf,B,7,2,Male,91300
42,AsstProf,B,4,2,Female,80225
68,AsstProf,A,4,2,Female,77500


## Introduction to Exercises

In [32]:
# Create a Series from dictionary
data = {'pi': 3.1415, 'e': 2.71828}  # dictionary
print(data)
s3 = pd.Series ( data )
print(s3)

{'pi': 3.1415, 'e': 2.71828}
pi    3.14150
e     2.71828
dtype: float64


In [33]:
# reordering the elements
s4 = pd.Series ( data, index = ['e', 'pi', 'tau'])
print(s4)

e      2.71828
pi     3.14150
tau        NaN
dtype: float64


NAN (non a number) - is used to specify a missing value in Pandas.

In [39]:
# Creating a Pandas Series object from a single number:
sone = pd.Series( 1, index = range(10), name='Ones')
print(sone)

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: Ones, dtype: int64


In [41]:
# Many ways to "slice" Pandas series (series have zero-based index by default):
print(myseries)
myseries[3]  # returns 4th element

0    1.663701
1   -0.350928
2    0.042151
3    0.115110
4   -1.561674
dtype: float64


0.11511047759415725

In [42]:
myseries[:2] # First 2 elements

0    1.663701
1   -0.350928
dtype: float64

In [43]:
print( s1[ [2,1,0]])  # Elements out of order

2   -1.011741
1   -0.203793
0    1.069600
dtype: float64


In [44]:
# Series can be used as ndarray:
print("Median:" , myseries.median())

Median: 0.0421512295560481


In [45]:
myseries[myseries > 0]

0    1.663701
2    0.042151
3    0.115110
dtype: float64

In [46]:
# vector operations:
np.exp(myseries)

0    5.278810
1    0.704035
2    1.043052
3    1.121997
4    0.209784
dtype: float64

## Exercises