# Basic numpy and pandas library usage

First, import the libraries.

In [1]:
import numpy as np
import pandas as pd

Calculate the mean of a numeric array using NumPy.

In [2]:
# Integers 0 through 10 (the stop value 11 is excluded)
a = np.arange(11)
# Arithmetic mean of all elements
a.mean()

5.0

## Import CSV file to test pandas

In [3]:
# Path of the CSV file to load (relative to the notebook's working directory)
file = 'Cartwheeldata.csv'
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file)
# Check the type of the loaded object (not of the file itself)
type(df)

pandas.core.frame.DataFrame

### View CSV file contents and structure
Show the first rows of the DataFrame to preview the data loaded from the file.

In [4]:
# Preview the first ten rows of the DataFrame
df.head(n=10)

Unnamed: 0,ID,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
0,1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
1,2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
2,3,33,F,1,Y,1,66.0,64.0,85,Y,1,7
3,4,39,F,1,N,0,64.0,63.0,87,Y,1,10
4,5,27,M,2,N,0,73.0,75.0,72,N,0,4
5,6,24,M,2,N,0,75.0,71.0,81,N,0,3
6,7,28,M,2,N,0,75.0,76.0,107,Y,1,10
7,8,22,F,1,N,0,65.0,62.0,98,Y,1,9
8,9,29,M,2,Y,1,74.0,73.0,106,N,0,5
9,10,33,F,1,Y,1,63.0,60.0,65,Y,1,8


In [5]:
# Preview the last four rows of the DataFrame
df.tail(n=4)

Unnamed: 0,ID,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
21,22,29,M,2,N,0,71.0,70.0,101,Y,1,8
22,23,25,M,2,N,0,70.0,68.0,82,Y,1,4
23,24,26,M,2,N,0,69.0,71.0,63,Y,1,5
24,25,23,F,1,Y,1,65.0,63.0,67,N,0,3


Before displaying the whole DataFrame, check its number of dimensions and its number of rows and columns.

In [6]:
# Number of axes of the DataFrame (rows and columns)
df.ndim

2

In [7]:
# Size of the DataFrame as a (rows, columns) tuple
df.shape

(25, 12)

Display the whole DataFrame (avoid this when the DataFrame has many rows).

In [8]:
# Evaluating the bare name as the last expression of the cell displays the DataFrame
df

Unnamed: 0,ID,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
0,1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
1,2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
2,3,33,F,1,Y,1,66.0,64.0,85,Y,1,7
3,4,39,F,1,N,0,64.0,63.0,87,Y,1,10
4,5,27,M,2,N,0,73.0,75.0,72,N,0,4
5,6,24,M,2,N,0,75.0,71.0,81,N,0,3
6,7,28,M,2,N,0,75.0,76.0,107,Y,1,10
7,8,22,F,1,N,0,65.0,62.0,98,Y,1,9
8,9,29,M,2,Y,1,74.0,73.0,106,N,0,5
9,10,33,F,1,Y,1,63.0,60.0,65,Y,1,8


Show dataframe columns.

In [9]:
# Column labels of the DataFrame
df.columns

Index(['ID', 'Age', 'Gender', 'GenderGroup', 'Glasses', 'GlassesGroup',
       'Height', 'Wingspan', 'CWDistance', 'Complete', 'CompleteGroup',
       'Score'],
      dtype='object')

In [10]:
# Data type of each column
df.dtypes

ID                 int64
Age                int64
Gender            object
GenderGroup        int64
Glasses           object
GlassesGroup       int64
Height           float64
Wingspan         float64
CWDistance         int64
Complete          object
CompleteGroup      int64
Score              int64
dtype: object

### Summary of the DataFrame
Use describe to get a summary of the DataFrame.

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) per column;
# only the numeric columns appear in the result
df.describe()

Unnamed: 0,ID,Age,GenderGroup,GlassesGroup,Height,Wingspan,CWDistance,CompleteGroup,Score
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,13.0,28.24,1.52,0.56,67.65,66.26,82.48,0.76,6.4
std,7.359801,6.989754,0.509902,0.506623,4.431187,5.492647,15.058552,0.43589,2.533114
min,1.0,22.0,1.0,0.0,61.5,57.5,63.0,0.0,2.0
25%,7.0,24.0,1.0,0.0,64.0,62.0,70.0,1.0,4.0
50%,13.0,26.0,2.0,1.0,68.0,66.0,81.0,1.0,6.0
75%,19.0,29.0,2.0,1.0,71.0,71.0,92.0,1.0,8.0
max,25.0,56.0,2.0,1.0,75.0,76.0,115.0,1.0,10.0


### Extract Subsets of DataFrame

Extract subsets of rows and columns using `loc` with row labels and column names.

In [12]:
# Select every row of the 'Age' and 'Height' columns by label.
# The column labels are passed as a list rather than a tuple: pandas treats
# tuples as MultiIndex keys, so a list is the unambiguous way to request
# several columns at once.
df.loc[:, ['Age', 'Height']]

Unnamed: 0,Age,Height
0,56,62.0
1,26,62.0
2,33,66.0
3,39,64.0
4,27,73.0
5,24,75.0
6,28,75.0
7,22,65.0
8,29,74.0
9,33,63.0


Calculate the mean for some columns.

In [13]:
# Column-wise mean of the 'Age' and 'Height' columns.
# The labels are passed as a list because pandas treats tuples as
# MultiIndex keys when indexing.
df.loc[:, ['Age', 'Height']].mean()

Age       28.24
Height    67.65
dtype: float64

In [14]:
# Rows labelled 1 through 4 (label slices with .loc include both endpoints)
# restricted to three columns. The column labels are passed as a list because
# pandas treats tuples as MultiIndex keys when indexing.
df.loc[1:4, ['Age', 'Height', 'Wingspan']]

Unnamed: 0,Age,Height,Wingspan
1,26,62.0,60.0
2,33,66.0,64.0
3,39,64.0,63.0
4,27,73.0,75.0


Use Python-style slices with `iloc` to select rows and columns of the DataFrame by integer position.

In [15]:
# Positional selection: rows at positions 1-4 (the stop index 5 is excluded)
# and every column from position 3 onward
df.iloc[1:5, 3:]

Unnamed: 0,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
1,1,Y,1,62.0,60.0,70,Y,1,8
2,1,Y,1,66.0,64.0,85,Y,1,7
3,1,N,0,64.0,63.0,87,Y,1,10
4,2,N,0,73.0,75.0,72,N,0,4


In [16]:
# A full slice on both axes selects every row and every column, i.e. the whole frame.
# NOTE(review): this is not equivalent to df.copy() -- the result of a full slice
# may share its data with df; call df.copy() when an independent copy is needed.
df.iloc[:,:]

Unnamed: 0,ID,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
0,1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
1,2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
2,3,33,F,1,Y,1,66.0,64.0,85,Y,1,7
3,4,39,F,1,N,0,64.0,63.0,87,Y,1,10
4,5,27,M,2,N,0,73.0,75.0,72,N,0,4
5,6,24,M,2,N,0,75.0,71.0,81,N,0,3
6,7,28,M,2,N,0,75.0,76.0,107,Y,1,10
7,8,22,F,1,N,0,65.0,62.0,98,Y,1,9
8,9,29,M,2,Y,1,74.0,73.0,106,N,0,5
9,10,33,F,1,Y,1,63.0,60.0,65,Y,1,8


### Check for unique values

In [17]:
# Distinct values found in the 'Gender' column
df['Gender'].unique()

array(['F', 'M'], dtype=object)

In [18]:
# Distinct values found in the 'GenderGroup' column
df['GenderGroup'].unique()

array([1, 2])

### Grouping

In [19]:
# Number of rows for each (Gender, GenderGroup) combination
df.groupby(by=['Gender', 'GenderGroup']).size()

Gender  GenderGroup
F       1              12
M       2              13
dtype: int64

In [20]:
# Number of rows for each (Age, Height) combination
df.groupby(by=['Age', 'Height']).size()

Age  Height
22   65.00     1
23   61.50     1
     65.00     1
     69.00     1
     70.00     1
24   68.00     1
     75.00     1
25   65.00     1
     70.00     1
26   61.50     1
     62.00     1
     69.00     1
     71.00     1
27   66.00     1
     73.00     1
28   62.75     1
     75.00     1
29   71.00     1
     74.00     1
30   69.50     1
31   73.00     1
33   63.00     1
     66.00     1
39   64.00     1
56   62.00     1
dtype: int64