## Pandas for Exploratory Data Analysis


In [1]:
import pandas as pd
import os
from pathlib import Path

### Reading Files, Selecting Columns, and Summarizing

In [2]:
# where are we? (the kernel's current working directory; the local paths below are built from it)
home = Path.cwd()

In [3]:
# what's the parent directory? (one level up from the working directory)
home.parent

PosixPath('/Users/austinlasseter/atelier/generalassembly/datdc35/04-pandas-eda')

In [4]:
# what's the data directory? It lives under the parent folder, next to the working directory.
# pathlib's `/` operator joins path segments.
datadir = home.parent / 'data'
datadir

PosixPath('/Users/austinlasseter/atelier/generalassembly/datdc35/04-pandas-eda/data')

In [48]:
# and what's in there? (os.listdir returns the entry names in arbitrary order)
os.listdir(datadir)

['Production.ProductSubcategory.csv',
 'drinks.csv',
 'imdb_1000.csv',
 'ufo.csv',
 'Sales.SalesOrderHeader.csv',
 'titanic.csv',
 'u.user',
 'Sales.SalesOrderDetail.csv',
 'Production.Product.csv']

In [6]:
# create the filepath to the local copy of 'u.user' inside the data directory
path2file = home.parent / 'data' / 'u.user'

In [7]:
# Note: we can read a file from the local computer or directly from a URL.
# This assignment replaces the local path, so the reads below fetch the file from GitHub.
path2file = r'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/u.user'

In [47]:
# what's wrong with this picture?
# (hint: by default read_table expects tab-separated fields and treats the first line as the header)
users = pd.read_table(path2file)
users.head()

  


Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101


In [9]:
# column names to apply when reading 'u.user' (the file has no header row of its own)
user_cols = [
    'user_id',
    'age',
    'gender',
    'occupation',
    'zip_code',
]

In [10]:
# a better way to read this in!
#   sep='|'       : the fields are pipe-delimited, not tab-delimited
#   header=None   : the file has no header row, so the first line is data
#   names=...     : supply the column names ourselves
#   dtype (zip)   : keep ZIP codes as text so leading zeros never depend on type inference
users = pd.read_table(
    path2file,
    sep='|',
    header=None,
    names=user_cols,
    dtype={'zip_code': str},
)
users.head()

  


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### examine the users data

In [11]:
type(users)             # the object returned by read_table is a DataFrame

pandas.core.frame.DataFrame

In [12]:
users.head()            # display the first 5 rows (the default for head())

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [13]:
users.index             # "the index" (aka the row labels); here a default RangeIndex

RangeIndex(start=0, stop=943, step=1)

In [14]:
users.columns           # column names, which are themselves stored as an Index

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code'], dtype='object')

In [15]:
users.dtypes            # data type of each column ('object' is how the string columns appear)

user_id        int64
age            int64
gender        object
occupation    object
zip_code      object
dtype: object

In [16]:
users.shape             # tuple of (number of rows, number of columns)

(943, 5)

In [17]:
# peek at the first three rows of the underlying numpy array
for row in users.values[:3]:
    print(row)

[1 24 'M' 'technician' '85711']
[2 53 'F' 'other' '94043']
[3 23 'M' 'writer' '32067']


In [18]:
users.info()            # concise summary: dtypes, non-null counts, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
gender        943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB


#### select a column

In [19]:
users['gender']         # bracket notation: select one column by its name

0      M
1      F
2      M
3      M
4      F
5      M
6      M
7      M
8      M
9      M
10     F
11     F
12     M
13     M
14     F
15     M
16     M
17     F
18     M
19     F
20     M
21     M
22     F
23     F
24     M
25     M
26     F
27     M
28     M
29     M
      ..
913    F
914    M
915    M
916    F
917    M
918    M
919    F
920    F
921    F
922    M
923    M
924    F
925    M
926    M
927    M
928    M
929    F
930    M
931    M
932    M
933    M
934    M
935    M
936    M
937    F
938    F
939    M
940    M
941    F
942    M
Name: gender, Length: 943, dtype: object

In [20]:
type(users['gender'])   # a single selected column is a Series

pandas.core.series.Series

In [21]:
users.gender            # attribute (dot) notation: same column as users['gender']

0      M
1      F
2      M
3      M
4      F
5      M
6      M
7      M
8      M
9      M
10     F
11     F
12     M
13     M
14     F
15     M
16     M
17     F
18     M
19     F
20     M
21     M
22     F
23     F
24     M
25     M
26     F
27     M
28     M
29     M
      ..
913    F
914    M
915    M
916    F
917    M
918    M
919    F
920    F
921    F
922    M
923    M
924    F
925    M
926    M
927    M
928    M
929    F
930    M
931    M
932    M
933    M
934    M
935    M
936    M
937    F
938    F
939    M
940    M
941    F
942    M
Name: gender, Length: 943, dtype: object

### summarize (describe) the data

In [22]:
users.describe()                    # summary statistics for the numeric columns only

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [23]:
users.describe(include=['object'])  # summarize the object columns (the list can name several dtypes)

Unnamed: 0,gender,occupation,zip_code
count,943,943,943
unique,2,21,795
top,M,student,55414
freq,670,196,9


In [24]:
users.describe(include='all')       # summarize every column; statistics that don't apply to a column are left empty (NaN)

Unnamed: 0,user_id,age,gender,occupation,zip_code
count,943.0,943.0,943,943,943.0
unique,,,2,21,795.0
top,,,M,student,55414.0
freq,,,670,196,9.0
mean,472.0,34.051962,,,
std,272.364951,12.19274,,,
min,1.0,7.0,,,
25%,236.5,25.0,,,
50%,472.0,31.0,,,
75%,707.5,43.0,,,


In [25]:
users.gender.describe()             # describe() also works on a single column (a Series)

count     943
unique      2
top         M
freq      670
Name: gender, dtype: object

In [26]:
users.age.mean()                    # compute a single statistic: the mean age

34.05196182396607

### count the number of occurrences of each value

In [27]:
users.gender.value_counts()     # frequency of each value, most common first; most useful for categorical variables

M    670
F    273
Name: gender, dtype: int64

In [28]:
users.age.value_counts()        # also works with numeric variables: how many users share each age

30    39
25    38
22    37
28    36
27    35
26    34
24    33
29    32
20    32
32    28
23    28
35    27
21    27
33    26
31    25
19    23
44    23
39    22
40    21
36    21
42    21
51    20
50    20
48    20
49    19
37    19
18    18
34    17
38    17
45    15
      ..
47    14
43    13
46    12
53    12
55    11
41    10
57     9
60     9
52     6
56     6
15     6
13     5
16     5
54     4
63     3
14     3
65     3
70     3
61     3
59     3
58     3
64     2
68     2
69     2
62     2
11     1
10     1
73     1
66     1
7      1
Name: age, Length: 61, dtype: int64

## Filtering and Sorting

### logical filtering: only show users with age < 20

In [29]:
young_bool = users.age < 20         # create a Series of booleans: True where the user is under 20...

In [30]:
users[young_bool]                   # ...and index with that Series to keep only the rows marked True

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
35,36,19,F,student,93117
51,52,18,F,student,55105
56,57,16,M,none,84010
66,67,17,M,student,60402
67,68,19,M,student,22904
100,101,15,M,student,05146
109,110,19,M,student,77840
141,142,13,M,other,48118
178,179,15,M,entertainment,20755


In [31]:
users[users.age < 20]               # same filter in a single step, without the intermediate variable

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
35,36,19,F,student,93117
51,52,18,F,student,55105
56,57,16,M,none,84010
66,67,17,M,student,60402
67,68,19,M,student,22904
100,101,15,M,student,05146
109,110,19,M,student,77840
141,142,13,M,other,48118
178,179,15,M,entertainment,20755


In [32]:
users[users.age < 20].occupation    # filter the rows first, then select one column from the result

29           student
35           student
51           student
56              none
66           student
67           student
100          student
109          student
141            other
178    entertainment
205          student
220          student
222          student
245          student
256          student
257          student
261          student
269          student
280          student
288             none
290          student
302          student
319          student
340          student
346          student
366          student
367          student
374    entertainment
392          student
396          student
           ...      
600           artist
608          student
617          student
618          student
619           writer
620          student
623          student
627             none
630          student
631          student
641          student
645          student
673          student
699          student
709          student
728          student
746          

In [33]:
users[users.age < 20].occupation.value_counts()     # occupation frequencies among the users under 20

student          64
other             4
none              3
writer            2
entertainment     2
artist            1
salesman          1
Name: occupation, dtype: int64

### logical filtering with multiple conditions

In [34]:
users[(users.age < 20) & (users.gender=='M')]       # ampersand for element-wise AND; wrap each condition in parentheses

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
56,57,16,M,none,84010
66,67,17,M,student,60402
67,68,19,M,student,22904
100,101,15,M,student,5146
109,110,19,M,student,77840
141,142,13,M,other,48118
178,179,15,M,entertainment,20755
220,221,19,M,student,20685
245,246,19,M,student,28734


In [35]:
users[(users.age < 20) | (users.age > 60)]          # pipe for element-wise OR; wrap each condition in parentheses

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
35,36,19,F,student,93117
51,52,18,F,student,55105
56,57,16,M,none,84010
66,67,17,M,student,60402
67,68,19,M,student,22904
100,101,15,M,student,05146
105,106,61,M,retired,55125
109,110,19,M,student,77840
141,142,13,M,other,48118


In [36]:
users[users.occupation.isin(['doctor', 'lawyer'])]  # isin() keeps rows matching any listed value (alternative to multiple OR conditions)

Unnamed: 0,user_id,age,gender,occupation,zip_code
9,10,53,M,lawyer,90703
124,125,30,M,lawyer,22202
125,126,28,F,lawyer,20015
137,138,46,M,doctor,53211
160,161,50,M,lawyer,55104
204,205,47,M,lawyer,6371
250,251,28,M,doctor,85032
298,299,29,M,doctor,63108
338,339,35,M,lawyer,37901
364,365,29,M,lawyer,20009


### sorting

In [37]:
users.sort_values('age')                   # sort a DataFrame by a single column (ascending by default)

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
470,471,10,M,student,77459
288,289,11,M,none,94619
879,880,13,M,student,83702
608,609,13,F,student,55106
141,142,13,M,other,48118
673,674,13,F,student,55337
627,628,13,M,none,94306
812,813,14,F,student,02136
205,206,14,F,student,53115


In [38]:
users.sort_values('age', ascending=False)  # ascending=False sorts in descending order instead

Unnamed: 0,user_id,age,gender,occupation,zip_code
480,481,73,M,retired,37771
802,803,70,M,administrator,78212
766,767,70,M,engineer,00000
859,860,70,F,retired,48322
584,585,69,M,librarian,98501
558,559,69,M,executive,10022
348,349,68,M,retired,61455
572,573,68,M,retired,48911
210,211,66,M,salesman,32605
650,651,65,M,retired,02903


In [39]:
users.sort_values(['occupation', 'age'])   # sort by occupation, then by age within each occupation

Unnamed: 0,user_id,age,gender,occupation,zip_code
117,118,21,M,administrator,90210
179,180,22,F,administrator,60202
281,282,22,M,administrator,20057
316,317,22,M,administrator,13210
438,439,23,F,administrator,20817
508,509,23,M,administrator,10011
393,394,25,M,administrator,96819
664,665,25,M,administrator,55412
725,726,25,F,administrator,80538
77,78,26,M,administrator,61801


## Using 'groupby'

In [40]:
# for each occupation in 'users', count the number of occurrences
users['occupation'].value_counts() # result is ordered by count, largest first

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
lawyer            12
salesman          12
none               9
doctor             7
homemaker          7
Name: occupation, dtype: int64

In [41]:
users.groupby('occupation').user_id.count() # same counts via groupby; sorted alphabetically by occupation

occupation
administrator     79
artist            28
doctor             7
educator          95
engineer          67
entertainment     18
executive         32
healthcare        16
homemaker          7
lawyer            12
librarian         51
marketing         26
none               9
other            105
programmer        66
retired           14
salesman          12
scientist         31
student          196
technician        27
writer            45
Name: user_id, dtype: int64

In [42]:
# for each occupation, calculate the mean age, then order the occupations from youngest to oldest
users.groupby('occupation').age.mean().sort_values()

occupation
student          22.081633
none             26.555556
entertainment    29.222222
artist           31.392857
homemaker        32.571429
programmer       33.121212
technician       33.148148
other            34.523810
scientist        35.548387
salesman         35.666667
writer           36.311111
engineer         36.388060
lawyer           36.750000
marketing        37.615385
executive        38.718750
administrator    38.746835
librarian        40.000000
healthcare       41.562500
educator         42.010526
doctor           43.571429
retired          63.071429
Name: age, dtype: float64

In [43]:
# for each occupation, calculate the minimum and maximum ages
# (passing a list of function names to agg() yields one result column per function)
users.groupby('occupation').age.agg(['min','max'])


Unnamed: 0_level_0,min,max
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


In [44]:
# for each combination of occupation and gender, calculate the mean age
# (the result is a Series indexed by (occupation, gender) pairs)
users.groupby(['occupation','gender']).age.mean()

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
educator       F         39.115385
               M         43.101449
engineer       F         29.500000
               M         36.600000
entertainment  F         31.000000
               M         29.000000
executive      F         44.000000
               M         38.172414
healthcare     F         39.818182
               M         45.400000
homemaker      F         34.166667
               M         23.000000
lawyer         F         39.500000
               M         36.200000
librarian      F         40.000000
               M         40.000000
marketing      F         37.200000
               M         37.875000
none           F         36.500000
               M         18.600000
other          F         35.472222
               M         34.028986
programmer     F         32.16666

### Other Commonly Used Features

In [45]:
# translate each gender code into a numeric flag stored in a new column: 'M' -> 1, 'F' -> 0
users['is_male'] = users['gender'].map({'M': 1, 'F': 0})


### Other Less Commonly Used Features

#### detecting duplicate rows


In [46]:
# Only the final expression's value is rendered as the cell output; the earlier
# lines are evaluated and discarded, and none of them modifies `users`.
users.duplicated()          # boolean Series: True if a row is identical to a previous row
users.duplicated().sum()    # count of duplicates (each True counts as 1)
users[users.duplicated()]   # only show the duplicate rows
users.drop_duplicates()     # returns a new DataFrame with the duplicate rows dropped
users.age.duplicated()      # check a single column for repeated values
users.duplicated(['age', 'gender', 'zip_code']).sum()   # count rows whose (age, gender, zip_code) combination repeats an earlier row


13