## Exploratory Data Analysis (EDA)

In [1]:
import pandas as pd
import os
from pathlib import Path
import numpy as np

#### Read in the `users` dataset

In [2]:
# wher are we?
home = Path.cwd()

In [4]:
# where is the data?
data_dir = Path.joinpath(home.parent, 'data')
data_dir

PosixPath('/Users/austinlasseter/atelier/generalassembly/python-2-dc/09-pandas-eda-1/data')

In [5]:
# what's in there?
os.listdir(data_dir)

['Production.ProductSubcategory.csv',
 'drinks.csv',
 'imdb_1000.csv',
 'ufo.csv',
 'Sales.SalesOrderHeader.csv',
 'titanic.csv',
 'u.user',
 'Sales.SalesOrderDetail.csv',
 'Production.Product.csv']

In [6]:
# create path to our file
path2file = Path.joinpath(data_dir, 'u.user')

In [14]:
# why is this wrong?
users = pd.read_csv(path2file, sep='|')
# user = pd.read_table(path2file)

users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Preliminary Examination Steps

In [15]:
# what's my type?
type(users)

pandas.core.frame.DataFrame

In [16]:
# what's the index?
users.index

RangeIndex(start=0, stop=943, step=1)

In [17]:
# what are the columns?
users.columns

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code'], dtype='object')

In [18]:
# what is the data type of each column?
users.dtypes

user_id        int64
age            int64
gender        object
occupation    object
zip_code      object
dtype: object

In [21]:
# what is a Pandas dataframe?
# it's an array of Numpy array
users.values[2]

array([3, 23, 'M', 'writer', '32067'], dtype=object)

In [22]:
users.index[2]

2

In [23]:
# concise summary of df
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
gender        943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB


In [24]:
# describe numeric cols
users.describe()

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [35]:
# you can also describe non-numeric columns:
users.describe(include=['object'])
users.describe(include='all')

Unnamed: 0,user_id,age,gender,occupation,zip_code
count,943.0,943.0,943,943,943.0
unique,,,2,21,795.0
top,,,M,student,55414.0
freq,,,670,196,9.0
mean,472.0,34.051962,,,
std,272.364951,12.19274,,,
min,1.0,7.0,,,
25%,236.5,25.0,,,
50%,472.0,31.0,,,
75%,707.5,43.0,,,


In [25]:
# missing data?
users.isnull().sum()

user_id       0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64

### Take a look at individual columns

In [30]:
# select 1 column
users['gender'].tail()

938    F
939    M
940    M
941    F
942    M
Name: gender, dtype: object

In [28]:
# select 2 cols
users[['gender', 'occupation']].sample(4)

Unnamed: 0,gender,occupation
507,M,marketing
731,F,other
329,F,educator
736,M,programmer


In [31]:
# what is the column type?
type(users['gender'])

pandas.core.series.Series

### a few col methods

In [41]:
# mean of numeric cols
users['age'].mean()
users['age'].median()
users['age'].std()
users['age'].min()
users['age'].max()
users['age'].count()

943

In [47]:
# methods for string (non-numeric) columns
users['gender'].value_counts()
users['gender'].value_counts().sort_values(ascending=False)
users['gender'].value_counts().sort_index()
users['gender'].value_counts().values
users['gender'].value_counts().index

Index(['M', 'F'], dtype='object')

### Filtering and Sorting Columns

###### heading

esc - m - shift return

In [51]:
# create a series of booleans
young_bool = users['age']<20
young_bool[:5]

0    False
1    False
2    False
3    False
4    False
Name: age, dtype: bool

In [52]:
# apply that filter to my df
users[young_bool].head(5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
35,36,19,F,student,93117
51,52,18,F,student,55105
56,57,16,M,none,84010
66,67,17,M,student,60402


In [53]:
# combine into a single step
users[     users['age']<20    ].tail()

Unnamed: 0,user_id,age,gender,occupation,zip_code
871,872,19,F,student,74078
879,880,13,M,student,83702
886,887,14,F,student,27249
903,904,17,F,student,61073
924,925,18,F,salesman,49036


In [59]:
# you can select 1 column from the filtered results
newdf =  users[users['age']<20]    
newdf['occupation'].head()

29    student
35    student
51    student
56       none
66    student
Name: occupation, dtype: object

In [62]:
# you can call any other method on this
users[users['age']<20]['occupation'].value_counts()    
users[users['age']<20]['occupation'].describe()

count          77
unique          7
top       student
freq           64
Name: occupation, dtype: object

## Multiple conditions

In [64]:
users[(users['age']<20) & (users['gender']=='M')].describe()

Unnamed: 0,user_id,age
count,45.0,45.0
mean,448.311111,16.844444
std,240.713317,2.671019
min,30.0,7.0
25%,289.0,16.0
50%,451.0,17.0
75%,624.0,19.0
max,880.0,19.0


In [66]:
# Pipe for 'or'
users[(users['age']<20) | (users['gender']=='M')].tail()

Unnamed: 0,user_id,age,gender,occupation,zip_code
935,936,24,M,other,32789
936,937,48,M,educator,98072
939,940,32,M,administrator,2215
940,941,20,M,student,97229
942,943,22,M,student,77841


In [68]:
# you can also filter on a predefined list
list_of_occupations = ['doctor', 'lawyer', 'student']
users[users['occupation'].isin(list_of_occupations)]['gender'].head()

8     M
9     M
29    M
31    F
32    M
Name: gender, dtype: object

### Sorting

In [70]:
# sort the entire df by one variable/column
users.sort_values('age').head(3)

Unnamed: 0,user_id,age,gender,occupation,zip_code
29,30,7,M,student,55436
470,471,10,M,student,77459
288,289,11,M,none,94619


In [72]:
# default is asc, you can also choose desc order
users.sort_values('age', ascending=True).tail(3)

Unnamed: 0,user_id,age,gender,occupation,zip_code
802,803,70,M,administrator,78212
859,860,70,F,retired,48322
480,481,73,M,retired,37771


In [75]:
# set a column as the new index
users.set_index('user_id')

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,05201
9,29,M,student,01002
10,53,M,lawyer,90703


## Using Groupby

`groupby` aggregates a numeric column(s) by a categorical column(s)

In [77]:
# for each occupation in the df, count the number of instances
users['occupation'].value_counts().head(3)

student     196
other       105
educator     95
Name: occupation, dtype: int64

In [95]:
# another way to get this info
users.groupby(['occupation'])['user_id'].count()
users.groupby(['occupation'])['user_id'].count().sort_index()
users.groupby(['occupation'])['user_id'].count().sort_values(ascending=False).head()

occupation
student          196
other            105
educator          95
administrator     79
engineer          67
Name: user_id, dtype: int64

In [88]:
users['age'].describe()

count    943.000000
mean      34.051962
std       12.192740
min        7.000000
25%       25.000000
50%       31.000000
75%       43.000000
max       73.000000
Name: age, dtype: float64

In [91]:
# six different aggregators you can use
users.groupby(['occupation'])['age'].max()
users.groupby(['occupation'])['age'].min()
users.groupby(['occupation'])['age'].mean()
users.groupby(['occupation'])['age'].median()
users.groupby(['occupation'])['age'].std()
users.groupby(['occupation'])['age'].count()

occupation
administrator     79
artist            28
doctor             7
educator          95
engineer          67
entertainment     18
executive         32
healthcare        16
homemaker          7
lawyer            12
librarian         51
marketing         26
none               9
other            105
programmer        66
retired           14
salesman          12
scientist         31
student          196
technician        27
writer            45
Name: age, dtype: int64

In [99]:
# two ways to sort a groupby
users.groupby(['occupation'])['age'].mean().sort_values().head()
users.groupby(['occupation'])['age'].mean().sort_index(ascending=False).head()

occupation
writer        36.311111
technician    33.148148
student       22.081633
scientist     35.548387
salesman      35.666667
Name: age, dtype: float64

In [100]:
# you can group by multiple [categorical] variables
print(users.columns)

Index(['user_id', 'age', 'gender', 'occupation', 'zip_code'], dtype='object')


In [102]:
users.groupby(['occupation', 'gender', 'zip_code'])['age'].mean()

occupation     gender  zip_code
administrator  F       03062       51.0
                       03755       39.0
                       04102       50.0
                       15237       59.0
                       16506       49.0
                       16803       43.0
                       17345       30.0
                       19711       27.0
                       19716       49.0
                       20817       23.0
                       20879       54.0
                       21114       29.0
                       33763       48.0
                       42141       38.0
                       43204       51.0
                       44124       42.0
                       44224       47.0
                       44265       36.0
                       48103       38.0
                       49428       38.0
                       52241       34.0
                       52302       30.0
                       55337       33.0
                       55406       35.0
        

In [104]:
# you can also aggregate multiple [numeric] variables
users.groupby(['occupation', 'gender'])[['age', 'user_id']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,user_id
occupation,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
administrator,F,40.638889,443.944444
administrator,M,37.162791,420.069767
artist,F,30.307692,432.153846
artist,M,32.333333,469.0
doctor,M,43.571429,533.714286
educator,F,39.115385,454.730769
educator,M,43.101449,471.492754
engineer,F,29.5,806.5
engineer,M,36.6,445.553846
entertainment,F,31.0,780.0


In [105]:
# you can choose multiple aggregation methods
users.groupby(['occupation'])['age'].agg(['mean', 'median', 'min', 'max', 'std', 'count'])

Unnamed: 0_level_0,mean,median,min,max,std,count
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
administrator,38.746835,37.0,21,70,11.123397,79
artist,31.392857,30.0,19,48,8.668116,28
doctor,43.571429,45.0,28,64,12.501428,7
educator,42.010526,42.0,23,63,10.413264,95
engineer,36.38806,36.0,22,70,11.199236,67
entertainment,29.222222,25.0,15,50,10.056052,18
executive,38.71875,38.5,22,69,10.608075,32
healthcare,41.5625,44.5,22,62,11.313524,16
homemaker,32.571429,32.0,20,50,10.737119,7
lawyer,36.75,34.0,21,53,10.830303,12


### A few other (lesser used) pandas methods

In [106]:
# recode values
users['is_male']=users['gender'].map({'F':0, 'M':1})
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,is_male
0,1,24,M,technician,85711,1
1,2,53,F,other,94043,0
2,3,23,M,writer,32067,1
3,4,24,M,technician,43537,1
4,5,33,F,other,15213,0


In [108]:
# duplicates
users.duplicated().head()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [110]:
# count the number of dupes in our df
users.isnull().sum()
users.duplicated().sum()

0