Tutorial for missing values and summary statistics:

In [1]:
import pandas as pd
import numpy as np

In [3]:
listings = pd.read_csv('listings.csv')

In [7]:
listings.dtypes

id                                    int64
listing_url                          object
scrape_id                             int64
last_scraped                         object
name                                 object
summary                              object
space                                object
description                          object
experiences_offered                  object
neighborhood_overview                object
notes                                object
transit                              object
access                               object
interaction                          object
house_rules                          object
thumbnail_url                        object
medium_url                           object
picture_url                          object
xl_picture_url                       object
host_id                               int64
host_url                             object
host_name                            object
host_since                      

In [8]:
df = listings[['id', 'license', 'scrape_id', 'review_scores_rating']]

In [9]:
df.isnull().sum()

id                         0
license                 6608
scrape_id                  0
review_scores_rating    2126
dtype: int64

In [10]:
#we can simply drop nulls with dropna:
#in our case, all 6608 license rows are nulls so dropna returns an empty df
df.dropna()

Unnamed: 0,id,license,scrape_id,review_scores_rating


In [12]:
#we can return a df of booleans based on null/notnull with isnull:
df.isnull().head()

Unnamed: 0,id,license,scrape_id,review_scores_rating
0,False,True,False,False
1,False,True,False,False
2,False,True,False,False
3,False,True,False,False
4,False,True,False,True


In [18]:
#dropna can take arguments to determine dropping criteria:
#with the all criterion, only examples with all nulls will be dropped
df.dropna(how='all').head()

Unnamed: 0,id,license,scrape_id,review_scores_rating
0,11204286,,20160706203047,100.0
1,7972006,,20160706203047,100.0
2,7727710,,20160706203047,80.0
3,13124681,,20160706203047,100.0
4,3469225,,20160706203047,


In [26]:
#we can further condition dropna based on the threshold param:
#here we require 0 non null values in the examples
df.dropna(thresh=0).head()

Unnamed: 0,id,license,scrape_id,review_scores_rating
0,11204286,,20160706203047,100.0
1,7972006,,20160706203047,100.0
2,7727710,,20160706203047,80.0
3,13124681,,20160706203047,100.0
4,3469225,,20160706203047,


In [28]:
#here we require 4 non null values in the examples
df.dropna(thresh=4).head()

Unnamed: 0,id,license,scrape_id,review_scores_rating


In [31]:
#drop columns with missing values:
#notice using the axis method will become deprecated in future pandas versions
df.dropna(axis=1).head()

Unnamed: 0,id,scrape_id
0,11204286,20160706203047
1,7972006,20160706203047
2,7727710,20160706203047
3,13124681,20160706203047
4,3469225,20160706203047


FILLNA

In [33]:
#we can use fillna to fill missing values:
df.fillna(0).head()

Unnamed: 0,id,license,scrape_id,review_scores_rating
0,11204286,1.0,20160706203047,100.0
1,7972006,0.0,20160706203047,100.0
2,7727710,0.0,20160706203047,80.0
3,13124681,0.0,20160706203047,100.0
4,3469225,0.0,20160706203047,0.0


In [39]:
#we can specify further conditions to be more accurate in the replacement:
#to better demonstrate functionality, we create a col of nulls:
df['nulls_col'] = np.nan
df.fillna({'license': 1, 'nulls_col': 2}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [41]:
#diffrent methods for filling the values can be set directly witht he method arg:
#for example, one can use the ffill method to fill the entire column with the last 
#not null value
df.fillna(method='ffill').head()

Unnamed: 0,id,license,scrape_id,review_scores_rating,nulls_col
0,11204286,1.0,20160706203047,100.0,2.0
1,7972006,1.0,20160706203047,100.0,2.0
2,7727710,1.0,20160706203047,80.0,2.0
3,13124681,1.0,20160706203047,100.0,2.0
4,3469225,1.0,20160706203047,100.0,2.0


Data Summarization:

The NumPy package contains several functions that are useful here, but several summarization or reduction methods are built into Pandas data structures.

In [42]:
df.sum()

id                      5.393800e+10
license                 6.608000e+03
scrape_id               1.332219e+17
review_scores_rating    4.230130e+05
nulls_col               1.321600e+04
dtype: float64

In [44]:
df.mean()

id                      8.162531e+06
license                 1.000000e+00
scrape_id               2.016071e+13
review_scores_rating    9.438041e+01
nulls_col               2.000000e+00
dtype: float64

In [45]:
df.median()

id                      8.248510e+06
license                 1.000000e+00
scrape_id               2.016071e+13
review_scores_rating    9.700000e+01
nulls_col               2.000000e+00
dtype: float64

In [46]:
#the pandas summarization methods allow for handling nulls:
#notice the values are slightly diffrent from above, 
#and very different fro the review_scores_rating col which had all nulls
df.mean(skipna=False)

id                      8.162531e+06
license                 1.000000e+00
scrape_id               2.016071e+13
review_scores_rating             NaN
nulls_col               2.000000e+00
dtype: float64

In [50]:
#we can use the axis method to summarize over rows rather than columns:
df.sum(axis=1).head()

0    2.016072e+13
1    2.016071e+13
2    2.016071e+13
3    2.016072e+13
4    2.016071e+13
dtype: float64

In [52]:
#this operation returns a series of length len(df) with index df.index
df.sum(axis=1).sort_values(ascending=False).head()

511     2.016072e+13
1217    2.016072e+13
4479    2.016072e+13
991     2.016072e+13
6301    2.016072e+13
dtype: float64

In [53]:
#the describe command will provide summary stats for the data passed to it
df.describe()

Unnamed: 0,id,license,scrape_id,review_scores_rating,nulls_col
count,6608.0,6608.0,6608.0,4482.0,6608.0
mean,8162531.0,1.0,20160710000000.0,94.380411,2.0
std,4214318.0,0.0,0.0,7.822489,0.0
min,6.0,1.0,20160710000000.0,20.0,2.0
25%,4697434.0,1.0,20160710000000.0,92.0,2.0
50%,8248510.0,1.0,20160710000000.0,97.0,2.0
75%,12252990.0,1.0,20160710000000.0,100.0,2.0
max,13882930.0,1.0,20160710000000.0,100.0,2.0


In [54]:
#also works with a series:
df.scrape_id.describe()

count    6.608000e+03
mean     2.016071e+13
std      0.000000e+00
min      2.016071e+13
25%      2.016071e+13
50%      2.016071e+13
75%      2.016071e+13
max      2.016071e+13
Name: scrape_id, dtype: float64

In [59]:
#covariance between two variables:
df['id'].cov(df['review_scores_rating'])

1245425.5272006723

In [58]:
#correlation between two variables:
df['id'].corr(df['review_scores_rating'])

0.039797889430816025

In [61]:
#the commands can also be run on the entire df:
print df.cov()
print df.corr()

                                id  license     scrape_id  \
id                    1.776048e+13      0.0  5.362118e-10   
license               0.000000e+00      0.0  0.000000e+00   
scrape_id             5.362118e-10      0.0  7.520730e+00   
review_scores_rating  1.245426e+06      0.0  6.330625e-15   
nulls_col             0.000000e+00      0.0  0.000000e+00   

                      review_scores_rating  nulls_col  
id                            1.245426e+06        0.0  
license                       0.000000e+00        0.0  
scrape_id                     6.330625e-15        0.0  
review_scores_rating          6.119134e+01        0.0  
nulls_col                     0.000000e+00        0.0  
                                id  license     scrape_id  \
id                    1.000000e+00      NaN  4.639584e-17   
license                        NaN      NaN           NaN   
scrape_id             4.639584e-17      NaN  1.000000e+00   
review_scores_rating  3.979789e-02      NaN  1.125836

In [64]:
#we can use hierarchical indexing to calculate summary stats for dataframes:
hier_df = pd.DataFrame(np.arange(1,16).reshape(5,3),
                      index=[['ind1', 'ind1', 'ind2', 'ind2', 'ind3'],
                             [1,2,1,2,1]],
                      columns = [['top','top','top'],['col1', 'col2', 'col3']])

In [81]:
hier_df

Unnamed: 0_level_0,Unnamed: 1_level_0,top,top,top
Unnamed: 0_level_1,Unnamed: 1_level_1,col1,col2,col3
ind1,1,1,2,3
ind1,2,4,5,6
ind2,1,7,8,9
ind2,2,10,11,12
ind3,1,13,14,15


In [67]:
#summing for subset of df
hier_df.loc['ind1'].sum()

top  col1    5
     col2    7
     col3    9
dtype: int64

In [68]:
#summing for subset of df
hier_df.loc['ind1'].sum(axis=1)

1     6
2    15
dtype: int64

In [69]:
#mean for subset of df
hier_df.loc['ind2'].mean()

top  col1     8.5
     col2     9.5
     col3    10.5
dtype: float64

In [71]:
#mean for subset of df with axis=1
hier_df.loc['ind2'].mean(axis=1)

1     8.0
2    11.0
dtype: float64

In [78]:
hier_df.loc['ind1'].corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,top,top,top
Unnamed: 0_level_1,Unnamed: 1_level_1,col1,col2,col3
top,col1,1.0,1.0,1.0
top,col2,1.0,1.0,1.0
top,col3,1.0,1.0,1.0


In [80]:
hier_df.loc['ind2'].cov()

Unnamed: 0_level_0,Unnamed: 1_level_0,top,top,top
Unnamed: 0_level_1,Unnamed: 1_level_1,col1,col2,col3
top,col1,4.5,4.5,4.5
top,col2,4.5,4.5,4.5
top,col3,4.5,4.5,4.5


In [85]:
#one can also use more complex filtering options:
hier_df.loc['ind1', ['top', 'col2']].sum(axis=1)

1     6
2    15
dtype: int64

In [92]:
#you can also aggregate by passing the level explicitly:
hier_df.sum(level=1)

Unnamed: 0_level_0,top,top,top
Unnamed: 0_level_1,col1,col2,col3
1,21,24,27
2,14,16,18
