In [1]:
import pandas as pd

In [2]:
users = pd.read_table('user.tbl', sep='|')

In [3]:
users.describe()

Unnamed: 0,user_id,age
count,943.0,943.0
mean,472.0,34.051962
std,272.364951,12.19274
min,1.0,7.0
25%,236.5,25.0
50%,472.0,31.0
75%,707.5,43.0
max,943.0,73.0


In [4]:
users.describe(include=['object'])

Unnamed: 0,gender,occupation,zip_code
count,943,943,943
unique,2,21,795
top,M,student,55414
freq,670,196,9


In [5]:
## Frequency counts for categorical variables

In [6]:
users.gender.value_counts()

M    670
F    273
Name: gender, dtype: int64

In [7]:
users.occupation.value_counts(sort=True)

student          196
other            105
educator          95
administrator     79
engineer          67
programmer        66
librarian         51
writer            45
executive         32
scientist         31
artist            28
technician        27
marketing         26
entertainment     18
healthcare        16
retired           14
salesman          12
lawyer            12
none               9
homemaker          7
doctor             7
Name: occupation, dtype: int64

In [8]:
users.occupation.value_counts().sort_index()

administrator     79
artist            28
doctor             7
educator          95
engineer          67
entertainment     18
executive         32
healthcare        16
homemaker          7
lawyer            12
librarian         51
marketing         26
none               9
other            105
programmer        66
retired           14
salesman          12
scientist         31
student          196
technician        27
writer            45
Name: occupation, dtype: int64

In [9]:
users.occupation.value_counts(normalize=True)

student          0.207847
other            0.111347
educator         0.100742
administrator    0.083775
engineer         0.071050
programmer       0.069989
librarian        0.054083
writer           0.047720
executive        0.033934
scientist        0.032874
artist           0.029692
technician       0.028632
marketing        0.027572
entertainment    0.019088
healthcare       0.016967
retired          0.014846
salesman         0.012725
lawyer           0.012725
none             0.009544
homemaker        0.007423
doctor           0.007423
Name: occupation, dtype: float64

In [10]:
type(users.occupation.value_counts())

pandas.core.series.Series

In [11]:
users.occupation.value_counts(normalize=True).apply(lambda n: "{:.2%}".format(n))

student          20.78%
other            11.13%
educator         10.07%
administrator     8.38%
engineer          7.10%
programmer        7.00%
librarian         5.41%
writer            4.77%
executive         3.39%
scientist         3.29%
artist            2.97%
technician        2.86%
marketing         2.76%
entertainment     1.91%
healthcare        1.70%
retired           1.48%
salesman          1.27%
lawyer            1.27%
none              0.95%
homemaker         0.74%
doctor            0.74%
Name: occupation, dtype: object

In [12]:

users.occupation.value_counts(normalize=True).index

Index(['student', 'other', 'educator', 'administrator', 'engineer',
       'programmer', 'librarian', 'writer', 'executive', 'scientist', 'artist',
       'technician', 'marketing', 'entertainment', 'healthcare', 'retired',
       'salesman', 'lawyer', 'none', 'homemaker', 'doctor'],
      dtype='object')

In [13]:
 #  Finally, make it look tabular by creating a data frame with 
#  Index, then counts, then percentage
pd.DataFrame({'frequency': users.occupation.value_counts(), 
               'percentage': users.occupation.value_counts(normalize=True).apply(lambda n: "{:.2%}".format(n))})

Unnamed: 0,frequency,percentage
student,196,20.78%
other,105,11.13%
educator,95,10.07%
administrator,79,8.38%
engineer,67,7.10%
programmer,66,7.00%
librarian,51,5.41%
writer,45,4.77%
executive,32,3.39%
scientist,31,3.29%


Sample statistics for quantitative variables

```
Standard Error of Mean. A measure of how much the value of the mean may vary from sample to sample taken from the same distribution. It can be used to roughly compare the observed mean to a hypothesized value (that is, you can conclude the two values are different if the ratio of the difference to the standard error is less than -2 or greater than +2).
```
```
Skewness. A measure of the asymmetry of a distribution. The normal distribution is symmetric and has a skewness value of 0. A distribution with a significant positive skewness has a long right tail. A distribution with a significant negative skewness has a long left tail. As a guideline, a skewness value more than twice its standard error is taken to indicate a departure from symmetry.
```

In [14]:
(users.age.mean(), users.age.median(), users.age.std(), users.age.sem(), users.age.skew())

(34.05196182396607,
 31.0,
 12.192739733059032,
 0.3970502346848897,
 0.6053815216208658)

In [15]:
users.age.quantile([.25, .50, .75])

0.25    25.0
0.50    31.0
0.75    43.0
Name: age, dtype: float64

In [16]:
from scipy.stats import iqr
iqr(users.age)

18.0

#### Different data set:  beverage consumption

In [17]:
# Drinks data.
drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent']
types = {'country': str, 'beer': int, 'spirit': int, 'wine': int, 'liters': float, 'continent': str}
drinks = pd.read_csv('drinks.csv', header=0, names=drink_cols, dtype=types)

In [18]:
drinks.rename(columns={'liters': 'total_alcohol'}, inplace=True)

In [19]:
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country        193 non-null    object 
 1   beer           193 non-null    int64  
 2   spirit         193 non-null    int64  
 3   wine           193 non-null    int64  
 4   total_alcohol  193 non-null    float64
 5   continent      170 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB


In [20]:
drinks.head()

Unnamed: 0,country,beer,spirit,wine,total_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF


In [21]:
drinks.shape

(193, 6)

#### Multivariate Non-Graphical


In [22]:
ct = pd.crosstab(users.occupation, users.gender)

In [23]:
type(ct)

pandas.core.frame.DataFrame

In [24]:
pd.crosstab(users.occupation, users.gender, margins=True)

gender,F,M,All
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
administrator,36,43,79
artist,13,15,28
doctor,0,7,7
educator,26,69,95
engineer,2,65,67
entertainment,2,16,18
executive,3,29,32
healthcare,11,5,16
homemaker,6,1,7
lawyer,2,10,12


In [25]:
df = pd.crosstab(users.occupation, users.gender)

In [26]:
df.columns

Index(['F', 'M'], dtype='object', name='gender')

In [27]:
df['PctF'] = df.F / (df.F + df.M)

In [28]:
df

gender,F,M,PctF
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
administrator,36,43,0.455696
artist,13,15,0.464286
doctor,0,7,0.0
educator,26,69,0.273684
engineer,2,65,0.029851
entertainment,2,16,0.111111
executive,3,29,0.09375
healthcare,11,5,0.6875
homemaker,6,1,0.857143
lawyer,2,10,0.166667


In [29]:
# Calculate the mean alcohol amounts for each continent.
#   -- also sum, count, max, min
drinks.groupby('continent').mean()

Unnamed: 0_level_0,beer,spirit,wine,total_alcohol
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AF,61.471698,16.339623,16.264151,3.007547
AS,37.045455,60.840909,9.068182,2.170455
EU,193.777778,132.555556,142.222222,8.617778
OC,89.6875,58.4375,35.625,3.38125
SA,175.083333,114.75,62.416667,6.308333


In [30]:
type(drinks.groupby('continent').mean())

pandas.core.frame.DataFrame

In [31]:
drinks.groupby('continent').mean().reset_index()

Unnamed: 0,continent,beer,spirit,wine,total_alcohol
0,AF,61.471698,16.339623,16.264151,3.007547
1,AS,37.045455,60.840909,9.068182,2.170455
2,EU,193.777778,132.555556,142.222222,8.617778
3,OC,89.6875,58.4375,35.625,3.38125
4,SA,175.083333,114.75,62.416667,6.308333
