# Pandas DataFrames

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
plt.rcParams['figure.figsize'] = (12, 8)
#plt.style.use('seaborn')



## Set path to data
path = os.getcwd() + '\\data\\'

## <a id="toc">Table of Contents</a>
> 1. [Creating Pandas DataFrame Objects](#1)
    1. Create a DataFrame
    2. Accessing Rows
    3. Creating a DataFrame from numpy
> 2. [DataFrame Axes (1 and 0)](#2)
    1. Index (rows, 0)
    2. Columns (1)
> 3. [Creating a Clean DataFrame Function](#3)
> 4. [Math Methods in DataFrames](#4)
    1. Index Alignment
> 5. [Looping and Aggregation](#5)
    1. Looping
    2. Aggregations
    3. Multiple Aggregations using Agg
    4. Describe
    5. The .apply method vs .pipe method
> 6. [Creating and Updating Columns](#6)

### 1. <a id=1> Creating Pandas DataFrame Objects </a>
[Back to contents](#toc)

#### a. Create a DataFrame

In [2]:
df = pd.DataFrame({'growth':[.5, .9, 1.2],
                   'Name' : ['Paul', 'George', 'Kristi']})

In [3]:
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.9,George
2,1.2,Kristi


#### b. Accessing Rows

Access by row using location indeexing .iloc

In [4]:
df.iloc[2]

growth       1.2
Name      Kristi
Name: 2, dtype: object

Indexing by column

In [5]:
df['Name']

0      Paul
1    George
2    Kristi
Name: Name, dtype: object

#### c. Creating a DataFrame from numpy

In [6]:
np.random.seed(24)

## 10 rows and 3 columns
values = np.random.randn(10,3)
values

array([[ 1.32921217, -0.77003345, -0.31628036],
       [-0.99081039, -1.07081626, -1.43871328],
       [ 0.56441685,  0.29572189, -1.62640423],
       [ 0.2195652 ,  0.6788048 ,  1.88927273],
       [ 0.9615384 ,  0.1040112 , -0.48116532],
       [ 0.85022853,  1.45342467,  1.05773744],
       [ 0.16556161,  0.51501838, -1.33693569],
       [ 0.56286114,  1.39285483, -0.06332798],
       [ 0.12166836,  1.20760254, -0.00204021],
       [ 1.62779574,  0.35449279,  1.03752763]])

In [7]:
df = pd.DataFrame(values, columns = ['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,1.329212,-0.770033,-0.31628
1,-0.99081,-1.070816,-1.438713
2,0.564417,0.295722,-1.626404
3,0.219565,0.678805,1.889273
4,0.961538,0.104011,-0.481165
5,0.850229,1.453425,1.057737
6,0.165562,0.515018,-1.336936
7,0.562861,1.392855,-0.063328
8,0.121668,1.207603,-0.00204
9,1.627796,0.354493,1.037528


### 2. <a id=2> DataFrame Axes (1 and 0) </a>
[Back to contents](#toc)

In [8]:
df

Unnamed: 0,a,b,c
0,1.329212,-0.770033,-0.31628
1,-0.99081,-1.070816,-1.438713
2,0.564417,0.295722,-1.626404
3,0.219565,0.678805,1.889273
4,0.961538,0.104011,-0.481165
5,0.850229,1.453425,1.057737
6,0.165562,0.515018,-1.336936
7,0.562861,1.392855,-0.063328
8,0.121668,1.207603,-0.00204
9,1.627796,0.354493,1.037528


Unlike a series, which has one axis, there are two axes for a dataframe. They are commonly referred
to as axis 0 and 1, or the "index" (or 'rows') axis and the "columns" axis respectively:

In [9]:
df.axes

[RangeIndex(start=0, stop=10, step=1), Index(['a', 'b', 'c'], dtype='object')]

#### a. Index (rows, 0)
As many operations take an axis parameter, it is important to remember that 0 is the index and
1 is the columns:

In [10]:
df.axes[0]

RangeIndex(start=0, stop=10, step=1)

#### b. Columns (1)

In [11]:
df.axes[1]

Index(['a', 'b', 'c'], dtype='object')

For example, we can sum a dataframe along the index or along the columns using the labels 0
and 1:

##### Sum each column

In [12]:
df.sum(axis=0)

a    5.412038
b    4.161081
c   -1.280329
dtype: float64

In [13]:
df.sum(axis = 'index')

a    5.412038
b    4.161081
c   -1.280329
dtype: float64

##### Sum each row

In [14]:
df.sum(axis=1)

0    0.242898
1   -3.500340
2   -0.766265
3    2.787643
4    0.584384
5    3.361391
6   -0.656356
7    1.892388
8    1.327231
9    3.019816
dtype: float64

In [15]:
df.sum(axis = 'columns')

0    0.242898
1   -3.500340
2   -0.766265
3    2.787643
4    0.584384
5    3.361391
6   -0.656356
7    1.892388
8    1.327231
9    3.019816
dtype: float64

### 3. <a id=3> Creating a Clean DataFrame Function </a>
[Back to contents](#toc)

In [16]:
file = 'siena2018-pres.csv'
df = pd.read_csv(path + file, index_col = 0)
df.head()

Unnamed: 0,Seq.,President,Party,Bg,Im,Int,IQ,L,WR,AC,EAb,LA,CAb,OA,PL,RC,CAp,HE,EAp,DA,FPA,AM,EV,O
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8


Fix the data

In [17]:
def tweak_siena_pres(df_):
    
## Function to convert all int64s to uint8
    def int64_to_uint8(df_):
        cols = df_.select_dtypes('int64')
        return df_.astype({col:'uint8' for col in cols})

## Rename columns
    col_rename = {'Seq.':'Seq', 
                  'Bg':'Background',
                  'PL':'Party leadership',
                  'CAb':'Communication ability',
                  'RC': 'Relations with Congress', 
                  'CAp': 'Court appointments',
                  'HE': 'Handling of economy', 
                  'L': 'Luck',
                  'AC': 'Ability to compromise', 
                  'WR': 'Willing to take risks',
                  'EAp': 'Executive appointments', 
                  'OA': 'Overall ability',
                  'Im': 'Imagination', 
                  'DA': 'Domestic accomplishments',
                  'Int': 'Integrity', 
                  'EAb': 'Executive ability',
                  'FPA': 'Foreign policy accomplishments',
                  'LA': 'Leadership ability',
                  'IQ': 'Intelligence', 'AM ': 'Avoid crucial mistakes',
                  'EV': "Experts' view", 'O': 'Overall'}

    colNames = {}
    for key, value in col_rename.items():
        colClean = value.replace(' ','_')
        colNames[key] = colClean

## Create the dataframe
    return(df_
           .rename(columns = colNames)     
           .astype({'Party':'category'})
           .pipe(int64_to_uint8)
           .assign(Average_rank = lambda df_:(df_.select_dtypes('uint8')
                                                 .sum(axis=1)
                                                 .rank(method='dense')
                                                 .astype('uint8')),
                   Quartile = lambda df_:pd.qcut(df_.Average_rank, 4, labels='1st 2nd 3rd 4th'.split())
                  )
          )

In [18]:
df = tweak_siena_pres(df)
df.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


### 4. <a id=4> Math Methods in DataFrames </a>
[Back to contents](#toc)

In [19]:
file = 'siena2018-pres.csv'
df = pd.read_csv(path + file, index_col = 0)
pres = tweak_siena_pres(df)
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


#### a. Index Alignment

We can perform math operations of the dataframe. There are the math methods like .add and .div
and we also have dunder methods that allow us to use the operators like +, -, /, and *.
Note that the index will align when we perform math. To demonstrate alignment, I will add the
values from index values at rows 0-2 and column positions at index 0-3 and add then to the index
values from rows 1-5 and 0-4:

In [20]:
scores = pres.loc[:, 'Background':'Average_rank']
scores.head()

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank
1,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1
2,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13
3,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5
4,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7
5,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8


In [21]:
s1 = scores.iloc[:3, :4]
s1

Unnamed: 0,Background,Imagination,Integrity,Intelligence
1,7,7,1,10
2,3,13,4,4
3,2,2,14,1


In [22]:
s2 = scores.iloc[1:6, :5]
s2

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck
2,3,13,4,4,24
3,2,2,14,1,8
4,4,6,7,3,16
5,9,14,11,18,6
6,1,9,6,5,29


In [23]:
s1 + s2

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck
1,,,,,
2,6.0,26.0,8.0,8.0,
3,4.0,4.0,28.0,2.0,
4,,,,,
5,,,,,
6,,,,,


Only the overlapping rows (rows 2 and 3) and columns (Background through Intelligence) get
added together. The other values are missing!

### 5. <a id=5> Looping and Aggregation </a>
[Back to contents](#toc)

#### a. Looping

You can use a for loop with a dataframe, though you generally want to avoid for loops when doing
numerical manipulation. When I see a for loop with pandas code, it means this is a slow operation,
and you are not able to take advantage of the vectorization that speeds up many operations.
However, sometimes a for loop is appropriate (I use them when labeling plots).

If you need to loop over a dataframe, here are three methods for doing it. 
- The .iteritems method gives you a tuple with the column name and the column (a series). 
- The .iterrows method gives you a tuple with the index value and the row (converted into a series). 
- Finally, the .itertuples method gives you a row represented as a named tuple (with the index in position 0):

In [24]:
file = 'siena2018-pres.csv'
df = pd.read_csv(path + file, index_col = 0)
pres = tweak_siena_pres(df)
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


**iteritems()** - iteration over columns (col_name, series) tuple

In [25]:
for col_name, col in pres.iteritems():
    print(col_name,',',type(col))

Seq , <class 'pandas.core.series.Series'>
President , <class 'pandas.core.series.Series'>
Party , <class 'pandas.core.series.Series'>
Background , <class 'pandas.core.series.Series'>
Imagination , <class 'pandas.core.series.Series'>
Integrity , <class 'pandas.core.series.Series'>
Intelligence , <class 'pandas.core.series.Series'>
Luck , <class 'pandas.core.series.Series'>
Willing_to_take_risks , <class 'pandas.core.series.Series'>
Ability_to_compromise , <class 'pandas.core.series.Series'>
Executive_ability , <class 'pandas.core.series.Series'>
Leadership_ability , <class 'pandas.core.series.Series'>
Communication_ability , <class 'pandas.core.series.Series'>
Overall_ability , <class 'pandas.core.series.Series'>
Party_leadership , <class 'pandas.core.series.Series'>
Relations_with_Congress , <class 'pandas.core.series.Series'>
Court_appointments , <class 'pandas.core.series.Series'>
Handling_of_economy , <class 'pandas.core.series.Series'>
Executive_appointments , <class 'pandas.core.s

**iterrows()** - iteration over rows (index, row(as a series)) tuple

In [26]:
for idx, row in pres.iterrows():
    print(idx, type(row))

1 <class 'pandas.core.series.Series'>
2 <class 'pandas.core.series.Series'>
3 <class 'pandas.core.series.Series'>
4 <class 'pandas.core.series.Series'>
5 <class 'pandas.core.series.Series'>
6 <class 'pandas.core.series.Series'>
7 <class 'pandas.core.series.Series'>
8 <class 'pandas.core.series.Series'>
9 <class 'pandas.core.series.Series'>
10 <class 'pandas.core.series.Series'>
11 <class 'pandas.core.series.Series'>
12 <class 'pandas.core.series.Series'>
13 <class 'pandas.core.series.Series'>
14 <class 'pandas.core.series.Series'>
15 <class 'pandas.core.series.Series'>
16 <class 'pandas.core.series.Series'>
17 <class 'pandas.core.series.Series'>
18 <class 'pandas.core.series.Series'>
19 <class 'pandas.core.series.Series'>
20 <class 'pandas.core.series.Series'>
21 <class 'pandas.core.series.Series'>
22 <class 'pandas.core.series.Series'>
23 <class 'pandas.core.series.Series'>
24 <class 'pandas.core.series.Series'>
25 <class 'pandas.core.series.Series'>
26 <class 'pandas.core.series.Seri

**itertuples()** - iteration over rows as namedtuple (index as first item)

In [27]:
for tup in pres.itertuples():
    print(tup.Index, tup.Party)

1 Independent
2 Federalist
3 Democratic-Republican
4 Democratic-Republican
5 Democratic-Republican
6 Democratic-Republican
7 Democratic
8 Democratic
9 Whig
10 Independent
11 Democratic
12 Whig
13 Whig
14 Democratic
15 Democratic
16 Republican
17 Democratic
18 Republican
19 Republican
20 Republican
21 Republican
22 Democratic
23 Republican
24 Republican
25 Republican
26 Republican
27 Democratic
28 Republican
29 Republican
30 Republican
31 Democratic
32 Democratic
33 Republican
34 Democratic
35 Democratic
36 Republican
37 Republican
38 Democratic
39 Republican
40 Republican
41 Democratic
42 Republican
43 Democratic
44 Republican


#### b. Aggregations

In [28]:
scores = pres.loc[:, 'Background':'Average_rank']
scores.head()

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank
1,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1
2,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13
3,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5
4,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7
5,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8


Calculate the average of each row

In [29]:
scores.mean(axis = 'columns')

1      3.681818
2     14.454545
3      6.545455
4      9.636364
5     10.454545
6     17.181818
7     19.590909
8     25.681818
9     36.909091
10    34.409091
11    13.318182
12    29.500000
13    37.454545
14    39.409091
15    42.000000
16     4.045455
17    42.272727
18    24.227273
19    30.136364
20    27.272727
21    31.454545
22    22.181818
23    32.818182
24    19.727273
25     5.227273
26    21.318182
27    13.590909
28    38.772727
29    29.909091
30    31.954545
31     3.909091
32    11.818182
33     9.227273
34    12.727273
35    15.272727
36    26.909091
37    26.000000
38    26.818182
39    14.545455
40    20.818182
41    14.636364
42    30.363636
43    15.818182
44    39.772727
dtype: float64

In [30]:
scores.mean(axis = 1)

1      3.681818
2     14.454545
3      6.545455
4      9.636364
5     10.454545
6     17.181818
7     19.590909
8     25.681818
9     36.909091
10    34.409091
11    13.318182
12    29.500000
13    37.454545
14    39.409091
15    42.000000
16     4.045455
17    42.272727
18    24.227273
19    30.136364
20    27.272727
21    31.454545
22    22.181818
23    32.818182
24    19.727273
25     5.227273
26    21.318182
27    13.590909
28    38.772727
29    29.909091
30    31.954545
31     3.909091
32    11.818182
33     9.227273
34    12.727273
35    15.272727
36    26.909091
37    26.000000
38    26.818182
39    14.545455
40    20.818182
41    14.636364
42    30.363636
43    15.818182
44    39.772727
dtype: float64

In [31]:
scores.sum(axis = 'columns')

1      81
2     318
3     144
4     212
5     230
6     378
7     431
8     565
9     812
10    757
11    293
12    649
13    824
14    867
15    924
16     89
17    930
18    533
19    663
20    600
21    692
22    488
23    722
24    434
25    115
26    469
27    299
28    853
29    658
30    703
31     86
32    260
33    203
34    280
35    336
36    592
37    572
38    590
39    320
40    458
41    322
42    668
43    348
44    875
dtype: int64

In [32]:
scores.sum(axis = 0)

Background                        968
Imagination                       957
Integrity                         990
Intelligence                      990
Luck                              990
Willing_to_take_risks             953
Ability_to_compromise             968
Executive_ability                 978
Leadership_ability                990
Communication_ability             990
Overall_ability                   990
Party_leadership                  990
Relations_with_Congress           979
Court_appointments                990
Handling_of_economy               990
Executive_appointments            990
Domestic_accomplishments          990
Foreign_policy_accomplishments    990
AM                                990
Experts'_view                     990
Overall                           990
Average_rank                      990
dtype: int64

#### c. Multiple Aggregations using Agg

In [33]:
pres.head()

Unnamed: 0,Seq,President,Party,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank,Quartile
1,1,George Washington,Independent,7,7,1,10,1,6,2,2,1,11,2,18,1,1,1,1,2,2,1,2,1,1,1st
2,2,John Adams,Federalist,3,13,4,4,24,14,31,21,21,13,8,28,17,4,13,15,19,13,16,10,14,13,2nd
3,3,Thomas Jefferson,Democratic-Republican,2,2,14,1,8,5,14,6,6,4,4,5,5,7,20,4,6,9,7,5,5,5,1st
4,4,James Madison,Democratic-Republican,4,6,7,3,16,15,6,13,17,10,6,9,10,6,14,7,11,19,11,8,7,7,1st
5,5,James Monroe,Democratic-Republican,9,14,11,18,6,16,7,10,12,15,17,12,8,11,9,9,10,5,6,9,8,8,1st


We can pass in a dictionary to perform multiple aggregations on a column.

In [34]:
pres.agg({'Luck':['count', 'size'],
         'Overall':['count', 'max']})

Unnamed: 0,Luck,Overall
count,44.0,44.0
size,44.0,
max,,44.0


You can use a keyword argument with a tuple to specify the index value of the resultant
aggregation:

In [35]:
pres.agg(Intelligence_count = ('Intelligence', 'count'),
        Intelligence_size = ('Intelligence', 'size'))

Unnamed: 0,Intelligence
Intelligence_count,44
Intelligence_size,44


#### d. Describe
The .describe method is a meta-aggregation that returns a dataframe with summary statistics
for each numeric columns:

In [36]:
pres.describe()

Unnamed: 0,Background,Imagination,Integrity,Intelligence,Luck,Willing_to_take_risks,Ability_to_compromise,Executive_ability,Leadership_ability,Communication_ability,Overall_ability,Party_leadership,Relations_with_Congress,Court_appointments,Handling_of_economy,Executive_appointments,Domestic_accomplishments,Foreign_policy_accomplishments,AM,Experts'_view,Overall,Average_rank
count,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0
mean,22.0,21.75,22.5,22.5,22.5,21.659091,22.0,22.227273,22.5,22.5,22.5,22.5,22.25,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5
std,12.409674,12.519984,12.845233,12.845233,12.845233,11.892822,12.409674,12.500909,12.845233,12.845233,12.845233,12.845233,12.519984,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233,12.845233
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,11.75,11.0,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75,11.75
50%,22.0,21.5,22.5,22.5,22.5,22.5,22.0,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5,22.5
75%,32.25,32.25,33.25,33.25,33.25,31.25,32.25,32.25,33.25,33.25,33.25,33.25,33.0,33.25,33.25,33.25,33.25,33.25,33.25,33.25,33.25,33.25
max,43.0,43.0,44.0,44.0,44.0,41.0,43.0,43.0,44.0,44.0,44.0,44.0,43.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0


#### e. The .apply method vs .pipe method
Like the series, the dataframe has an .apply method. Like the series method, you should be wary of
using the dataframe method. More specifically, if you are dealing with numbers, you might want
to see if you can operate in a vectorized way.

Also, keep in mind that a dataframe is two-dimensional. So rather than applying a function to
a single value, when you call .apply on a dataframe, you work on a whole row or a whole column.
Because of that, I find that I rarely use this method.
Most of the .apply examples you find in the wild are silly examples that show how .apply works,
but also give a false impression that you should be everywhere, including using it for these silly
examples.

For example, if you wanted to calculate the spread of the presidential rankings for each row, I
would do this:

In [37]:
(pres
 .select_dtypes('number')
 .pipe(lambda df_:df_.max(axis = 'columns') - df_.min(axis='columns')))

1     17
2     28
3     19
4     16
5     13
6     28
7     34
8     18
9     22
10    19
11    16
12    15
13     8
14     3
15     8
16    27
17    10
18    21
19    13
20    21
21    24
22    12
23     8
24    21
25    13
26    19
27    28
28    10
29    26
30    31
31    15
32    27
33    18
34    28
35    38
36    31
37    23
38    35
39    28
40    19
41    36
42    24
43    22
44    34
dtype: uint8

In [38]:
def test(df_):
    return df_.max(axis = 'columns') -df_.min(axis='columns')

(pres
 .select_dtypes('number')
 .pipe(test))

1     17
2     28
3     19
4     16
5     13
6     28
7     34
8     18
9     22
10    19
11    16
12    15
13     8
14     3
15     8
16    27
17    10
18    21
19    13
20    21
21    24
22    12
23     8
24    21
25    13
26    19
27    28
28    10
29    26
30    31
31    15
32    27
33    18
34    28
35    38
36    31
37    23
38    35
39    28
40    19
41    36
42    24
43    22
44    34
dtype: uint8

The .apply version looks like this. They look pretty similar but the former does an optimized max and min calculation, while the
latter does a separate calculation for each row.

In [39]:
(pres
 .select_dtypes('number')
 .apply(lambda _row: _row.max() - _row.min(), axis = 'columns')
)

1     17
2     28
3     19
4     16
5     13
6     28
7     34
8     18
9     22
10    19
11    16
12    15
13     8
14     3
15     8
16    27
17    10
18    21
19    13
20    21
21    24
22    12
23     8
24    21
25    13
26    19
27    28
28    10
29    26
30    31
31    15
32    27
33    18
34    28
35    38
36    31
37    23
38    35
39    28
40    19
41    36
42    24
43    22
44    34
dtype: int8

### 6. <a id=6> Creating and Updating Columns</a>
[Back to contents](#toc)
    

In [40]:
file = '2020-jetbrains-python-survey.csv'
jb = pd.read_csv(path + file)
jb.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,is.python.main,other.lang.None,other.lang.Java,other.lang.JavaScript,other.lang.C/C++,other.lang.PHP,other.lang.C#,other.lang.Ruby,other.lang.Bash / Shell,other.lang.Objective-C,other.lang.Go,other.lang.Visual Basic,other.lang.Scala,other.lang.SQL,other.lang.Kotlin,other.lang.R,other.lang.Swift,other.lang.Clojure,other.lang.Perl,other.lang.Rust,other.lang.Groovy,other.lang.TypeScript,other.lang.CoffeeScript,other.lang.HTML/CSS,other.lang.Other,python.years,years.of.coding,main.purposes,other.purposes.Educational purposes,other.purposes.Data analysis,other.purposes.DevOps / System administration / Writing automation scripts,other.purposes.Software testing / Writing automated tests,other.purposes.Software prototyping,other.purposes.Web development,other.purposes.Machine learning,other.purposes.Mobile development,other.purposes.Desktop development,other.purposes.Computer graphics,other.purposes.Network programming,other.purposes.Game development,other.purposes.Multimedia applications development,other.purposes.Embedded development,other.purposes.Programming of web parsers / scrapers / crawlers,other.purposes.Other,how.involved.Computer graphics,how.involved.Data analysis,how.involved.DevOps / System administration / Writing automation scripts,how.involved.Educational purposes,how.involved.Software testing / Writing automated tests,how.involved.Web development,how.involved.Network programming,how.involved.Desktop development,how.involved.Machine learning,how.involved.Software prototyping,how.involved.Programming of web parsers / scrapers / crawlers,how.involved.Embedded development,how.involved.Mobile development,how.involved.Game development,how.involved.Multimedia applications development,use.python.most,are.you.datascientist,python.version.most,python2.version.most,python3.version.most,python.version.upgrade.I don’t update,python.version.upgrade.Somebody else manages Python updates for me,python.version.upgrade.Python_org,python.version.upgrade.Build from source,python.version.upgrade.Automatic upgrade via cloud provider,python.version.upgrade.Anaconda,python.version.upgrade.ActivePython,python.version.upgrade.Intel Distribution for Python,"python.version.upgrade.OS-provided Python (via apt-get, yum, homebrew, etc_)",python.version.upgrade.pyenv,python.version.upgrade.pythonz,python.version.upgrade.I use Docker containers,python.version.upgrade.Other,isolate.environments.None,isolate.environments.Virtualenv,isolate.environments.Pipenv,isolate.environments.Poetry,isolate.environments.Conda,isolate.environments.Docker,isolate.environments.Vagrant / virtual machines,isolate.environments.Other,web.frameworks.None,web.frameworks.Django,web.frameworks.web2py,web.frameworks.Bottle,web.frameworks.CherryPy,web.frameworks.Flask,web.frameworks.Hug,web.frameworks.Pyramid,web.frameworks.Tornado,web.frameworks.Falcon,web.frameworks.FastAPI,web.frameworks.Other,data.frameworks.None,data.frameworks.PyTorch,data.frameworks.NumPy,data.frameworks.SciPy,data.frameworks.Pandas,data.frameworks.Matplotlib,data.frameworks.Seaborn,data.frameworks.SciKit-Learn,data.frameworks.Keras,data.frameworks.TensorFlow,data.frameworks.Theano,data.frameworks.NLTK,data.frameworks.MXNet,data.frameworks.Gensim,data.frameworks.Other,other.frameworks.None,other.frameworks.Requests,other.frameworks.aiohttp,other.frameworks.PyQT,other.frameworks.PyGTK,other.frameworks.wxPython,other.frameworks.Pillow,other.frameworks.Tkinter,other.frameworks.Pygame,other.frameworks.Twisted,other.frameworks.Asyncio,other.frameworks.Kivy,other.frameworks.Six,other.frameworks.Scrapy,other.frameworks.Other,cloud.platform.None,cloud.platform.Google Cloud Platform,cloud.platform.AWS,cloud.platform.Microsoft Azure,cloud.platform.Rackspace,cloud.platform.Heroku,cloud.platform.DigitalOcean,cloud.platform.Linode,cloud.platform.PythonAnywhere,cloud.platform.OpenShift,cloud.platform.OpenStack,cloud.platform.Other,run.in.cloud.None of the following,run.in.cloud.In virtual machines,run.in.cloud.Within containers,run.in.cloud.On a Platform-as-a-Service (such as Heroku or Google App Engine),run.in.cloud.Serverless (such as AWS Lambda or Cloud Functions),run.in.cloud.Other,develop.for.cloud.None of the following,develop.for.cloud.With local system interpreter,develop.for.cloud.Locally with virtualenv (or similar),develop.for.cloud.In Docker containers,develop.for.cloud.In virtual machines,develop.for.cloud.In remote development environments,develop.for.cloud.Directly in the production environment,develop.for.cloud.Other,devenv.os.Windows,devenv.os.Linux,devenv.os.macOS,devenv.os.BSD,devenv.os.Other,unittesting.None,unittesting.pytest,unittesting.nose,unittesting.unittest,unittesting.doctest,unittesting.Tox,unittesting.Hypothesis,unittesting.mock,unittesting.Other,orm.No database development,orm.Raw SQL,orm.Django ORM,orm.SQLAlchemy,orm.Peewee,orm.Dejavu,orm.PonyORM,orm.SQLObject,orm.Tortoise ORM,orm.Other,database.None,database.DB2,database.MS SQL Server,database.MySQL,database.Oracle Database,database.PostgreSQL,database.SQLite,database.Cassandra,database.Couchbase,database.HBase,database.MongoDB,database.Neo4j,database.Redis,database.Amazon Redshift,database.h2,database.Other,bigdata.None,bigdata.Apache Beam,bigdata.Apache Flink,bigdata.Apache Hadoop/MapReduce,bigdata.Apache Hive,bigdata.Apache Samza,bigdata.Apache Kafka,bigdata.Apache Spark,bigdata.Apache Tez,bigdata.Dask,bigdata.ClickHouse,bigdata.Other,ci.None,ci.Jenkins / Hudson,ci.TeamCity,ci.Bamboo,ci.Travis CI,ci.CircleCI,ci.CruiseControl,ci.Gitlab CI,ci.AppVeyor,ci.Other,configuration.management.None,configuration.management.Salt,configuration.management.Puppet,configuration.management.Chef,configuration.management.Ansible,configuration.management.Custom solution,configuration.management.Other,ide.main,ide.editor.None,ide.editor.Other,ide.editor.PyCharm Professional Edition,ide.editor.PyCharm Community Edition,ide.editor.Sublime Text,ide.editor.Vim,ide.editor.Atom,ide.editor.VS Code,ide.editor.Eclipse + Pydev,ide.editor.JupyterLab,ide.editor.Jupyter Notebook,ide.editor.IntelliJ IDEA,ide.editor.NotePad++,ide.editor.IDLE,ide.editor.Emacs,ide.editor.Python Tools for Visual Studio (PTVS),ide.editor.Spyder,several.projects,first.learn.about.main.ide,how.often.use.main.ide,missing.features.main.ide,nps.main.ide,job.team,team.size,employment.status,company.size,job.role.DBA,job.role.Architect,job.role.QA engineer,job.role.Developer / Programmer,job.role.Technical writer,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other,age,country.live
0,Yes,,,,,,,,Bash / Shell,,,,,,,,,,,,,,,,,3–5 years,1–2 years,For work,,,,,,,,,,Computer graphics,,,,,,,Computer graphics,,,,,,,,,,,,,,,,,Python 3,,Python 3_7,,,Python_org,,,,,,,,,,,,,,Poetry,,,,,,,,,CherryPy,,,,,,,,,,,,,Matplotlib,,,,,,,,,,,,aiohttp,,,,,,,,,,,,,,,AWS,,,,,,,,,,,,Within containers,,,,,With local system interpreter,,,,,,,,Linux,,,,,pytest,,,,,,,,,,Django ORM,,,,,,,,,,,,,,,,,,MongoDB,,,,,,,Apache Beam,,,,,,,,,,,,,,,,CircleCI,,,,,,,,,Ansible,,,PyCharm Community Edition,,,,,,,,,,,,,,,Emacs,,,"Yes, I work on many different projects",Conference / User Group,Weekly,"No, it has all the features I need",3.0,Work as an external consultant or trainer,,Partially employed by a company / organization,Just me,,,,,,,,Business analyst,,,,,,30–39,
1,Yes,,Java,JavaScript,,,C#,,,,,,,SQL,,R,,,,,,TypeScript,,,,3–5 years,3–5 years,Both for work and personal,Educational purposes,,,Software testing / Writing automated tests,Software prototyping,,Machine learning,,,,,,,,Programming of web parsers / scrapers / crawlers,,,,,Educational purposes,Software testing / Writing automated tests,,,,Machine learning,Software prototyping,Programming of web parsers / scrapers / crawlers,,,,,Software prototyping,Yes,Python 3,,Python 3_6,,,,,Automatic upgrade via cloud provider,,,,,,,,,,,,,Conda,Docker,,,,,,,,,,,Tornado,,,,,PyTorch,NumPy,SciPy,Pandas,,,SciKit-Learn,,TensorFlow,,NLTK,,,,,Requests,,,,,,,,,,,Six,Scrapy,,,Google Cloud Platform,AWS,Microsoft Azure,,,,,,,,,,In virtual machines,Within containers,,Serverless (such as AWS Lambda or Cloud Functi...,,,,,,,,Directly in the production environment,,Windows,,,,,,,,unittest,,,,mock,,No database development,,,,,,,,,,,,,MySQL,,PostgreSQL,SQLite,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,VS Code,,,,,Sublime Text,,,,,,Jupyter Notebook,,NotePad++,,,,,"Yes, I work on one main and several side projects",School / University,Daily,"No, it has all the features I need",8.0,Work in a team,2-7 people,Fully employed by a company / organization,"More than 5,000",,,,Developer / Programmer,,,,,,,,,,21–29,India
2,Yes,,,,C/C++,,,,Bash / Shell,,,,,,,,,,,,,,,,,3–5 years,3–5 years,Both for work and personal,,Data analysis,DevOps / System administration / Writing autom...,Software testing / Writing automated tests,Software prototyping,,,,Desktop development,,,,,,,,,Data analysis,DevOps / System administration / Writing autom...,,Software testing / Writing automated tests,,,Desktop development,,Software prototyping,,,,,,DevOps / System administration / Writing autom...,No,Python 3,,Python 3_6,,Somebody else manages Python updates for me,,,,Anaconda,,,"OS-provided Python (via apt-get, yum, homebrew...",,,,,,Virtualenv,Pipenv,,Conda,,,,,,,,,Flask,,,,,,,,,NumPy,SciPy,Pandas,Matplotlib,,,,,,NLTK,,,,,Requests,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows,Linux,,,,,pytest,,,,,,,,No database development,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Travis CI,,,,,Other,,,,,,,,Vim,,,,,,,,VS Code,,,,,,,,,,"Yes, I work on one main and several side projects",Friend / Colleague,Daily,"No, it has all the features I need",10.0,Work on your own project(s) independently,,Fully employed by a company / organization,"More than 5,000",,,,Developer / Programmer,,Technical support,Data analyst,,Team lead,,,,,30–39,United States
3,Yes,,,JavaScript,,,,,Bash / Shell,,,,,SQL,,,,,,,,,,HTML/CSS,,11+ years,11+ years,Both for work and personal,,,DevOps / System administration / Writing autom...,Software testing / Writing automated tests,,Web development,,,,,,,,,,,,,DevOps / System administration / Writing autom...,,Software testing / Writing automated tests,Web development,,,,,,,,,,Web development,,Python 3,,Python 3_8,,,,,,,,,,,,I use Docker containers,,,Virtualenv,,,,Docker,,,,Django,,,,,,,,,,,,,,,,,,,,,,,,,,,Requests,aiohttp,,,,Pillow,,,,Asyncio,,,,,,,AWS,,,,DigitalOcean,Linode,,,,,,,Within containers,,Serverless (such as AWS Lambda or Cloud Functi...,,,,Locally with virtualenv (or similar),In Docker containers,,,,,,Linux,,,,,pytest,,,,,,,,,,Django ORM,,,,,,,,,,,,,PostgreSQL,,,,,,,Redis,,,,,,,,,,,,,,,,,,,,,,,,,Other,,,,,,,Other,PyCharm Professional Edition,,,,,Sublime Text,Vim,,,,,,,,,,,,"Yes, I work on many different projects",Friend / Colleague,Daily,Yes – Please list:,10.0,,,,,,,,,,,,,,,,,,,
4,Yes,,Java,JavaScript,C/C++,,,,Bash / Shell,,,,,SQL,,,,,,,,,,HTML/CSS,,1–2 years,Less than 1 year,"For personal, educational or side projects",,,DevOps / System administration / Writing autom...,,,Web development,,,,,,,,,,,,,DevOps / System administration / Writing autom...,,,Web development,,,,,,,,,,Web development,,Python 3,,Python 3_8,,,,,,,,,"OS-provided Python (via apt-get, yum, homebrew...",,,,,,,,Poetry,,,,,,,,,,Flask,,,,,,,,,,,,,,,,,,,,,,,Requests,,,,,Pillow,,,,,,,Scrapy,,,,,,,,,,,,,,,,,,,,,,,,,,,,Windows,Linux,,,,,pytest,,,,,,,,,,,SQLAlchemy,,,,,,,,,,MySQL,,PostgreSQL,SQLite,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,VS Code,,,,,,Vim,,,,,,,,,,,,"Yes, I work on one main and several side projects",Search engines,Daily,"No, it has all the features I need",10.0,Work on your own project(s) independently,,Student,,,,,,,,,,,,,,,21–29,Italy


In [41]:
jb.shape

(54462, 264)

#### a. Modifying the Columns Dynamically

In [42]:
for i in jb.columns:
    print(i)

is.python.main
other.lang.None
other.lang.Java
other.lang.JavaScript
other.lang.C/C++
other.lang.PHP
other.lang.C#
other.lang.Ruby
other.lang.Bash / Shell
other.lang.Objective-C
other.lang.Go
other.lang.Visual Basic
other.lang.Scala
other.lang.SQL
other.lang.Kotlin
other.lang.R
other.lang.Swift
other.lang.Clojure
other.lang.Perl
other.lang.Rust
other.lang.Groovy
other.lang.TypeScript
other.lang.CoffeeScript
other.lang.HTML/CSS
other.lang.Other
python.years
years.of.coding
main.purposes
other.purposes.Educational purposes
other.purposes.Data analysis
other.purposes.DevOps / System administration / Writing automation scripts
other.purposes.Software testing / Writing automated tests
other.purposes.Software prototyping
other.purposes.Web development
other.purposes.Machine learning
other.purposes.Mobile development
other.purposes.Desktop development
other.purposes.Computer graphics
other.purposes.Network programming
other.purposes.Game development
other.purposes.Multimedia applications deve

In [46]:
import collections
collections?

[1;31mType:[0m        module
[1;31mString form:[0m <module 'collections' from 'C:\\Users\\pestyl\\Anaconda3\\lib\\collections\\__init__.py'>
[1;31mFile:[0m        c:\users\pestyl\anaconda3\lib\collections\__init__.py
[1;31mDocstring:[0m  
This module implements specialized container datatypes providing
alternatives to Python's general purpose built-in containers, dict,
list, set, and tuple.

* namedtuple   factory function for creating tuple subclasses with named fields
* deque        list-like container with fast appends and pops on either end
* ChainMap     dict-like class for creating a single view of multiple mappings
* Counter      dict subclass for counting hashable objects
* OrderedDict  dict subclass that remembers the order entries were added
* defaultdict  dict subclass that calls a factory function to supply missing values
* UserDict     wrapper around dictionary objects for easier dict subclassing
* UserList     wrapper around list objects for easier list subclassin

The following code determines if there are any column names that have multiple values for a simple group. For example, database.mysql, database.sqlserver. If it find columns that do that, it removes them.

In [57]:
counter = collections.defaultdict(list)
for col in sorted(jb.columns):
    period_count = col.count('.')  ## Count the number of periods in each column name
    if period_count >= 2:
        part_end = 2
    else:
        part_end = 1
    parts = col.split('.')[:part_end] ## Ignore the first part of the column name <ignore>.xxx.xxx
    counter['.'.join(parts)].append(col)

uniq_cols = []
for cols in counter.values():
    if len(cols) == 1:
        uniq_cols.extend(cols)
        
uniq_cols

['age',
 'are.you.datascientist',
 'company.size',
 'country.live',
 'employment.status',
 'first.learn.about.main.ide',
 'how.often.use.main.ide',
 'ide.main',
 'is.python.main',
 'job.team',
 'main.purposes',
 'missing.features.main.ide',
 'nps.main.ide',
 'python.years',
 'python2.version.most',
 'python3.version.most',
 'several.projects',
 'team.size',
 'use.python.most',
 'years.of.coding']

In [58]:
(jb[uniq_cols]
 .rename(columns = lambda col: col.replace('.', '_'))
)

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python2_version_most,python3_version_most,several_projects,team_size,use_python_most,years_of_coding
0,30–39,,Just me,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",,,1–2 years
1,21–29,Yes,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",2-7 people,Software prototyping,3–5 years
2,30–39,No,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3–5 years
3,,,,,,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11+ years,,Python 3_8,"Yes, I work on many different projects",,Web development,11+ years
4,21–29,,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1–2 years,,Python 3_8,"Yes, I work on one main and several side projects",,Web development,Less than 1 year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,21–29,No,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6–10 years,,Python 3_6,"Yes, I work on many different projects",,Data analysis,1–2 years
54458,,No,,,,,,,Yes,,Both for work and personal,,,3–5 years,,Python 3_7,,,Web development,1–2 years
54459,21–29,,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",2-7 people,Web development,6–10 years
54460,30–39,Yes,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6–10 years,,Python 3_7,"Yes, I work on many different projects",,Data analysis,3–5 years


#### b. Substring and Convert a Column

In [65]:
(jb[uniq_cols]
 .rename(columns = lambda col: col.replace('.','_'))
 .age
 .value_counts()
)

21–29          9710
30–39          7512
40–49          3010
18–20          2567
50–59          1374
60 or older     588
Name: age, dtype: int64

I’m going to pull out the first two characters from the age column and convert it to numbers.
We will have to convert to float because there are missing values:

In [66]:
jb.age.dtypes

dtype('O')

In [59]:
jb.loc[:,'age']

0        30–39
1        21–29
2        30–39
3          NaN
4        21–29
         ...  
54457    21–29
54458      NaN
54459    21–29
54460    30–39
54461    21–29
Name: age, Length: 54462, dtype: object

In [60]:
jb.age.str.slice(0,2)

0         30
1         21
2         30
3        NaN
4         21
        ... 
54457     21
54458    NaN
54459     21
54460     30
54461     21
Name: age, Length: 54462, dtype: object

In [61]:
jb.age.str[0:2]

0         30
1         21
2         30
3        NaN
4         21
        ... 
54457     21
54458    NaN
54459     21
54460     30
54461     21
Name: age, Length: 54462, dtype: object

Note that currently, pandas (here is the bug12) can’t convert strings directly to 'Int64', you need to
convert to float first.

In [62]:
(jb[uniq_cols]
 .rename(columns = lambda col: col.replace('.','_'))
 .age.str.slice(0,2)
 .astype('float')
 .astype('Int64')
)

0          30
1          21
2          30
3        <NA>
4          21
         ... 
54457      21
54458    <NA>
54459      21
54460      30
54461      21
Name: age, Length: 54462, dtype: Int64

#### c. Create Column in a DataFrame Using Assign

In [68]:
jb[uniq_cols].head()

Unnamed: 0,age,are.you.datascientist,company.size,country.live,employment.status,first.learn.about.main.ide,how.often.use.main.ide,ide.main,is.python.main,job.team,main.purposes,missing.features.main.ide,nps.main.ide,python.years,python2.version.most,python3.version.most,several.projects,team.size,use.python.most,years.of.coding
0,30–39,,Just me,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",,,1–2 years
1,21–29,Yes,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",2-7 people,Software prototyping,3–5 years
2,30–39,No,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3–5 years
3,,,,,,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11+ years,,Python 3_8,"Yes, I work on many different projects",,Web development,11+ years
4,21–29,,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1–2 years,,Python 3_8,"Yes, I work on one main and several side projects",,Web development,Less than 1 year


When you call .assign you generally pass in a keyword argument corresponding to the column
name to create or update. You can assign the argument to a series, a scalar, or a function. You
will see that many of my examples use lambda functions.
Using a function (it can be a normal function, but often we use a lambda to have the logic
inline) has an unseen benefit. This function will accept the current state of the dataframe. If you
have done any filtering or manipulation in the chain before calling .assign, it will be represented
in this dataframe.

In [89]:
(jb[uniq_cols]
 .rename(columns = lambda cols: cols.replace('.','_'))
 .assign(age = lambda df_: df_.age
                              .str
                              .slice(0,2)
                              .astype('float')
                              .astype('Int64'))
)

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python2_version_most,python3_version_most,several_projects,team_size,use_python_most,years_of_coding
0,30,,Just me,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",,,1–2 years
1,21,Yes,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",2-7 people,Software prototyping,3–5 years
2,30,No,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3–5 years
3,,,,,,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11+ years,,Python 3_8,"Yes, I work on many different projects",,Web development,11+ years
4,21,,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1–2 years,,Python 3_8,"Yes, I work on one main and several side projects",,Web development,Less than 1 year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,21,No,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6–10 years,,Python 3_6,"Yes, I work on many different projects",,Data analysis,1–2 years
54458,,No,,,,,,,Yes,,Both for work and personal,,,3–5 years,,Python 3_7,,,Web development,1–2 years
54459,21,,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",2-7 people,Web development,6–10 years
54460,30,Yes,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6–10 years,,Python 3_7,"Yes, I work on many different projects",,Data analysis,3–5 years


#### d. More Column Cleanup

The are_you_datascientist column can be converted to a boolean column with the .replace method:

In [82]:
jb[uniq_cols].head()

Unnamed: 0,age,are.you.datascientist,company.size,country.live,employment.status,first.learn.about.main.ide,how.often.use.main.ide,ide.main,is.python.main,job.team,main.purposes,missing.features.main.ide,nps.main.ide,python.years,python2.version.most,python3.version.most,several.projects,team.size,use.python.most,years.of.coding
0,30–39,,Just me,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",,,1–2 years
1,21–29,Yes,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",2-7 people,Software prototyping,3–5 years
2,30–39,No,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3–5 years
3,,,,,,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11+ years,,Python 3_8,"Yes, I work on many different projects",,Web development,11+ years
4,21–29,,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1–2 years,,Python 3_8,"Yes, I work on one main and several side projects",,Web development,Less than 1 year


In [84]:
jb['are.you.datascientist'].value_counts()

No       13399
Yes       6710
Other     1155
Name: are.you.datascientist, dtype: int64

In [99]:
(jb[uniq_cols]
 .rename(columns = lambda cols: cols.replace('.','_'))
 .assign(age = lambda df_: df_.age
                              .str
                              .slice(0,2)
                              .astype('float')
                              .astype('Int64'),
         are_you_datascientist = lambda df_: df_.are_you_datascientist
                                                .replace({'Yes':True, 'No':False, np.nan:False})
        ).are_you_datascientist
)

0        False
1         True
2        False
3        False
4        False
         ...  
54457    False
54458    False
54459    False
54460     True
54461    False
Name: are_you_datascientist, Length: 54462, dtype: object

On to the next column. Let’s look at company_size.

In [101]:
jb['company.size'].value_counts()

51–500             4608
More than 5,000    3635
11–50              3507
2–10               2558
1,001–5,000        1934
Just me            1492
501–1,000          1165
Not sure            526
Name: company.size, dtype: int64