## Joining (merging) on indexes; ways of storing the output of a for loop.

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Here is a dataframe with some not-particularly interesting (uniformly distributed) numbers
p = pd.DataFrame(np.random.random(10))
p

Unnamed: 0,0
0,0.493416
1,0.939566
2,0.406957
3,0.904016
4,0.236298
5,0.535936
6,0.243642
7,0.809358
8,0.515593
9,0.314249


In [3]:
# Here is a dataframe with some not-particularly interesting (uniformly distributed) numbers
p = pd.DataFrame(np.random.random(10))
p

Unnamed: 0,0
0,0.999405
1,0.264872
2,0.102244
3,0.737411
4,0.01739
5,0.337991
6,0.431045
7,0.871132
8,0.153383
9,0.708265


In [4]:
# Here is a sample of two of the rows of this dataframe.
r = p.sample(2)
r

Unnamed: 0,0
7,0.871132
5,0.337991


In [5]:
# Note that the subset dataframe remembers the indexes of each row.
# I can use this to merge.

In [6]:
# And here I merge on the INDEXES.  
pr_merged = p.merge(r, how="outer"  , left_index=True, right_index=True )
pr_merged

Unnamed: 0,0_x,0_y
0,0.999405,
1,0.264872,
2,0.102244,
3,0.737411,
4,0.01739,
5,0.337991,0.337991
6,0.431045,
7,0.871132,0.871132
8,0.153383,
9,0.708265,


In [7]:
# If the NaN values are an issue, I can replace them with zero
pr_merged.fillna(0)

Unnamed: 0,0_x,0_y
0,0.999405,0.0
1,0.264872,0.0
2,0.102244,0.0
3,0.737411,0.0
4,0.01739,0.0
5,0.337991,0.337991
6,0.431045,0.0
7,0.871132,0.871132
8,0.153383,0.0
9,0.708265,0.0


In [8]:
pr_merged.dropna()

Unnamed: 0,0_x,0_y
5,0.337991,0.337991
7,0.871132,0.871132


In [9]:
child1 = pd.DataFrame ({"type": ["Snickers-funsize", "Snickers-minisize", "Twizzlers", "Nerds"], 
                        "number": [2, 4, 3, 1]})
child1

Unnamed: 0,type,number
0,Snickers-funsize,2
1,Snickers-minisize,4
2,Twizzlers,3
3,Nerds,1


In [10]:
child2 = pd.DataFrame ({"type": ["Snickers-minisize", "Twizzlers", "Payday", "Jolly Rancher", "M&Ms"], 
                        "number": [3, 2, 2, 3, 1]})
child2

Unnamed: 0,type,number
0,Snickers-minisize,3
1,Twizzlers,2
2,Payday,2
3,Jolly Rancher,3
4,M&Ms,1


In [11]:
child1.merge(child2, on="type")

Unnamed: 0,type,number_x,number_y
0,Snickers-minisize,4,3
1,Twizzlers,3,2


In [12]:
# Why does this give me only two rows?

In [13]:
merged_df = child1.merge(child2, on="type",  how="outer")
#merged_df = merged_df.fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,
4,Payday,,2.0
5,Jolly Rancher,,3.0
6,M&Ms,,1.0


In [14]:
merged_df = child1.merge(child2, on="type",  how="outer").fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,0.0
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,0.0
4,Payday,0.0,2.0
5,Jolly Rancher,0.0,3.0
6,M&Ms,0.0,1.0


In [15]:
# Here is a dataframe with some not-particularly interesting (uniformly distributed) numbers
p = pd.DataFrame(np.random.random(10))
p

Unnamed: 0,0
0,0.946138
1,0.093076
2,0.563943
3,0.73209
4,0.605462
5,0.464844
6,0.591626
7,0.794803
8,0.453648
9,0.82957


In [16]:
# Here is a sample of two of the rows of this dataframe.
r = p.sample(2)
r

Unnamed: 0,0
2,0.563943
6,0.591626


In [17]:
# Note that the subset dataframe remembers the indexes of each row.
# I can use this to merge.

In [18]:
# And here I join on the INDEXES.  
pr_merged = p.merge(r, how="outer"  , left_index=True, right_index=True )
pr_merged

Unnamed: 0,0_x,0_y
0,0.946138,
1,0.093076,
2,0.563943,0.563943
3,0.73209,
4,0.605462,
5,0.464844,
6,0.591626,0.591626
7,0.794803,
8,0.453648,
9,0.82957,


In [19]:
pr_merged.fillna(0)

Unnamed: 0,0_x,0_y
0,0.946138,0.0
1,0.093076,0.0
2,0.563943,0.563943
3,0.73209,0.0
4,0.605462,0.0
5,0.464844,0.0
6,0.591626,0.591626
7,0.794803,0.0
8,0.453648,0.0
9,0.82957,0.0


In [20]:
child1 = pd.DataFrame ({"type": ["Snickers-funsize", "Snickers-minisize", "Twizzlers", "Nerds"], 
                        "number": [2, 4, 3, 1]})
child1

Unnamed: 0,type,number
0,Snickers-funsize,2
1,Snickers-minisize,4
2,Twizzlers,3
3,Nerds,1


In [21]:
child2 = pd.DataFrame ({"type": ["Snickers-minisize", "Twizzlers", "Payday", "Jolly Rancher", "M&Ms"], 
                        "number": [3, 2, 2, 3, 1]})
child2

Unnamed: 0,type,number
0,Snickers-minisize,3
1,Twizzlers,2
2,Payday,2
3,Jolly Rancher,3
4,M&Ms,1


In [22]:
child1.merge(child2, on="type")

Unnamed: 0,type,number_x,number_y
0,Snickers-minisize,4,3
1,Twizzlers,3,2


In [23]:
# Why does this give me only two rows?

In [24]:
merged_df = child1.merge(child2, on="type",  how="outer")
#merged_df = merged_df.fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,
4,Payday,,2.0
5,Jolly Rancher,,3.0
6,M&Ms,,1.0


In [25]:
merged_df = child1.merge(child2, on="type",  how="outer").fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,0.0
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,0.0
4,Payday,0.0,2.0
5,Jolly Rancher,0.0,3.0
6,M&Ms,0.0,1.0


In [26]:
# Here is a sample of two of the rows of this dataframe.
r = p.sample(2)
r

Unnamed: 0,0
5,0.464844
2,0.563943


In [27]:
# Note that the subset dataframe remembers the indexes of each row.
# I can use this to merge.

In [28]:
# And here I merge on the INDEXES.  
pr_merged = p.merge(r, how="outer"  , left_index=True, right_index=True )
pr_merged

Unnamed: 0,0_x,0_y
0,0.946138,
1,0.093076,
2,0.563943,0.563943
3,0.73209,
4,0.605462,
5,0.464844,0.464844
6,0.591626,
7,0.794803,
8,0.453648,
9,0.82957,


In [29]:
# If the NaN values are an issue, I can replace them with zero
pr_merged.fillna(0)

Unnamed: 0,0_x,0_y
0,0.946138,0.0
1,0.093076,0.0
2,0.563943,0.563943
3,0.73209,0.0
4,0.605462,0.0
5,0.464844,0.464844
6,0.591626,0.0
7,0.794803,0.0
8,0.453648,0.0
9,0.82957,0.0


In [30]:
pr_merged.dropna()

Unnamed: 0,0_x,0_y
2,0.563943,0.563943
5,0.464844,0.464844


In [31]:
child1 = pd.DataFrame ({"type": ["Snickers-funsize", "Snickers-minisize", "Twizzlers", "Nerds"], 
                        "number": [2, 4, 3, 1]})
child1

Unnamed: 0,type,number
0,Snickers-funsize,2
1,Snickers-minisize,4
2,Twizzlers,3
3,Nerds,1


In [32]:
child2 = pd.DataFrame ({"type": ["Snickers-minisize", "Twizzlers", "Payday", "Jolly Rancher", "M&Ms"], 
                        "number": [3, 2, 2, 3, 1]})
child2

Unnamed: 0,type,number
0,Snickers-minisize,3
1,Twizzlers,2
2,Payday,2
3,Jolly Rancher,3
4,M&Ms,1


In [33]:
child1.merge(child2, on="type")

Unnamed: 0,type,number_x,number_y
0,Snickers-minisize,4,3
1,Twizzlers,3,2


In [34]:
# Why does this give me only two rows?

In [35]:
merged_df = child1.merge(child2, on="type",  how="outer")
#merged_df = merged_df.fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,
4,Payday,,2.0
5,Jolly Rancher,,3.0
6,M&Ms,,1.0


In [36]:
merged_df = child1.merge(child2, on="type",  how="outer").fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,0.0
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,0.0
4,Payday,0.0,2.0
5,Jolly Rancher,0.0,3.0
6,M&Ms,0.0,1.0


In [37]:
# Here is a dataframe with some not-particularly interesting (uniformly distributed) numbers
p = pd.DataFrame(np.random.random(10))
p

Unnamed: 0,0
0,0.254464
1,0.665185
2,0.60161
3,0.801856
4,0.02608
5,0.179229
6,0.245878
7,0.487954
8,0.243729
9,0.910497


In [38]:
# Here is a sample of two of the rows of this dataframe.
r = p.sample(2)
r

Unnamed: 0,0
8,0.243729
7,0.487954


In [39]:
# Note that the subset dataframe remembers the indexes of each row.
# I can use this to merge.

In [40]:
# And here I join on the INDEXES.  
pr_merged = p.merge(r, how="outer"  , left_index=True, right_index=True )
pr_merged

Unnamed: 0,0_x,0_y
0,0.254464,
1,0.665185,
2,0.60161,
3,0.801856,
4,0.02608,
5,0.179229,
6,0.245878,
7,0.487954,0.487954
8,0.243729,0.243729
9,0.910497,


In [41]:
pr_merged.fillna(0)

Unnamed: 0,0_x,0_y
0,0.254464,0.0
1,0.665185,0.0
2,0.60161,0.0
3,0.801856,0.0
4,0.02608,0.0
5,0.179229,0.0
6,0.245878,0.0
7,0.487954,0.487954
8,0.243729,0.243729
9,0.910497,0.0


In [42]:
child1 = pd.DataFrame ({"type": ["Snickers-funsize", "Snickers-minisize", "Twizzlers", "Nerds"], 
                        "number": [2, 4, 3, 1]})
child1

Unnamed: 0,type,number
0,Snickers-funsize,2
1,Snickers-minisize,4
2,Twizzlers,3
3,Nerds,1


In [43]:
child2 = pd.DataFrame ({"type": ["Snickers-minisize", "Twizzlers", "Payday", "Jolly Rancher", "M&Ms"], 
                        "number": [3, 2, 2, 3, 1]})
child2

Unnamed: 0,type,number
0,Snickers-minisize,3
1,Twizzlers,2
2,Payday,2
3,Jolly Rancher,3
4,M&Ms,1


In [44]:
child1.merge(child2, on="type")

Unnamed: 0,type,number_x,number_y
0,Snickers-minisize,4,3
1,Twizzlers,3,2


In [45]:
# Why does this give me only two rows?

In [46]:
merged_df = child1.merge(child2, on="type",  how="outer")
#merged_df = merged_df.fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,
4,Payday,,2.0
5,Jolly Rancher,,3.0
6,M&Ms,,1.0


In [47]:
merged_df = child1.merge(child2, on="type",  how="outer").fillna(0)
merged_df

Unnamed: 0,type,number_x,number_y
0,Snickers-funsize,2.0,0.0
1,Snickers-minisize,4.0,3.0
2,Twizzlers,3.0,2.0
3,Nerds,1.0,0.0
4,Payday,0.0,2.0
5,Jolly Rancher,0.0,3.0
6,M&Ms,0.0,1.0


In [48]:
# Keeping track of data.
# Suppose I'm writing a for loop to calculate all the squares from 1 to 400.
# And suppose I'd like to sum these first 20 squares.
# The syntax I need depends on which data structure I use to keep my temporary
# data in.

In [49]:
# The plain for loop calculates what I want, and keeps a running tally
# of the sum so far.  
n=0
for i in range(1,20+1):
    print(i, i**2)
    n = n+i**2
print(n)

1 1
2 4
3 9
4 16
5 25
6 36
7 49
8 64
9 81
10 100
11 121
12 144
13 169
14 196
15 225
16 256
17 289
18 324
19 361
20 400
2870


In [50]:
# This keeps the partial results in a python list that gets
# longer with each iteration.  np.sum() is smart enough
# to add up the elements of a list even though it isn't a numpy 
# data object.
# Note:  the running_list doesn't keep track of an index, other than
# the order of the items (0...19) that are added to the list.
running_list=[]
for i in range(1,20+1):
    print(i, i**2)
    running_list.append(i**2)
print(running_list)
np.sum(running_list)

1 1
2 4
3 9
4 16
5 25
6 36
7 49
8 64
9 81
10 100
11 121
12 144
13 169
14 196
15 225
16 256
17 289
18 324
19 361
20 400
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400]


2870

In [51]:
# Adding one row at a time to a numpy array:
# requires that we create an empty numpy array
# and then add one item at a time.
# NOTE: this array does not store an index apart from the 0...19 row position.
running_np=np.array([])
for i in range(1,20+1):
    print(i, i**2)
    running_np = np.append(running_np, i**2)
print(running_np)
running_np.sum()

1 1
2 4
3 9
4 16
5 25
6 36
7 49
8 64
9 81
10 100
11 121
12 144
13 169
14 196
15 225
16 256
17 289
18 324
19 361
20 400
[  1.   4.   9.  16.  25.  36.  49.  64.  81. 100. 121. 144. 169. 196.
 225. 256. 289. 324. 361. 400.]


2870.0

In [52]:
# Adding one row at a time to a dataframe 
running_df=pd.DataFrame()
for i in range(1,20+1):
    print(i, i**2)
    running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
running_df

1 1
2 4
3 9
4 16
5 25
6 36
7 49
8 64
9 81
10 100
11 121
12 144
13 169
14 196
15 225
16 256
17 289
18 324
19 361
20 400


  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( pd.DataFrame( [i**2], index=[i] ))
  running_df = running_df.append( 

Unnamed: 0,0
1,1
2,4
3,9
4,16
5,25
6,36
7,49
8,64
9,81
10,100


In [53]:
# But I get a warning that says that df.append() is deprecated, so I should
# use pd.concat:

# Adding one row at a time to a dataframe with concat
running_df=pd.DataFrame()
for i in range(1,20+1):
    print(i, i**2)
    running_df = pd.concat( [running_df, pd.DataFrame( [i**2], index=[i] )])
running_df


1 1
2 4
3 9
4 16
5 25
6 36
7 49
8 64
9 81
10 100
11 121
12 144
13 169
14 196
15 225
16 256
17 289
18 324
19 361
20 400


Unnamed: 0,0
1,1
2,4
3,9
4,16
5,25
6,36
7,49
8,64
9,81
10,100


In [54]:
# Instead of a for loop, I could use elementwise operations to 
# calculate the squares:
p = pd.DataFrame({"i": np.arange(1,20+1)})
p["i^2"] = p.i**2
p

Unnamed: 0,i,i^2
0,1,1
1,2,4
2,3,9
3,4,16
4,5,25
5,6,36
6,7,49
7,8,64
8,9,81
9,10,100


In [55]:
def square(x):
    return x**2
# Instead of a for loop, I could use .apply() on one column to 
# calculate the squares:
p = pd.DataFrame({"i": np.arange(1,20+1)})
p["i^2"] = p.i.apply(square)
p

Unnamed: 0,i,i^2
0,1,1
1,2,4
2,3,9
3,4,16
4,5,25
5,6,36
6,7,49
7,8,64
8,9,81
9,10,100
