# 2.2 Wrangling

In this notebook, we practice with operations on tidy data frames.

In [1]:
# imports

import codecs
import json
import os
from collections import Counter, OrderedDict, defaultdict

import pandas as pd
import seaborn as sns

## Import the dataset
Let us load the tidy sample dataset into memory.

In [2]:
# Directory holding the tidy sample tables (one CSV file per data frame).
root_folder = "../data/bl_books/sample_tidy/"

# Read the four tables in one pass; the unpacking order mirrors the file-name order.
df_book, df_author, df_author_book, df_book_text = (
    pd.read_csv(os.path.join(root_folder, csv_name))
    for csv_name in ("df_book.csv", "df_author.csv", "df_author_book.csv", "df_book_text.csv")
)

## Take a look
Let's take another look at the dataset to refresh our memories.

In [3]:
# A single row is enough to recall which columns the book table has.
df_book.head(1)

Unnamed: 0,datefield,publisher,title,edition,place,issuance,first_pdf,number_volumes,identifier,fulltext_filename,type,genre
0,1841.0,Privately printed,"The Poetical Aviary, with a bird's-eye view of...",,Calcutta,monographic,lsidyv35c55757,1,196,000000196_01_text.json,poet,Poetry


In [4]:
# First rows of the author table; its row index is what author_id is matched
# against in the joins further down.
df_author.head(5)

Unnamed: 0,name
0,A. A.
1,"Abbott, Evelyn"
2,"A'BECKETT, Gilbert Abbott."
3,"Laffan, De Courcy - Mrs"
4,"Adams, W. H. Davenport (William Henry Davenport)"


In [5]:
# Link table: each row pairs a book_id with an author_id.
df_author_book.head(5)

Unnamed: 0,book_id,author_id
0,196,0
1,4047,1
2,5382,2
3,14627,3
4,17057,4


In [6]:
# Peek at the table that holds the full text of each book.
df_book_text.head(3)

Unnamed: 0,fulltext_filename,fulltext,book_id
0,000551646_01_text.json,"' -■"" ' LiLitr-- )Wm&, HISTORY OF THE...",551646
1,002674278_01_text.json,The Great Revolution of 1840. REMINISC...,2674278
2,001975731_01_text.json,THE REAR-GUARD OF THE REVOLUTION. BY E...,1975731


In [7]:
# (number of rows, number of columns) of the book table.
df_book.shape

(452, 12)

In [8]:
# Carve two small subsets out of df_book, selected on the "type" column,
# to experiment with the transformations below.
df_book_tragedy = df_book.loc[df_book["type"].eq("tragedy")]
df_book_comedy = df_book.loc[df_book["type"].eq("comedy")]

In [11]:
# Size of the tragedy subset: fewer rows than df_book, same columns.
df_book_tragedy.shape

(32, 12)

In [10]:
# Size of the comedy subset.
df_book_comedy.shape

(18, 12)

In [9]:
# Selection keeps the original index labels of df_book (they are not renumbered).
df_book_tragedy.head()

Unnamed: 0,datefield,publisher,title,edition,place,issuance,first_pdf,number_volumes,identifier,fulltext_filename,type,genre
16,1887.0,J. W. Jarvis & Son,"Arden of Feversham, a tragedy: reprinted from ...",,London,monographic,lsidyv35e77d2f,1,106624,000106624_01_text.json,tragedy,Drama
22,1776.0,J. Dodsley,"Semiramis, a tragedy, etc. [In verse.]",,London,monographic,lsidyv36799899,1,154917,000154917_01_text.json,tragedy,Drama
31,1794.0,For the Author,"The Fall of the French Monarchy; or, Louis XVI...",,London,monographic,lsidyv3679ae3b,1,215686,000215686_01_text.json,tragedy,Drama
65,1884.0,G. Bell & Sons,[Strafford: an historical tragedy.],[Another edition.] With notes and preface by E...,London,monographic,lsidyv35e8b99c,1,499453,000499453_01_text.json,tragedy,Drama
70,1821.0,John Murray,"Sardanapalus, a tragedy. The Two Foscari, a tr...",,London,monographic,lsidyv386108c3,1,558547,000558547_01_text.json,tragedy,Drama


## Set operations

In [12]:
# Python set refresher: a repeated element (the second 4) collapses to one entry.
A = {1, 2, 3, 4, 4, 5}
B = {1, 4, 5, 6, 7, 8}

In [13]:
# The duplicated 4 given at construction time appears only once.
print(A)

{1, 2, 3, 4, 5}


In [14]:
# Union: elements found in A, in B, or in both (same as A | B).
print(A.union(B))

{1, 2, 3, 4, 5, 6, 7, 8}


In [15]:
# Difference: elements of A that are not in B (same as A - B); order of operands matters.
print(A.difference(B))

{2, 3}


In [16]:
# Intersection: elements present in both A and B (same as A & B).
print(A.intersection(B))

{1, 4, 5}


In [17]:
# Reminder of the tragedy subset's size before applying set-like operations to frames.
df_book_tragedy.shape

(32, 12)

In [18]:
# Reminder of the comedy subset's size.
df_book_comedy.shape

(18, 12)

### Projection

In [19]:
# Projection: indexing with a list of column names keeps only those columns
# (the result is still a DataFrame, with all rows retained before head()).
df_book_comedy[["identifier","title"]].head(10)

Unnamed: 0,identifier,title
71,558610,Arnaldo; Gaddo; and other unacknowledged poems...
75,570834,"['Tis Well it's no Worse: a comedy, etc. [Adap..."
115,835116,"The Jew: a comedy, etc"
117,849202,"Psyche debauch'd, a comedy, as it was acted at..."
177,1552331,The Womens Conquest. A tragicomedy [in five ac...
183,1593978,Thomas Hardy's works
184,1594129,The hand of Ethelberta. A comedy in chapters ....
196,1710613,Love's Frailties: a comedy in five acts [in pr...
206,1793956,The Wedding Day; a comedy in two acts [and in ...
221,1920080,Looking Glass for my Poli-comedie actors in Eu...


### Union

In [20]:
# Union: stack the rows of the two frames on top of each other (columns are aligned by name).
df_temp = pd.concat([df_book_tragedy, df_book_comedy])

In [21]:
# The row count is the sum of the two inputs; the column count is unchanged.
df_temp.shape

(50, 12)

In [42]:
# concat does not de-duplicate: a frame concatenated with itself has twice the rows,
# so this behaves like SQL's UNION ALL rather than a true set union.
# reset_index(drop=False) moves the old index into a regular column, hence one extra column.
pd.concat([df_book_tragedy, df_book_tragedy]).reset_index(drop=False).shape

(64, 13)

In [23]:
# drop=True discards the index labels inherited from df_book and renumbers the rows from 0.
df_temp = pd.concat([df_book_tragedy, df_book_comedy]).reset_index(drop=True)

In [25]:
# Resetting the index with drop=True changes neither the row nor the column count.
df_temp.shape

(50, 12)

### Difference

In [30]:
# Difference: keep the tragedy rows whose index label does not occur in the comedy frame.
# Both frames were selected from df_book, so their index labels are comparable.
df_book_tragedy[~df_book_tragedy.index.isin(df_book_comedy.index)].shape

(32, 12)

In [31]:
# Boolean mask, one entry per tragedy row: True where its index label also appears among the comedies.
df_book_tragedy.index.isin(df_book_comedy.index)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [32]:
# ~ negates the mask element-wise, marking the rows to keep in the difference.
~df_book_tragedy.index.isin(df_book_comedy.index)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [33]:
# Sanity check: a frame minus itself has no rows left (the columns are preserved).
df_book_tragedy[~df_book_tragedy.index.isin(df_book_tragedy.index)].shape

(0, 12)

### Intersection

In [34]:
# Intersection: with no `on` argument, merge joins on every column the two frames share,
# so an inner merge keeps only rows found in both. Here a frame merged with itself
# gives back all of its rows.
pd.merge(right=df_book_tragedy, left=df_book_tragedy, how="inner").shape

(32, 12)

In [35]:
# The two subsets were selected on different "type" values, so they share no row.
pd.merge(right=df_book_tragedy, left=df_book_comedy, how="inner").shape

(0, 12)

## Joins

![](https://www.datasciencemadesimple.com/wp-content/uploads/2017/09/join-or-merge-in-python-pandas-1.png?ezimgfmt=ng:webp/ngcb1)

- **Inner Join** or Natural join: To keep only rows that match in both data frames, specify the argument `how='inner'`.
- **Outer Join** or Full outer join: To keep all rows from both data frames, specify `how='outer'`.
- **Left Join** or Left outer join: To include all the rows of your data frame x (the `left` argument) and only those from y (the `right` argument) that match, specify `how='left'`.
- **Right Join** or Right outer join: To include all the rows of your data frame y (the `right` argument) and only those from x (the `left` argument) that match, specify `how='right'`.

In [36]:
# Inner join between books and authors, going through the intermediate table df_author_book

In [60]:
# The book side of the join: note the "identifier" column, used as the join key below.
df_book_tragedy.head(5)

Unnamed: 0,datefield,publisher,title,edition,place,issuance,first_pdf,number_volumes,identifier,fulltext_filename,type,genre
16,1887.0,J. W. Jarvis & Son,"Arden of Feversham, a tragedy: reprinted from ...",,London,monographic,lsidyv35e77d2f,1,106624,000106624_01_text.json,tragedy,Drama
22,1776.0,J. Dodsley,"Semiramis, a tragedy, etc. [In verse.]",,London,monographic,lsidyv36799899,1,154917,000154917_01_text.json,tragedy,Drama
31,1794.0,For the Author,"The Fall of the French Monarchy; or, Louis XVI...",,London,monographic,lsidyv3679ae3b,1,215686,000215686_01_text.json,tragedy,Drama
65,1884.0,G. Bell & Sons,[Strafford: an historical tragedy.],[Another edition.] With notes and preface by E...,London,monographic,lsidyv35e8b99c,1,499453,000499453_01_text.json,tragedy,Drama
70,1821.0,John Murray,"Sardanapalus, a tragedy. The Two Foscari, a tr...",,London,monographic,lsidyv386108c3,1,558547,000558547_01_text.json,tragedy,Drama


In [61]:
# Step 1 of the books-to-authors join: pair each tragedy with its
# (book_id, author_id) link row. The key is named differently on each side.
df_book_tragedy_authors = df_author_book.merge(
    df_book_tragedy,
    how="inner",
    left_on="book_id",
    right_on="identifier",
)

In [62]:
# Link columns (book_id, author_id) now sit next to the book columns; book_id and identifier coincide.
df_book_tragedy_authors.head(3)

Unnamed: 0,book_id,author_id,datefield,publisher,title,edition,place,issuance,first_pdf,number_volumes,identifier,fulltext_filename,type,genre
0,106624,14,1887.0,J. W. Jarvis & Son,"Arden of Feversham, a tragedy: reprinted from ...",,London,monographic,lsidyv35e77d2f,1,106624,000106624_01_text.json,tragedy,Drama
1,154917,20,1776.0,J. Dodsley,"Semiramis, a tragedy, etc. [In verse.]",,London,monographic,lsidyv36799899,1,154917,000154917_01_text.json,tragedy,Drama
2,215686,29,1794.0,For the Author,"The Fall of the French Monarchy; or, Louis XVI...",,London,monographic,lsidyv3679ae3b,1,215686,000215686_01_text.json,tragedy,Drama


In [63]:
# Step 2: attach the author names. The index of df_author is matched against the
# author_id column of the frame built in step 1.
# This cell overwrites its own input, so a "name" column left over from a previous run
# is dropped first; otherwise re-running would yield name_x / name_y instead of "name".
# On a first run the drop is a no-op.
df_book_tragedy_authors = pd.merge(
    left=df_author,
    right=df_book_tragedy_authors.drop(columns=["name"], errors="ignore"),
    how="inner",
    left_index=True,
    right_on="author_id",
)

In [64]:
# The "name" column comes first; rows whose author record has no name show a missing value.
df_book_tragedy_authors.head(3)

Unnamed: 0,name,book_id,author_id,datefield,publisher,title,edition,place,issuance,first_pdf,number_volumes,identifier,fulltext_filename,type,genre
12,,1256889,7,1874.0,,"The Maid of Florence; or, a Woman's Vengeance....",,London,monographic,lsidyv35e87a54,1,1256889,001256889_01_text.json,tragedy,Drama
19,,2422448,7,1775.0,,Matilda: a tragedy [in five acts and in verse]...,,London,monographic,lsidyv367991fa,1,2422448,002422448_01_text.json,tragedy,Drama
22,,3091234,7,1861.0,,"Richard Cœur de Lion, an historical tragedy [i...",,London,monographic,lsidyv35e86dc3,1,3091234,003091234_01_text.json,tragedy,Drama


In [65]:
# Projection: keep only the columns used in the rest of the section.
df_book_tragedy_authors = df_book_tragedy_authors[["author_id","name","title","datefield","identifier"]]

In [66]:
# Same rows as before, restricted to the five selected columns.
df_book_tragedy_authors.head(3)

Unnamed: 0,author_id,name,title,datefield,identifier
12,7,,"The Maid of Florence; or, a Woman's Vengeance....",1874.0,1256889
19,7,,Matilda: a tragedy [in five acts and in verse]...,1775.0,2422448
22,7,,"Richard Cœur de Lion, an historical tragedy [i...",1861.0,3091234


In [67]:
# Trace one of the rows with a missing author name back through the tables: first the book itself...
df_book[df_book["identifier"] == 1256889]

Unnamed: 0,datefield,publisher,title,edition,place,issuance,first_pdf,number_volumes,identifier,fulltext_filename,type,genre
151,1874.0,,"The Maid of Florence; or, a Woman's Vengeance....",,London,monographic,lsidyv35e87a54,1,1256889,001256889_01_text.json,tragedy,Drama


In [68]:
# ...then the link row, which gives the author_id assigned to that book...
df_author_book[df_author_book["book_id"] == 1256889]

Unnamed: 0,book_id,author_id
151,1256889,7


In [69]:
# ...and finally the author record stored under that index label: its name is missing.
df_author[df_author.index == 7]

Unnamed: 0,name
7,


**Question**: try to drop the author_id 7 (no or unknown author) from the df_author table, and try to inner join again. What happens?

In [73]:
# Remove the authors whose name is missing, reporting the table size before and after.
print(df_author.shape)
df_author_reduced = df_author.loc[df_author["name"].notna()]
print(df_author_reduced.shape)

(365, 1)
(364, 1)


In [48]:
# Left join

In [77]:
# Look up the author stored under index label 31.
# NOTE(review): presumably inspected because 31 is the index label shown on the
# unmatched rows of the left-join output — confirm.
df_author_reduced.loc[31]

name    BEECHAM, John.
Name: 31, dtype: object

In [49]:
# Left join: every author of df_author_reduced is kept; the book columns are missing
# where an author has no tragedy. Both inputs carry a "name" column, so pandas
# disambiguates them as name_x (from the left frame) and name_y (from the right frame).
pd.merge(right=df_book_tragedy_authors, left=df_author_reduced, how="left", right_on="author_id", left_index=True).head(10)

Unnamed: 0,name_x,author_id,name_y,title,datefield,identifier
31,A. A.,0,,,,
31,"Abbott, Evelyn",1,,,,
31,"A'BECKETT, Gilbert Abbott.",2,,,,
31,"Laffan, De Courcy - Mrs",3,,,,
31,"Adams, W. H. Davenport (William Henry Davenport)",4,,,,
31,"ALGIE, James.",5,,,,
31,"ALLAN, J. T.",6,,,,
31,"Andersen, H. C. (Hans Christian)",8,,,,
31,"ANDERSON, Alexander - of Kirkconnell",9,,,,
31,"ANDREWS, Christopher Columbus.",10,,,,


In [50]:
# Right join

In [51]:
# Right join: every row of df_book_tragedy_authors is kept, including the books whose
# author_id no longer has an entry in df_author_reduced (name_x is then missing).
pd.merge(right=df_book_tragedy_authors, left=df_author_reduced, how="right", right_on="author_id", left_index=True).head(10)

Unnamed: 0,name_x,author_id,name_y,title,datefield,identifier
12,,7,,"The Maid of Florence; or, a Woman's Vengeance....",1874.0,1256889
19,,7,,Matilda: a tragedy [in five acts and in verse]...,1775.0,2422448
22,,7,,"Richard Cœur de Lion, an historical tragedy [i...",1861.0,3091234
26,,7,,"The King's Stratagem, or the Pearl of Poland. ...",1874.0,3492293
27,,7,,"The Count de Villeroi; or, the fate of Patriot...",1794.0,3795731
0,"ARDEN, Thomas - of Faversham",14,"ARDEN, Thomas - of Faversham","Arden of Feversham, a tragedy: reprinted from ...",1887.0,106624
1,"AYSCOUGH, George Edward.",20,"AYSCOUGH, George Edward.","Semiramis, a tragedy, etc. [In verse.]",1776.0,154917
2,"BARTHOLOMEW, John - Dramatist",29,"BARTHOLOMEW, John - Dramatist","The Fall of the French Monarchy; or, Louis XVI...",1794.0,215686
3,"BROWNING, Robert - the Poet",60,"BROWNING, Robert - the Poet",[Strafford: an historical tragedy.],1884.0,499453
4,"Byron, George Gordon Byron - Baron",65,"Byron, George Gordon Byron - Baron","Sardanapalus, a tragedy. The Two Foscari, a tr...",1821.0,558547


In [52]:
# Answer to the question above: inner join using the reduced author table

In [53]:
# Inner join: only rows with a match on both sides remain, so the books pointing at
# the author removed from df_author_reduced no longer appear.
pd.merge(right=df_book_tragedy_authors, left=df_author_reduced, how="inner", right_on="author_id", left_index=True).head(10)

Unnamed: 0,name_x,author_id,name_y,title,datefield,identifier
0,"ARDEN, Thomas - of Faversham",14,"ARDEN, Thomas - of Faversham","Arden of Feversham, a tragedy: reprinted from ...",1887.0,106624
1,"AYSCOUGH, George Edward.",20,"AYSCOUGH, George Edward.","Semiramis, a tragedy, etc. [In verse.]",1776.0,154917
2,"BARTHOLOMEW, John - Dramatist",29,"BARTHOLOMEW, John - Dramatist","The Fall of the French Monarchy; or, Louis XVI...",1794.0,215686
3,"BROWNING, Robert - the Poet",60,"BROWNING, Robert - the Poet",[Strafford: an historical tragedy.],1884.0,499453
4,"Byron, George Gordon Byron - Baron",65,"Byron, George Gordon Byron - Baron","Sardanapalus, a tragedy. The Two Foscari, a tr...",1821.0,558547
5,"CAUNTER, Richard Macdonald.",78,"CAUNTER, Richard Macdonald.","Attila, a tragedy; and other poems",1832.0,638231
6,"CLARKE, Joseph Ignatius Constantine.",83,"CLARKE, Joseph Ignatius Constantine.",Robert Emmet. A tragedy of Irish history. [A p...,1888.0,715387
7,"Cumberland, Richard",102,"Cumberland, Richard","The Battle of Hastings, a tragedy, etc",1778.0,835040
8,"DE LA PASTURE, Elizabeth Lydia Rosabelle - aft...",111,"DE LA PASTURE, Elizabeth Lydia Rosabelle - aft...",A Toy Tragedy,1894.0,897980
9,"Dryden, John.",119,"Dryden, John.",The Duke of Guise. A tragedy. Acted by their M...,1683.0,987705


**Questions**:

* Add the authors to the 'df_book_comedy' dataframe, dropping books without an author.
* How many books has the most prolific author in our dataset authored?
* Create a dataframe without the repeated name_y and name_x columns above.

In [57]:
# Alternative to re-joining: drop every row that has a missing value in any column
# (how="any"), which removes the books without an author name. The result is only
# displayed, not assigned.
df_book_tragedy_authors.dropna(how="any", axis=0)

Unnamed: 0,author_id,name,title,datefield,identifier
0,14,"ARDEN, Thomas - of Faversham","Arden of Feversham, a tragedy: reprinted from ...",1887.0,106624
1,20,"AYSCOUGH, George Edward.","Semiramis, a tragedy, etc. [In verse.]",1776.0,154917
2,29,"BARTHOLOMEW, John - Dramatist","The Fall of the French Monarchy; or, Louis XVI...",1794.0,215686
3,60,"BROWNING, Robert - the Poet",[Strafford: an historical tragedy.],1884.0,499453
4,65,"Byron, George Gordon Byron - Baron","Sardanapalus, a tragedy. The Two Foscari, a tr...",1821.0,558547
5,78,"CAUNTER, Richard Macdonald.","Attila, a tragedy; and other poems",1832.0,638231
6,83,"CLARKE, Joseph Ignatius Constantine.",Robert Emmet. A tragedy of Irish history. [A p...,1888.0,715387
7,102,"Cumberland, Richard","The Battle of Hastings, a tragedy, etc",1778.0,835040
8,111,"DE LA PASTURE, Elizabeth Lydia Rosabelle - aft...",A Toy Tragedy,1894.0,897980
9,119,"Dryden, John.",The Duke of Guise. A tragedy. Acted by their M...,1683.0,987705


In [58]:
# Count how often each author_id occurs in the link table, i.e. how many books
# are attributed to each author. Counter is imported in the imports cell at the top.
c = Counter(df_author_book["author_id"].values)

In [59]:
# Ten author ids with the highest number of books, as (author_id, count) pairs.
c.most_common(10)

[(7, 67),
 (60, 2),
 (65, 2),
 (85, 2),
 (102, 2),
 (119, 2),
 (136, 2),
 (148, 2),
 (156, 2),
 (193, 2)]

In [60]:
# The id at the top of the ranking is the record with a missing name, not an actual author.
df_author[df_author.index == 7]

Unnamed: 0,name
7,


In [61]:
# The next id in the ranking does have a name.
df_author[df_author.index == 60]

Unnamed: 0,name
60,"BROWNING, Robert - the Poet"


## Pivoting

This is bonus content!

For more (including stacking with multi-indexes and unpivoting or melting), see https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [62]:
# Toy table in long format: one row per (place, year) pair with its value.
data = dict(
    place=["London", "London", "Berlin", "Berlin", "Rome", "Rome"],
    year=[1800, 1900, 1800, 1900, 1800, 1900],
    values=[10, 20, 30, 40, 50, 60],
)
# The explicit column list fixes the column order of the frame.
toy_df = pd.DataFrame(data, columns=["place", "year", "values"])

In [63]:
# Long format: each place appears once per year.
toy_df

Unnamed: 0,place,year,values
0,London,1800,10
1,London,1900,20
2,Berlin,1800,30
3,Berlin,1900,40
4,Rome,1800,50
5,Rome,1900,60


In [64]:
# Reshape long -> wide: one row per year, one column per place, cells filled from "values".
pivoted = toy_df.pivot(index='year', columns='place', values='values')

In [65]:
# Wide format: the distinct places have become the column headers.
pivoted

place,Berlin,London,Rome
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1800,30,10,50
1900,40,20,60
