In [1]:
import pandas as pd
import Levenshtein as lev

In [2]:
may_docs = pd.read_csv('volume_representations_may_3.csv')
apr_docs = pd.read_csv('volume_representations_apr_3.csv')
dec_docs = pd.read_csv('volume_representations_dec_21.csv')

### Preview

In [3]:
may_docs.head()

Unnamed: 0,volume,doc_id,doc_title
0,frus1917-72PubDipv06,frus1917-72PubDipv06d1,1. Memorandum From Thomas C.\n ...
1,frus1917-72PubDipv06,frus1917-72PubDipv06d2,2. Memorandum Prepared by Thomas C.\n ...
2,frus1917-72PubDipv06,frus1917-72PubDipv06d3,3. Report Prepared by Deputy Director-Designat...
3,frus1917-72PubDipv06,frus1917-72PubDipv06d4,4. Memorandum From James J.\n ...
4,frus1917-72PubDipv06,frus1917-72PubDipv06d5,5. Report Prepared by the Task Force on the Un...


### Compare # of docs

In [4]:
print('December 21:', len(dec_docs), "\n",
      'April 3:', len(apr_docs), "\n",
      'May 3:', len(may_docs))

December 21: 306329 
 April 3: 307048 
 May 3: 307048


In [5]:
print('Doc count increased by',
      (len(apr_docs) - len(dec_docs)),
      'between December and April')

Doc count increased by 719 between December and April


### Compare dataframe contents

In [6]:
may_docs.equals(apr_docs)

False

In [7]:
apr_docs.equals(dec_docs)

False

### Compare document ids listed

In [8]:
set(may_docs['doc_id']) == set(apr_docs['doc_id'])

True

In [9]:
set(apr_docs['doc_id']) == set(dec_docs['doc_id'])

False

### Get doc id diffs between Dec 2018 and April 2019

In [10]:
added = list(set(apr_docs['doc_id']).difference(set(dec_docs['doc_id'])))
lost  = list(set(dec_docs['doc_id']).difference(set(apr_docs['doc_id'])))

In [11]:
print(len(added), "new document IDs were added \n\n", added)

719 new document IDs were added 

 ['frus1977-80v24d6', 'frus1977-80v12d237', 'frus1977-80v12d318', 'frus1977-80v24d170', 'frus1977-80v24d209', 'frus1977-80v12d32', 'frus1977-80v24d282', 'frus1977-80v12d232', 'frus1977-80v12d227', 'frus1977-80v12d1', 'frus1977-80v24d219', 'frus1977-80v24d157', 'frus1977-80v12d87', 'frus1977-80v24d17', 'frus1977-80v24d289', 'frus1977-80v12d201', 'frus1977-80v24d295', 'frus1977-80v12d97', 'frus1977-80v24d159', 'frus1977-80v24d197', 'frus1977-80v12d298', 'frus1977-80v24d353', 'frus1977-80v24d90', 'frus1977-80v12d156', 'frus1977-80v12d167', 'frus1977-80v24d13', 'frus1977-80v12d239', 'frus1977-80v12d70', 'frus1977-80v24d114', 'frus1977-80v24d155', 'frus1977-80v24d126', 'frus1977-80v12d46', 'frus1977-80v12d303', 'frus1977-80v24d348', 'frus1977-80v24d80', 'frus1977-80v12d66', 'frus1977-80v24d115', 'frus1977-80v24d49', 'frus1977-80v24d150', 'frus1977-80v24d235', 'frus1977-80v24d350', 'frus1977-80v24d304', 'frus1977-80v12d307', 'frus1977-80v12d125', 'frus1977-8

In [12]:
print(len(lost), "docs were removed \n\n", lost)

0 docs were removed 

 []


### Get doc id diffs between April 2019 and May 2019

In [13]:
added_2 = list(set(may_docs['doc_id']).difference(set(apr_docs['doc_id'])))
lost_2  = list(set(apr_docs['doc_id']).difference(set(may_docs['doc_id'])))

In [14]:
print(len(added_2), "docs were added \n\n", added_2)

0 docs were added 

 []


In [15]:
print(len(lost_2), "docs were removed \n\n", lost_2)

0 docs were removed 

 []


### For docs ids that have stayed present, compare titles to ensure IDs have consistently referred to the same document

In [16]:
may_docs.columns = ['volume', 'doc_id', 'may_doc_title']
apr_docs.columns = ['volume', 'doc_id', 'apr_doc_title']
dec_docs.columns = ['volume', 'doc_id', 'dec_doc_title']

In [17]:
merged_docs = pd.merge(may_docs, apr_docs, on='doc_id', how='right')
merged_docs = pd.merge(merged_docs, dec_docs, on='doc_id', how='right')
merged_docs = merged_docs.drop(['volume', 'volume_x', 'volume_y'], axis=1)
merged_docs = merged_docs.replace(r'\\n',' ', regex=True)
merged_docs = merged_docs.replace(r'\\t',' ', regex=True)
merged_docs = merged_docs.replace(r'\n',' ', regex=True)

In [18]:
merged_docs['may_doc_title'] = merged_docs['may_doc_title'].apply(str)
merged_docs['apr_doc_title'] = merged_docs['apr_doc_title'].apply(str)
merged_docs['dec_doc_title'] = merged_docs['dec_doc_title'].apply(str)

In [19]:
changed = []

In [20]:
for index, row in merged_docs.iterrows():
    if not (row['may_doc_title'] == row['apr_doc_title'] == row['dec_doc_title']):
        changed.append(row)

In [21]:
changed_df = pd.DataFrame(changed)
changed_df.head()

Unnamed: 0,doc_id,may_doc_title,apr_doc_title,dec_doc_title
7230,frus1961-63v11d186,186. Memorandum From the Joint Chiefs of Staff...,186. Memorandum From the Joint Chiefs of Staff...,186. Memorandum From the Joint Chiefs of Staff...
9783,frus1952-54v01p2d295,The Under Secretary of State’s Special Assista...,The Under Secretary of State’s Special Assista...,The Under Secretary of State’s Special Assista...
26149,frus1961-63v12d11,11. Memorandum of Conversation,11. Memorandum of Conversation,11.Memorandum of Conversation
43264,frus1964-68v04d348,348. Telegram From the Embassy in Poland to ...,348. Telegram From the Embassy in Poland to ...,348.Telegram From the Embassy in Poland to ...
53467,frus1961-63v14d15,15. Memorandum of Conversation,15. Memorandum of Conversation,15.Memorandum of Conversation


In [22]:
dec_april_distance = []
for index, row in changed_df.iterrows():
    dec_april_distance.append(lev.distance(row['dec_doc_title'], row['apr_doc_title']))

changed_df['distance_dec_to_apr'] = dec_april_distance

In [23]:
may_april_distance = []
for index, row in changed_df.iterrows():
    may_april_distance.append(lev.distance(row['may_doc_title'], row['apr_doc_title']))

changed_df['distance_may_to_apr'] = may_april_distance

In [24]:
changed_df.head()

Unnamed: 0,doc_id,may_doc_title,apr_doc_title,dec_doc_title,distance_dec_to_apr,distance_may_to_apr
7230,frus1961-63v11d186,186. Memorandum From the Joint Chiefs of Staff...,186. Memorandum From the Joint Chiefs of Staff...,186. Memorandum From the Joint Chiefs of Staff...,1,0
9783,frus1952-54v01p2d295,The Under Secretary of State’s Special Assista...,The Under Secretary of State’s Special Assista...,The Under Secretary of State’s Special Assista...,0,1
26149,frus1961-63v12d11,11. Memorandum of Conversation,11. Memorandum of Conversation,11.Memorandum of Conversation,1,0
43264,frus1964-68v04d348,348. Telegram From the Embassy in Poland to ...,348. Telegram From the Embassy in Poland to ...,348.Telegram From the Embassy in Poland to ...,1,0
53467,frus1961-63v14d15,15. Memorandum of Conversation,15. Memorandum of Conversation,15.Memorandum of Conversation,1,0


In [25]:
changed_df.to_csv('changelist.csv', encoding='utf-8', index=False)