# Library Usage in Seattle 2005-2020

## Data Cleaning

### Import required libraries

In [1]:
# standard dataframe packages
import pandas as pd
import numpy as np

# saving packages
import pickle
import gzip

### Load checkout data

Since the data set is so large, I'll specify only the columns that I want in the DataFrame. This will effectively drop the following columns:
- `ID`
- `CheckoutYear`
- `BibNumber`
- `ItemBarcode`
- `ItemType`
    
I want to note that the `ItemType` and `Collection` columns are very similar, but the code in the `Collection` column contains more information within the `category_group` column that I add onto the DataFrame using the `data_dictionary.csv` file (see below). More specifically, the `ItemType` code yields mostly "Miscellaneous" results, whereas the `Collection` code yields differentiates between "Fiction" and "Nonfiction", among others. This could be useful information later on, so I found it best to drop the `ItemType` column.

In [2]:
%%time

# columns to load
usecols = ['Collection', 'CallNumber', 'ItemTitle', 'Subjects', 'CheckoutDateTime']

# load data (first 1 million rows)
df = pd.read_csv('data/Checkouts_By_Title__Physical_Items_.csv', nrows=1000000, usecols=usecols)

# rename columns to my preferred format
df.columns = ['collection', 'call_number', 'title', 'subjects', 'date']

CPU times: user 2.34 s, sys: 157 ms, total: 2.5 s
Wall time: 2.5 s


In [3]:
df.head()

Unnamed: 0,collection,call_number,title,subjects,date
0,nadvd,DVD FIREWAL,Firewall,"Kidnapping Drama, Video recordings for the hea...",02/13/2008 07:38:00 PM
1,nanf,793.2 C7744B 2001,best baby shower book a complete guide for par...,Showers Parties,07/23/2008 02:53:00 PM
2,nyfic,YA WESTERF,Uglies,"Fantasy, Teenage girls Fiction, Beauty Persona...",12/23/2009 04:20:00 PM
3,napar,618.4 L9511D 2009,doula guide to birth secrets every pregnant wo...,"Doulas, Childbirth",11/16/2010 12:04:00 PM
4,canf,641.692 M8216S 2005,Salmon a cookbook,Cookery Salmon,04/26/2009 01:29:00 PM


In [4]:
# check shape
df.shape

(1000000, 5)

In [5]:
# check for nan values
df.isna().sum()

collection         0
call_number      267
title          10600
subjects       18609
date               0
dtype: int64

The most important columns (`collection` and `date`) have no NaN values.

In [6]:
# check datatypes
df.dtypes

collection     object
call_number    object
title          object
subjects       object
date           object
dtype: object

### Convert `date` column to datetime

In [7]:
# look at an example before conversion
df.loc[0, 'date']

'02/13/2008 07:38:00 PM'

In [8]:
# specify the format
dt_format = '%m/%d/%Y %I:%M:%S %p'

In [9]:
# convert to datetime, dropping the hour-minute-second stamp using the `dt.date` attribute
df['date'] = pd.to_datetime(df.date, format=dt_format).dt.date

# confirm it worked
df.loc[0, 'date']

datetime.date(2008, 2, 13)

### Load other info from data dictionary

In [10]:
# load data
dd = pd.read_csv('data/data_dictionary.csv')

# rename columns to my preferred format
dd.columns = ['code', 'description', 'code_type', 'format_group', 'format_subgroup', 
              'category_group', 'category_subgroup', 'age_group']

# take a look
dd.head()

Unnamed: 0,code,description,code_type,format_group,format_subgroup,category_group,category_subgroup,age_group
0,cazover,CA7-zine collection oversize,ItemCollection,Print,Book,Periodical,,Adult
1,caziner,CA7-zine collection reference,ItemCollection,Print,Book,Periodical,,Adult
2,cazval,CA7-zine collection valuable mat.,ItemCollection,Print,Book,Periodical,,Adult
3,nga,Northgate Branch,Location,,,,,
4,hip,High Point Branch,Location,,,,,


In [11]:
# check shape
dd.shape

(580, 8)

In [12]:
# check for nan values
dd.isna().sum()

code                   0
description            0
code_type              0
format_group          35
format_subgroup       86
category_group        36
category_subgroup    533
age_group             35
dtype: int64

Again, with the size of the eventual DataFrame in mind, I want to drop any unnecessary columns before merging, so I'll drop the following columns:
- `code_type`, since that is superfluous information
- `category_subgroup`, since that is mostly NaN values

In [13]:
# drop columns
dd.drop(columns=['code_type', 'category_subgroup'], inplace=True)

In [14]:
# merge checkouts dataframe with info from data dictionary
df_merged = df.merge(dd, left_on='collection', right_on='code')

# take a look
df_merged.head()

Unnamed: 0,collection,call_number,title,subjects,date,code,description,format_group,format_subgroup,category_group,age_group
0,nadvd,DVD FIREWAL,Firewall,"Kidnapping Drama, Video recordings for the hea...",2008-02-13,nadvd,"NA-DVD, Fiction",Media,Video Disc,Fiction,Adult
1,nadvd,DVD MARLEY,Marley me,"Comedy films, Married people Drama, Philadelph...",2009-07-03,nadvd,"NA-DVD, Fiction",Media,Video Disc,Fiction,Adult
2,nadvd,DVD SIX FEE Season 4,Six feet under The complete fourth season,"Video recordings for the hearing impaired, Pro...",2008-10-26,nadvd,"NA-DVD, Fiction",Media,Video Disc,Fiction,Adult
3,nadvd,DVD DOCTOR,Doctor Who The next doctor,"London England Drama, Doctor Who Fictitious ch...",2010-11-10,nadvd,"NA-DVD, Fiction",Media,Video Disc,Fiction,Adult
4,nadvd,DVD SCHOOL,School ties,"Antisemitism Drama, Video recordings for the h...",2008-12-28,nadvd,"NA-DVD, Fiction",Media,Video Disc,Fiction,Adult


After merging, I can now drop the `collection` and `code` columns, since those are no longer necessary.

In [15]:
# drop columns
df_merged.drop(columns=['collection', 'code'], inplace=True)

### Set `date` column as index

In [16]:
# set `date` column as index, sort by index, and drop it outside the index
# note: in the past, the dropping of the column was done by default, but that no longer seems to be the case?
df_merged = df_merged.set_index(df_merged.date).sort_index().drop(columns='date')

# take a look
df_merged.head()

Unnamed: 0_level_0,call_number,title,subjects,description,format_group,format_subgroup,category_group,age_group
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-01-02,J394.2683 JACKSON 1994,winter solstice,"Winter solstice, Winter solstice Juvenile lite...",NC-Children's Holiday,Print,Book,Fiction,Juvenile
2008-01-02,CD 782.42166 Si56R,Remixed reimagined,"Remixes, Jazz vocals, Popular music 2001 2010",NA-Compact Discs,Media,Audio Disc,Nonfiction,Adult
2008-01-02,YA DESSEN,This lullaby a novel,"Mothers and daughters Fiction, Musicians Ficti...",NY-Teen - Fiction,Print,Book,Fiction,Teen
2008-01-02,YA TRUEMAN,Cruise control,"Basketball Fiction, Fathers and sons Fiction, ...",NY-Teen - Fiction,Print,Book,Fiction,Teen
2008-01-02,FIC BLOOM2007,Away a novel,"Immigrants New York State New York Fiction, Lo...",NA-Fiction,Print,Book,Fiction,Adult


#### NOTE: I may be able to drop even more columns (thinking especially of `call_number`, `title`, and `subjects`), since I'll mostly be looking at sheer numbers of items checked out each day. I'll keep them in for now in case they end up being useful for EDA.

### 💾 Save/Load

In [17]:
%%time

# uncomment to save
with gzip.open('data/seattle_lib_temp.pkl', 'wb') as goodbye:
    pickle.dump(df_merged, goodbye, protocol=pickle.HIGHEST_PROTOCOL)
    
# # uncomment to load
# with gzip.open('data/seattle_lib_temp.pkl', 'rb') as hello:
#     df = pickle.load(hello)

CPU times: user 25.3 s, sys: 261 ms, total: 25.5 s
Wall time: 25.6 s


#### NOTE: File size for first million rows is just under 38 MB; takes 25 seconds to save.

## GRAVEYARD

In [18]:
please break code

SyntaxError: invalid syntax (<ipython-input-18-b8306b2d38fe>, line 1)

In [None]:
dd[dd.code_type == 'ItemType']

In [None]:
dd[dd.code == 'nadvd']

In [None]:
dd[dd.code == 'acdvd']

In [None]:
dd.code_type.unique()

In [None]:
dd_item = dd[dd.code_type == 'ItemType'][['code', 'description', 'format_group', 'format_subgroup', 'category_group', 
             'category_subgroup', 'age_group']]

dd_item.head()

In [None]:
dd_item2 = dd[dd.code_type == 'ItemCollection'][['code', 'description', 'format_group', 'format_subgroup', 'category_group', 
             'category_subgroup', 'age_group']]

dd_item2.head()

In [None]:
sorted(df.item_type.unique())

In [None]:
dd_loc = dd[dd.code_type == 'Location'][['code', 'description']]

dd_loc.head()

In [None]:
test = df.merge(dd_item, left_on='item_type', right_on='code')
# test = test.merge(dd_loc, left_on='collection', right_on='code')

test.head()

In [None]:
test.isna().sum()

In [None]:
test.format_group.value_counts()

In [None]:
test.collection.unique()

In [None]:
test.shape

In [None]:
test2 = df.merge(dd_item2, left_on='Collection', right_on='code')

test2.head()

In [None]:
test.groupby('format_group').category_group.value_counts()

In [None]:
test2.groupby('format_group').category_group.value_counts()

In [None]:
dd[dd.code == 'nybot']

In [None]:
sorted(df.collection.unique())

In [None]:
dd_loc.code.unique()

In [None]:
[cod for cod in df.collection.unique() if cod in dd_loc.code.unique()]