# Petfinder.com adoption factor exploration by Paige McKenzie

Implements the methods discussed in the related [blog post](#).

The data can be acquired using the associated `scraper.py` script.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
%pylab inline

In [None]:
# Load the scraped listings; the first CSV column becomes the index and the two
# timestamp columns are parsed as datetimes.
pets = pd.read_csv('pets_new.csv', index_col=0, parse_dates=['published_at', 'status_changed_at'])

# Each of these columns holds the string repr of a Python container; expand every
# one into separate '<col>_<subcol>' columns and drop the original.
# NOTE(review): `eval` executes whatever the CSV cell contains. Only run this on a
# file you produced yourself; consider `ast.literal_eval` if the cells are pure literals.
for col in ['attributes', 'breeds', 'colors', 'environment']:
    # Reuse pets' own index: a frame built from a plain list would get a fresh
    # 0..n-1 index and `concat(axis=1)` would align rows on the wrong labels
    # whenever the CSV index is not exactly 0..n-1.
    expanded = pd.DataFrame(pets[col].apply(eval).tolist(), index=pets.index)
    expanded = expanded.rename(columns=lambda subcol: '{}_{}'.format(col, subcol))
    pets = pd.concat([pets, expanded], axis=1)
    del pets[col]

In [None]:
def _clean_name(name):
    """Reduce a raw listing name to a single lowercase pet name.

    Steps, in order:
      1. lowercase
      2. drop anything inside parentheses
      3. drop anything between asterisks
      4. drop digit runs that end a word (e.g. trailing ID numbers)
      5. keep only the text before the first "- " or ", "
      6. keep only the first name when "&" / "and" joins several names
      7. replace everything except word characters, apostrophes and spaces
      8. collapse whitespace runs and strip the ends

    Non-string values (e.g. NaN for a missing name) are returned unchanged
    instead of raising inside ``re``.
    """
    if not isinstance(name, str):
        return name
    name = name.lower()
    name = re.sub(r"\(.+\)", " ", name)
    name = re.sub(r"\*.+\*", " ", name)
    name = re.sub(r"\d+\b", " ", name)
    name = re.split(r"[-,]\s+", name)[0]
    # `\b` keeps "and" from matching at the end of a longer word
    # ("roland jr" used to be cut down to "rol").
    name = re.split(r"(?:&|\band) ", name)[0]
    name = re.sub(r"[^\w' ]", " ", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()


pets['clean_name'] = pets['name'].apply(_clean_name)

In [None]:
# Distinct "clean" tokens (lowercase letters and apostrophes only) found anywhere
# in the descriptions; missing descriptions count as empty strings.

a = {
    match.group(0)
    for match in re.finditer(r"\b[a-z']+\b", ' '.join(cats['description'].fillna('')))
}

In [None]:
# Reduce every collected word to its English Snowball stem and show the distinct stems
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
set(map(stemmer.stem, a))

In [None]:
# Per description: lowercase it, then list every run of characters that are
# neither digits nor whitespace (bounded by word boundaries).
cats['description'].fillna('').str.lower().apply(re.compile(r"\b[^\d\s]+\b").findall)

In [None]:
# Distinct tokens across all descriptions, where a token contains no digit,
# whitespace, comma, slash or hyphen.
a = set(
    re.compile(r"\b[^\d\s,\/\-]+\b").findall(' '.join(cats['description'].fillna('')))
)

In [None]:
# Imported in this cell so it also works when the notebook is run top to bottom
# (the only other import of `pos_tag` sits in a later cell).
from nltk import pos_tag

# Words whose part-of-speech tag starts with 'JJ' when each word is tagged on its own
[word for word in a if pos_tag([word])[0][1].startswith('JJ')]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-count vectorizer capped at 500 features
vect = CountVectorizer(max_features=500)

# Learn the vocabulary from the descriptions; missing descriptions become empty strings
vect.fit(raw_documents=cats['description'].fillna(''))

In [None]:
# Imported in this cell so it also works when the notebook is run top to bottom
# (the only other import of `pos_tag` sits in a later cell).
from nltk import pos_tag

# Flag each vocabulary term whose tag starts with 'JJ' or equals 'VBN'.
# The terms are tagged as one sequence, in the vocabulary dict's own order.
vocab_terms = list(vect.vocabulary_)
criteria = pd.Series([tag.startswith('JJ') or tag == 'VBN' for word, tag in pos_tag(vocab_terms)],
                     index=vocab_terms)

# `vect.vocabulary_` maps term -> column index of the transformed matrix, but the
# dict's iteration order is not that column order. Label the columns by sorting
# the terms on their index; passing the dict itself attaches counts to the wrong
# terms. Then keep only the flagged terms.
pd.DataFrame(vect.transform(cats['description'].fillna('')).todense(),
             columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
             index=cats.index).loc[:, criteria[criteria].index]

In [None]:
from nltk import pos_tag, word_tokenize


def _keep_adjectives(sentence):
    """Return the tokens of the lowercased sentence tagged 'JJ*', joined by spaces."""
    tagged_tokens = pos_tag(word_tokenize(sentence.lower()))
    return ' '.join(token for token, tag in tagged_tokens if tag.startswith('JJ'))


# First 500 descriptions only; missing descriptions are treated as empty strings
cats['description'].head(500).fillna('').apply(_keep_adjectives)

In [None]:
# Term-count matrix for every pet description, one column per vocabulary term.
# `vect.vocabulary_` maps term -> column index, and the dict's iteration order is
# not the column order, so the labels are sorted by index before being attached.
# Missing descriptions are vectorized as empty strings, as in the other cells;
# without `fillna` a NaN description makes `transform` fail.
pd.DataFrame(vect.transform(pets['description'].fillna('')).todense(), index=pets.index,
             columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get))

In [None]:
def calc_lift(a, b):
    """Return the lift between two boolean indicators.

    Computed as ``len(a) * count(a & b) / (count(a) * count(b))``, i.e. how much
    more often the two events occur together than their individual frequencies
    would give if they were independent.

    Parameters
    ----------
    a, b : boolean array-likes supporting ``.sum()`` and ``&``
        (e.g. pandas Series or NumPy arrays).

    Returns
    -------
    float
        The lift, or NaN when either event never occurs (the ratio is undefined
        there; previously this raised ZeroDivisionError).
    """
    total_size = len(a)
    num_a = a.sum()
    num_b = b.sum()
    num_a_b = (a & b).sum()
    denominator = float(num_a * num_b)
    if denominator == 0:
        return float('nan')
    return total_size * float(num_a_b) / denominator

#pd.Series({name:calc_lift(name, 'XL', pets) for name in pets['clean_name'].unique()}).sort_values(ascending=False)

In [None]:
# Restrict the analysis to listings whose `animal` value is exactly 'Dog'
pets = pets.loc[pets['animal'].eq('Dog')]

In [None]:
# Per-pet breed counts: one row per original pet index, one column per breed value.
# NOTE(review): assumes each `breeds` cell is a list-like of breed names - confirm.
breeds = (
    pets['breeds']
    .apply(pd.Series)      # spread each cell's entries across columns
    .stack()               # back to one entry per row, keyed by (pet index, position)
    .pipe(pd.get_dummies)  # indicator column per distinct breed value
    .groupby(level=0)      # regroup the entries belonging to the same pet
    .sum()
)

In [None]:
# Lift between "mix == 'yes'" and "Pit Bull Terrier indicator equals 1"
calc_lift(
    a=pets['mix'].eq('yes'),
    b=breeds['Pit Bull Terrier'].eq(1),
)

In [None]:
# Days between the pull date and the calendar date of the last update,
# expressed in 30-day "months".
pets['time_since_update'] = (
    pets['date_pulled']
    .sub(pd.to_datetime(pets['lastUpdate']).dt.date)
    .dt.days
    .div(30)
)

# Histogram of that age across all listings
pets['time_since_update'].hist(bins=30)
plt.title("Distribution of time since the listing was updated")
plt.show()

In [None]:
# Treat listings not updated for 20 or more (30-day) months as abandoned and drop
# them; rows with a missing `time_since_update` are dropped as well.
pets = pets.loc[pets['time_since_update'].lt(20)]