In [1]:
# Import the dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn import utils
from collections import Counter
import os
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load the shelter-dog records from the CSV file into a DataFrame and preview it
csv_path = Path('ShelterDogs.csv')
shelter_dogs_df = pd.read_csv(csv_path)
shelter_dogs_df.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,2019-12-10,2019-12-11,2019-12-11,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,2019-12-01,2019-12-01,2019-12-09,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.0,male,Unknown Mix,2019-12-08,2019-12-23,2019-12-08,saddle back,short,medium,no,,,,,,,
3,23795,,1.0,male,Unknown Mix,2019-12-08,2019-12-23,2019-12-08,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.0,female,French Bulldog Mix,2019-12-10,2019-12-11,2019-12-11,black,short,small,no,,,,,,,


In [3]:
# Remove columns that are not used downstream. 'name' is dropped because some
# of its values are null; 'ID' is used to identify rows instead.
unused_columns = ['date_found', 'name', 'keep_in', 'neutered']
shelter_dogs_df = shelter_dogs_df.drop(columns=unused_columns)
shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,breed,adoptable_from,posted,color,coat,size,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats
0,23807,0.25,female,Unknown Mix,2019-12-11,2019-12-11,red,short,small,,,,,,
1,533,0.17,female,Unknown Mix,2019-12-01,2019-12-09,black and white,short,small,,yes,yes,yes,yes,yes
2,23793,4.0,male,Unknown Mix,2019-12-23,2019-12-08,saddle back,short,medium,,,,,,
3,23795,1.0,male,Unknown Mix,2019-12-23,2019-12-08,yellow-brown,medium,medium,,,,,,
4,23806,2.0,female,French Bulldog Mix,2019-12-11,2019-12-11,black,short,small,,,,,,


In [4]:
# Parse 'posted' as datetimes, then subtract it from the data pull date
# (2019-12-12) to get how long each dog has been listed, as a timedelta.
data_pull_date = np.datetime64('2019-12-12')
posted_dates = pd.to_datetime(shelter_dogs_df['posted'])
shelter_dogs_df['posted'] = posted_dates
shelter_dogs_df['shelter_time'] = data_pull_date - shelter_dogs_df['posted']

In [5]:
# Preview only: show shelter_time expressed as whole days. The frame returned
# by assign() is displayed but not stored, so shelter_dogs_df is not modified.
days_in_shelter = shelter_dogs_df['shelter_time'].dt.days
shelter_dogs_df.assign(shelter_time=days_in_shelter)

Unnamed: 0,ID,age,sex,breed,adoptable_from,posted,color,coat,size,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time
0,23807,0.25,female,Unknown Mix,2019-12-11,2019-12-11,red,short,small,,,,,,,1
1,533,0.17,female,Unknown Mix,2019-12-01,2019-12-09,black and white,short,small,,yes,yes,yes,yes,yes,3
2,23793,4.00,male,Unknown Mix,2019-12-23,2019-12-08,saddle back,short,medium,,,,,,,4
3,23795,1.00,male,Unknown Mix,2019-12-23,2019-12-08,yellow-brown,medium,medium,,,,,,,4
4,23806,2.00,female,French Bulldog Mix,2019-12-11,2019-12-11,black,short,small,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,male,Unknown Mix,2003-12-25,2006-03-22,yellow-brown,short,medium,no,yes,yes,no,yes,no,5013
2933,262,17.33,female,Staffordshire Terrier Mix,2004-08-27,2005-07-08,striped,short,large,,,,,,,5270
2934,4,18.17,male,Unknown Mix,2005-09-21,2005-10-26,black,short,medium,,,,,,,5160
2935,141,17.17,male,Unknown Mix,2004-11-27,2005-05-02,black and brown,medium,medium,,,,,,,5337


In [6]:
# Replace the timedelta values in "shelter_time" with integer day counts
days_in_shelter = shelter_dogs_df["shelter_time"].dt.days
shelter_dogs_df["shelter_time"] = days_in_shelter.astype(int)
shelter_dogs_df

Unnamed: 0,ID,age,sex,breed,adoptable_from,posted,color,coat,size,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time
0,23807,0.25,female,Unknown Mix,2019-12-11,2019-12-11,red,short,small,,,,,,,1
1,533,0.17,female,Unknown Mix,2019-12-01,2019-12-09,black and white,short,small,,yes,yes,yes,yes,yes,3
2,23793,4.00,male,Unknown Mix,2019-12-23,2019-12-08,saddle back,short,medium,,,,,,,4
3,23795,1.00,male,Unknown Mix,2019-12-23,2019-12-08,yellow-brown,medium,medium,,,,,,,4
4,23806,2.00,female,French Bulldog Mix,2019-12-11,2019-12-11,black,short,small,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,male,Unknown Mix,2003-12-25,2006-03-22,yellow-brown,short,medium,no,yes,yes,no,yes,no,5013
2933,262,17.33,female,Staffordshire Terrier Mix,2004-08-27,2005-07-08,striped,short,large,,,,,,,5270
2934,4,18.17,male,Unknown Mix,2005-09-21,2005-10-26,black,short,medium,,,,,,,5160
2935,141,17.17,male,Unknown Mix,2004-11-27,2005-05-02,black and brown,medium,medium,,,,,,,5337


In [7]:
# Drop the remaining columns that are no longer needed from here on
extra_columns = ['adoptable_from', 'posted', 'color', 'size']
shelter_dogs_df = shelter_dogs_df.drop(columns=extra_columns)

In [8]:
# Label each dog by its time in the shelter:
# 1800 days or more -> "high", fewer than 1800 days -> "low"
long_stay = shelter_dogs_df['shelter_time'].astype(int) >= 1800
shelter_dogs_df.loc[long_stay, 'availability_likely'] = "high"
shelter_dogs_df.loc[~long_stay, 'availability_likely'] = "low"

In [9]:
# Treat missing answers in the yes/no trait columns as 'no'
trait_columns = [
    'housebroken',
    'likes_people',
    'likes_children',
    'get_along_males',
    'get_along_females',
    'get_along_cats',
]
shelter_dogs_df[trait_columns] = shelter_dogs_df[trait_columns].fillna('no')

# A missing breed is recorded as 'Unknown Mix'
shelter_dogs_df['breed'] = shelter_dogs_df['breed'].fillna('Unknown Mix')

In [10]:
# Ten most frequent breed labels and how often each occurs
breed_counts = shelter_dogs_df['breed'].value_counts()
breed_counts.head(10)

Unknown Mix                  1524
German Shepherd Dog Mix       190
Dachshund Mix                 147
Labrador Retriever Mix         83
Staffordshire Terrier Mix      62
Puli Mix                       40
German Shepherd Dog            37
Schnauzer Mix                  34
Fox Terrier Mix                32
Greyhound Mix                  32
Name: breed, dtype: int64

In [11]:
# Collapse breed into two categories: keep 'Unknown Mix' as-is and relabel
# every other breed as 'Known'.
# An exact equality test is required here: `x in ('Unknown Mix')` is a
# substring check against the string 'Unknown Mix' (parentheses alone do not
# create a tuple), so values such as 'Mix' or '' would wrongly be left as-is.
is_unknown_mix = shelter_dogs_df['breed'] == 'Unknown Mix'
shelter_dogs_df['breed'] = shelter_dogs_df['breed'].where(is_unknown_mix, 'Known')

In [12]:
# Number of distinct breed labels (missing values are not counted)
shelter_dogs_df['breed'].nunique(dropna=True)

2

In [13]:
# Show the frame after the cleaning and labelling steps above
shelter_dogs_df

Unnamed: 0,ID,age,sex,breed,coat,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,availability_likely
0,23807,0.25,female,Unknown Mix,short,no,no,no,no,no,no,1,low
1,533,0.17,female,Unknown Mix,short,no,yes,yes,yes,yes,yes,3,low
2,23793,4.00,male,Unknown Mix,short,no,no,no,no,no,no,4,low
3,23795,1.00,male,Unknown Mix,medium,no,no,no,no,no,no,4,low
4,23806,2.00,female,Known,short,no,no,no,no,no,no,1,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,male,Unknown Mix,short,no,yes,yes,no,yes,no,5013,high
2933,262,17.33,female,Known,short,no,no,no,no,no,no,5270,high
2934,4,18.17,male,Unknown Mix,short,no,no,no,no,no,no,5160,high
2935,141,17.17,male,Unknown Mix,medium,no,no,no,no,no,no,5337,high


In [14]:
# Rename the 'ID' column to 'id_tag' on the existing frame
column_renames = {'ID': 'id_tag'}
shelter_dogs_df.rename(columns=column_renames, inplace=True)

In [15]:
# TODO: revisit the age/breed breakdown (age_breed_df.head()) -- still needs to be completed

In [17]:
## combined
# Encode the sex, breed and availability_likely labels as 0/1 codes
sex_codes = {'male': 0, 'female': 1}
breed_codes = {'Unknown Mix': 0, 'Known': 1}
availability_codes = {'low': 0, 'high': 1}

shelter_dogs_df['sex'] = shelter_dogs_df['sex'].map(sex_codes)
shelter_dogs_df['breed'] = shelter_dogs_df['breed'].map(breed_codes)
shelter_dogs_df['availability_likely'] = shelter_dogs_df['availability_likely'].map(availability_codes)

In [18]:
# Preview the first five rows after encoding
shelter_dogs_df.head(5)

Unnamed: 0,id_tag,age,sex,breed,coat,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,availability_likely
0,23807,0.25,1,0,short,no,no,no,no,no,no,1,0
1,533,0.17,1,0,short,no,yes,yes,yes,yes,yes,3,0
2,23793,4.0,0,0,short,no,no,no,no,no,no,4,0
3,23795,1.0,0,0,medium,no,no,no,no,no,no,4,0
4,23806,2.0,1,1,short,no,no,no,no,no,no,1,0


In [19]:
import joblib
import pickle

In [20]:
## Combined Machine Learning
# Features: age, breed and sex; target: the availability_likely label.
# Selecting with a list of column names keeps X two-dimensional (a DataFrame),
# while y is selected as a single column.
feature_columns = ['age', 'breed', 'sex']
target_column = 'availability_likely'

X = shelter_dogs_df[feature_columns]
y = shelter_dogs_df[target_column]

print("Shape: ", X.shape, y.shape)

Shape:  (2937, 3) (2937,)


In [21]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test parts; random_state=1 fixes the
# shuffle so the split is reproducible.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [22]:
from sklearn.linear_model import LogisticRegression
# Create a logistic-regression classifier with no arguments overridden,
# then display it
classifier_2 = LogisticRegression()
classifier_2

LogisticRegression()

In [23]:
# Fit the classifier on the training split
classifier_2.fit(X=X_train, y=y_train)

LogisticRegression()

In [24]:
# Score the fitted classifier on the training split and on the test split
train_score = classifier_2.score(X_train, y_train)
test_score = classifier_2.score(X_test, y_test)
print(f"(Combined) Training Data Score: {train_score}")
print(f" (Combined) Testing Data Score: {test_score}")

(Combined) Training Data Score: 0.8188010899182562
 (Combined) Testing Data Score: 0.8258503401360544


In [25]:
# Compare the first ten test labels with the model's predictions for those rows
first_ten_actual = y_test[:10]
first_ten_predicted = classifier_2.predict(X_test[:10])
print(f'Actual:\t\t{list(first_ten_actual)}')
print(f'Predicted:\t{list(first_ten_predicted)}')

Actual:		[1, 1, 1, 1, 0, 1, 1, 0, 1, 1]
Predicted:	[1, 1, 1, 1, 0, 1, 1, 1, 1, 1]


In [26]:
# Predict every test row and show the predictions next to the true labels
predictions = classifier_2.predict(X_test)
comparison = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(comparison)

Unnamed: 0,Prediction,Actual
1294,1,1
2718,1,1
2709,1,1
2026,1,1
351,0,0
...,...,...
2401,1,1
1748,1,1
2063,1,1
816,0,0


In [27]:
# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises.
with open('model_1.pkl', 'wb') as model_file:
    pickle.dump(classifier_2, model_file)

In [28]:
# Reload the saved classifier; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('model_1.pkl', 'rb') as model_file:
    model = pickle.load(model_file)