In [1]:
# Import the dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn import utils
from collections import Counter
import os
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Import and read the csv file
shelter_dogs_df = pd.read_csv(Path('./Resources/ShelterDogs.csv'))
shelter_dogs_df.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in
0,23807,Gida,0.25,female,Unknown Mix,2019-12-10,2019-12-11,2019-12-11,red,short,small,no,,,,,,,
1,533,Frida És Ricsi,0.17,female,Unknown Mix,2019-12-01,2019-12-01,2019-12-09,black and white,short,small,no,,yes,yes,yes,yes,yes,
2,23793,,4.0,male,Unknown Mix,2019-12-08,2019-12-23,2019-12-08,saddle back,short,medium,no,,,,,,,
3,23795,,1.0,male,Unknown Mix,2019-12-08,2019-12-23,2019-12-08,yellow-brown,medium,medium,no,,,,,,,
4,23806,Amy,2.0,female,French Bulldog Mix,2019-12-10,2019-12-11,2019-12-11,black,short,small,no,,,,,,,


In [3]:
# Drop non-beneficial columns
shelter_dogs_df = shelter_dogs_df.drop('date_found', axis=1)


# Drop non-beneficial columns --- because some name values are null, dropping column and using ID
shelter_dogs_df = shelter_dogs_df.drop('name', axis = 1)


# Drop non-beneficial columns
shelter_dogs_df = shelter_dogs_df.drop('keep_in', axis =1)
shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,breed,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats
0,23807,0.25,female,Unknown Mix,2019-12-11,2019-12-11,red,short,small,no,,,,,,
1,533,0.17,female,Unknown Mix,2019-12-01,2019-12-09,black and white,short,small,no,,yes,yes,yes,yes,yes
2,23793,4.0,male,Unknown Mix,2019-12-23,2019-12-08,saddle back,short,medium,no,,,,,,
3,23795,1.0,male,Unknown Mix,2019-12-23,2019-12-08,yellow-brown,medium,medium,no,,,,,,
4,23806,2.0,female,French Bulldog Mix,2019-12-11,2019-12-11,black,short,small,no,,,,,,


In [4]:
# Transform posted date and compare to data pull date
shelter_dogs_df['posted']= pd.to_datetime(shelter_dogs_df['posted'])
shelter_dogs_df['shelter_time']=np.datetime64('2019-12-12')- shelter_dogs_df['posted']

In [5]:
# Add shelter_time column (days dog in shelter)
shelter_dogs_df.assign(shelter_time = shelter_dogs_df['shelter_time'].dt.days)

Unnamed: 0,ID,age,sex,breed,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time
0,23807,0.25,female,Unknown Mix,2019-12-11,2019-12-11,red,short,small,no,,,,,,,1
1,533,0.17,female,Unknown Mix,2019-12-01,2019-12-09,black and white,short,small,no,,yes,yes,yes,yes,yes,3
2,23793,4.00,male,Unknown Mix,2019-12-23,2019-12-08,saddle back,short,medium,no,,,,,,,4
3,23795,1.00,male,Unknown Mix,2019-12-23,2019-12-08,yellow-brown,medium,medium,no,,,,,,,4
4,23806,2.00,female,French Bulldog Mix,2019-12-11,2019-12-11,black,short,small,no,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,male,Unknown Mix,2003-12-25,2006-03-22,yellow-brown,short,medium,no,no,yes,yes,no,yes,no,5013
2933,262,17.33,female,Staffordshire Terrier Mix,2004-08-27,2005-07-08,striped,short,large,yes,,,,,,,5270
2934,4,18.17,male,Unknown Mix,2005-09-21,2005-10-26,black,short,medium,no,,,,,,,5160
2935,141,17.17,male,Unknown Mix,2004-11-27,2005-05-02,black and brown,medium,medium,no,,,,,,,5337


In [6]:
# convert column "shelter_time" of a DataFrame
shelter_dogs_df["shelter_time"] = shelter_dogs_df["shelter_time"].dt.days.astype(int)
shelter_dogs_df

Unnamed: 0,ID,age,sex,breed,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time
0,23807,0.25,female,Unknown Mix,2019-12-11,2019-12-11,red,short,small,no,,,,,,,1
1,533,0.17,female,Unknown Mix,2019-12-01,2019-12-09,black and white,short,small,no,,yes,yes,yes,yes,yes,3
2,23793,4.00,male,Unknown Mix,2019-12-23,2019-12-08,saddle back,short,medium,no,,,,,,,4
3,23795,1.00,male,Unknown Mix,2019-12-23,2019-12-08,yellow-brown,medium,medium,no,,,,,,,4
4,23806,2.00,female,French Bulldog Mix,2019-12-11,2019-12-11,black,short,small,no,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,male,Unknown Mix,2003-12-25,2006-03-22,yellow-brown,short,medium,no,no,yes,yes,no,yes,no,5013
2933,262,17.33,female,Staffordshire Terrier Mix,2004-08-27,2005-07-08,striped,short,large,yes,,,,,,,5270
2934,4,18.17,male,Unknown Mix,2005-09-21,2005-10-26,black,short,medium,no,,,,,,,5160
2935,141,17.17,male,Unknown Mix,2004-11-27,2005-05-02,black and brown,medium,medium,no,,,,,,,5337


In [7]:
# drop additional columns
shelter_dogs_df = shelter_dogs_df.drop('adoptable_from', axis=1)

shelter_dogs_df = shelter_dogs_df.drop('posted', axis=1)


shelter_dogs_df = shelter_dogs_df.drop('color', axis =1)

shelter_dogs_df = shelter_dogs_df.drop('size', axis=1)

In [8]:
# set conditions for adoptability
shelter_dogs_df.loc[shelter_dogs_df['shelter_time'].astype(int)>= 1800, 'adoptability']=0

# set conditions for adoptability 
shelter_dogs_df.loc[shelter_dogs_df['shelter_time'].astype(int)< 1800, 'adoptability']=1

In [9]:
# fill in NAs with 0
shelter_dogs_df.housebroken = shelter_dogs_df.housebroken.fillna('no')
shelter_dogs_df.likes_people = shelter_dogs_df.likes_people.fillna('no')
shelter_dogs_df.likes_children = shelter_dogs_df.likes_children.fillna('no')
shelter_dogs_df.get_along_males = shelter_dogs_df.get_along_males.fillna('no')
shelter_dogs_df.get_along_females = shelter_dogs_df.get_along_females.fillna('no')
shelter_dogs_df.get_along_cats = shelter_dogs_df.get_along_cats.fillna('no')
shelter_dogs_df.breed= shelter_dogs_df.breed.fillna('Unknown Mix')

In [10]:
# change variables to binaries
shelter_dogs_df.sex = shelter_dogs_df.sex.map({'male': 0, 'female': 1})
shelter_dogs_df.housebroken = shelter_dogs_df.housebroken.map({'no': 0, 'yes': 1})
shelter_dogs_df.likes_people = shelter_dogs_df.likes_people.map({'no': 0, 'yes': 1})
shelter_dogs_df.likes_children = shelter_dogs_df.likes_children.map({'no': 0, 'yes': 1})
shelter_dogs_df.get_along_males = shelter_dogs_df.get_along_males.map({'no': 0, 'yes': 1})
shelter_dogs_df.get_along_females = shelter_dogs_df.get_along_females.map({'no': 0, 'yes': 1})
shelter_dogs_df.get_along_cats = shelter_dogs_df.get_along_cats.map({'no': 0, 'yes': 1})

In [11]:
shelter_dogs_df

Unnamed: 0,ID,age,sex,breed,coat,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,adoptability
0,23807,0.25,1,Unknown Mix,short,no,0,0,0,0,0,0,1,1.0
1,533,0.17,1,Unknown Mix,short,no,0,1,1,1,1,1,3,1.0
2,23793,4.00,0,Unknown Mix,short,no,0,0,0,0,0,0,4,1.0
3,23795,1.00,0,Unknown Mix,medium,no,0,0,0,0,0,0,4,1.0
4,23806,2.00,1,French Bulldog Mix,short,no,0,0,0,0,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,0,Unknown Mix,short,no,0,1,1,0,1,0,5013,0.0
2933,262,17.33,1,Staffordshire Terrier Mix,short,yes,0,0,0,0,0,0,5270,0.0
2934,4,18.17,0,Unknown Mix,short,no,0,0,0,0,0,0,5160,0.0
2935,141,17.17,0,Unknown Mix,medium,no,0,0,0,0,0,0,5337,0.0


In [12]:
# create categorical dummies for neutered variable
spay_neuter = pd.get_dummies(shelter_dogs_df.neutered, prefix='Spayed_Neutered')
spay_neuter.head()

Unnamed: 0,Spayed_Neutered_no,Spayed_Neutered_yes
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [13]:
# Drop original neutered column
shelter_dogs_df = shelter_dogs_df.join(spay_neuter)
shelter_dogs_df.drop(['neutered'], axis=1, inplace=True)

shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,breed,coat,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,adoptability,Spayed_Neutered_no,Spayed_Neutered_yes
0,23807,0.25,1,Unknown Mix,short,0,0,0,0,0,0,1,1.0,1,0
1,533,0.17,1,Unknown Mix,short,0,1,1,1,1,1,3,1.0,1,0
2,23793,4.0,0,Unknown Mix,short,0,0,0,0,0,0,4,1.0,1,0
3,23795,1.0,0,Unknown Mix,medium,0,0,0,0,0,0,4,1.0,1,0
4,23806,2.0,1,French Bulldog Mix,short,0,0,0,0,0,0,1,1.0,1,0


In [14]:
# create categorical dummies for coat variable
coat = pd.get_dummies(shelter_dogs_df.coat, prefix='Coat')
coat.head()

Unnamed: 0,Coat_long,Coat_medium,Coat_short,Coat_wirehaired
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,0,1,0


In [15]:
# Drop original coat column 
shelter_dogs_df = shelter_dogs_df.join(coat)
shelter_dogs_df.drop(['coat'], axis=1, inplace=True)

shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,breed,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,adoptability,Spayed_Neutered_no,Spayed_Neutered_yes,Coat_long,Coat_medium,Coat_short,Coat_wirehaired
0,23807,0.25,1,Unknown Mix,0,0,0,0,0,0,1,1.0,1,0,0,0,1,0
1,533,0.17,1,Unknown Mix,0,1,1,1,1,1,3,1.0,1,0,0,0,1,0
2,23793,4.0,0,Unknown Mix,0,0,0,0,0,0,4,1.0,1,0,0,0,1,0
3,23795,1.0,0,Unknown Mix,0,0,0,0,0,0,4,1.0,1,0,0,1,0,0
4,23806,2.0,1,French Bulldog Mix,0,0,0,0,0,0,1,1.0,1,0,0,0,1,0


In [17]:
shelter_dogs_df

Unnamed: 0,ID,age,sex,breed,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,adoptability,Spayed_Neutered_no,Spayed_Neutered_yes,Coat_long,Coat_medium,Coat_short,Coat_wirehaired
0,23807,0.25,1,Unknown Mix,0,0,0,0,0,0,1,1.0,1,0,0,0,1,0
1,533,0.17,1,Unknown Mix,0,1,1,1,1,1,3,1.0,1,0,0,0,1,0
2,23793,4.00,0,Unknown Mix,0,0,0,0,0,0,4,1.0,1,0,0,0,1,0
3,23795,1.00,0,Unknown Mix,0,0,0,0,0,0,4,1.0,1,0,0,1,0,0
4,23806,2.00,1,French Bulldog Mix,0,0,0,0,0,0,1,1.0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,0,Unknown Mix,0,1,1,0,1,0,5013,0.0,1,0,0,0,1,0
2933,262,17.33,1,Staffordshire Terrier Mix,0,0,0,0,0,0,5270,0.0,0,1,0,0,1,0
2934,4,18.17,0,Unknown Mix,0,0,0,0,0,0,5160,0.0,1,0,0,0,1,0
2935,141,17.17,0,Unknown Mix,0,0,0,0,0,0,5337,0.0,1,0,0,1,0,0


In [18]:
shelter_dogs_df['breed'].value_counts().head(10)                                         

Unknown Mix                  1524
German Shepherd Dog Mix       190
Dachshund Mix                 147
Labrador Retriever Mix         83
Staffordshire Terrier Mix      62
Puli Mix                       40
German Shepherd Dog            37
Schnauzer Mix                  34
Fox Terrier Mix                32
Greyhound Mix                  32
Name: breed, dtype: int64

In [19]:
shelter_dogs_df['breed'] = shelter_dogs_df['breed'].apply(lambda x: x if x in ('Unknown Mix', 'German Shepherd Dog Mix', 'Dachshund Mix', 'Labrador Retriever Mix', 'Staffordshire Terrier Mix', 'Puli Mix', 'German Shepherd Dog', 'Schnauzer Mix', 'Fox Terrier Mix', 'Greyhound Mix') else 'Other')

In [20]:
shelter_dogs_df

Unnamed: 0,ID,age,sex,breed,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,adoptability,Spayed_Neutered_no,Spayed_Neutered_yes,Coat_long,Coat_medium,Coat_short,Coat_wirehaired
0,23807,0.25,1,Unknown Mix,0,0,0,0,0,0,1,1.0,1,0,0,0,1,0
1,533,0.17,1,Unknown Mix,0,1,1,1,1,1,3,1.0,1,0,0,0,1,0
2,23793,4.00,0,Unknown Mix,0,0,0,0,0,0,4,1.0,1,0,0,0,1,0
3,23795,1.00,0,Unknown Mix,0,0,0,0,0,0,4,1.0,1,0,0,1,0,0
4,23806,2.00,1,Other,0,0,0,0,0,0,1,1.0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2932,118,16.92,0,Unknown Mix,0,1,1,0,1,0,5013,0.0,1,0,0,0,1,0
2933,262,17.33,1,Staffordshire Terrier Mix,0,0,0,0,0,0,5270,0.0,0,1,0,0,1,0
2934,4,18.17,0,Unknown Mix,0,0,0,0,0,0,5160,0.0,1,0,0,0,1,0
2935,141,17.17,0,Unknown Mix,0,0,0,0,0,0,5337,0.0,1,0,0,1,0,0


In [21]:
# create categorical dummies for breed variable---- creating A LOT of values
breed = pd.get_dummies(shelter_dogs_df.breed)
breed.head()

Unnamed: 0,Dachshund Mix,Fox Terrier Mix,German Shepherd Dog,German Shepherd Dog Mix,Greyhound Mix,Labrador Retriever Mix,Other,Puli Mix,Schnauzer Mix,Staffordshire Terrier Mix,Unknown Mix
0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,0


In [22]:
# Drop original breed column
shelter_dogs_df = shelter_dogs_df.join(breed)
shelter_dogs_df.drop(['breed'], axis=1, inplace=True)

shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,shelter_time,...,Fox Terrier Mix,German Shepherd Dog,German Shepherd Dog Mix,Greyhound Mix,Labrador Retriever Mix,Other,Puli Mix,Schnauzer Mix,Staffordshire Terrier Mix,Unknown Mix
0,23807,0.25,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,533,0.17,1,0,1,1,1,1,1,3,...,0,0,0,0,0,0,0,0,0,1
2,23793,4.0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,1
3,23795,1.0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,1
4,23806,2.0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [23]:
# Drop shelter-time column so as not to be confusing--- here forward will be only assessing against adoptability variable
shelter_dogs_df.drop(['shelter_time'], axis =1, inplace = True)

shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,adoptability,...,Fox Terrier Mix,German Shepherd Dog,German Shepherd Dog Mix,Greyhound Mix,Labrador Retriever Mix,Other,Puli Mix,Schnauzer Mix,Staffordshire Terrier Mix,Unknown Mix
0,23807,0.25,1,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,1
1,533,0.17,1,0,1,1,1,1,1,1.0,...,0,0,0,0,0,0,0,0,0,1
2,23793,4.0,0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,23795,1.0,0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,1
4,23806,2.0,1,0,0,0,0,0,0,1.0,...,0,0,0,0,0,1,0,0,0,0


In [24]:
# Get info on dataframe
shelter_dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2937 entries, 0 to 2936
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         2937 non-null   int64  
 1   age                        2937 non-null   float64
 2   sex                        2937 non-null   int64  
 3   housebroken                2937 non-null   int64  
 4   likes_people               2937 non-null   int64  
 5   likes_children             2937 non-null   int64  
 6   get_along_males            2937 non-null   int64  
 7   get_along_females          2937 non-null   int64  
 8   get_along_cats             2937 non-null   int64  
 9   adoptability               2937 non-null   float64
 10  Spayed_Neutered_no         2937 non-null   uint8  
 11  Spayed_Neutered_yes        2937 non-null   uint8  
 12  Coat_long                  2937 non-null   uint8  
 13  Coat_medium                2937 non-null   uint8

In [25]:
# Determine the number of unique values in each column
shelter_dogs_df.nunique()

ID                           1694
age                           233
sex                             2
housebroken                     2
likes_people                    2
likes_children                  2
get_along_males                 2
get_along_females               2
get_along_cats                  2
adoptability                    2
Spayed_Neutered_no              2
Spayed_Neutered_yes             2
Coat_long                       2
Coat_medium                     2
Coat_short                      2
Coat_wirehaired                 2
Dachshund Mix                   2
Fox Terrier Mix                 2
German Shepherd Dog             2
German Shepherd Dog Mix         2
Greyhound Mix                   2
Labrador Retriever Mix          2
Other                           2
Puli Mix                        2
Schnauzer Mix                   2
Staffordshire Terrier Mix       2
Unknown Mix                     2
dtype: int64

In [26]:
shelter_dogs_df.head()

Unnamed: 0,ID,age,sex,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,adoptability,...,Fox Terrier Mix,German Shepherd Dog,German Shepherd Dog Mix,Greyhound Mix,Labrador Retriever Mix,Other,Puli Mix,Schnauzer Mix,Staffordshire Terrier Mix,Unknown Mix
0,23807,0.25,1,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,1
1,533,0.17,1,0,1,1,1,1,1,1.0,...,0,0,0,0,0,0,0,0,0,1
2,23793,4.0,0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,23795,1.0,0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,1
4,23806,2.0,1,0,0,0,0,0,0,1.0,...,0,0,0,0,0,1,0,0,0,0


In [27]:
shelter_dogs_df.tail()

Unnamed: 0,ID,age,sex,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,adoptability,...,Fox Terrier Mix,German Shepherd Dog,German Shepherd Dog Mix,Greyhound Mix,Labrador Retriever Mix,Other,Puli Mix,Schnauzer Mix,Staffordshire Terrier Mix,Unknown Mix
2932,118,16.92,0,0,1,1,0,1,0,0.0,...,0,0,0,0,0,0,0,0,0,1
2933,262,17.33,1,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,1,0
2934,4,18.17,0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,1
2935,141,17.17,0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,1
2936,248,16.08,0,0,1,0,1,1,0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
shelter_dogs_df.describe()

Unnamed: 0,ID,age,sex,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,adoptability,...,Fox Terrier Mix,German Shepherd Dog,German Shepherd Dog Mix,Greyhound Mix,Labrador Retriever Mix,Other,Puli Mix,Schnauzer Mix,Staffordshire Terrier Mix,Unknown Mix
count,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,...,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0,2937.0
mean,1742.655431,8.569551,0.427647,0.137896,0.677903,0.399047,0.500511,0.543071,0.093633,0.399387,...,0.010895,0.012598,0.064692,0.010895,0.02826,0.257406,0.013619,0.011576,0.02111,0.518897
std,4023.313413,4.140216,0.494822,0.344849,0.46736,0.489786,0.500085,0.498226,0.291367,0.489856,...,0.103829,0.11155,0.246023,0.103829,0.165743,0.437279,0.115924,0.106987,0.143775,0.499728
min,1.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,167.0,5.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,556.0,8.67,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1451.0,11.25,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,23809.0,21.92,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Assign the data to X and y
# Note: Sklearn requires a two-dimensional array of values
# so we use reshape() to create this

X = shelter_dogs_df[['age','housebroken', 'likes_people', 'likes_children', 'get_along_males', 'get_along_females', 'get_along_cats', 'Spayed_Neutered_no', 'Spayed_Neutered_yes', 'Coat_long', 'Coat_medium', 'Coat_short', 'Coat_wirehaired', 'Dachshund Mix', 'Fox Terrier Mix', 'German Shepherd Dog', 'German Shepherd Dog Mix', 'Greyhound Mix', 'Labrador Retriever Mix', 'Other', 'Puli Mix', 'Schnauzer Mix', 'Staffordshire Terrier Mix', 'Unknown Mix']]

y = shelter_dogs_df['adoptability']

print("Shape: ", X.shape, y.shape)

Shape:  (2937, 24) (2937,)


In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [32]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [33]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8460490463215259
Testing Data Score: 0.8476190476190476


In [34]:
print(f'Actual:\t\t{list(y_test[:10])}')
print(f'Predicted:\t{list(classifier.predict(X_test[:10]))}')

Actual:		[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Predicted:	[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [35]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
1294,0.0,0.0
2718,0.0,0.0
2709,0.0,0.0
2026,0.0,0.0
351,1.0,1.0
...,...,...
2401,0.0,0.0
1748,0.0,0.0
2063,0.0,0.0
816,1.0,1.0
