In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [None]:
train = pd.read_csv('train/train.csv')
test = pd.read_csv('test/test.csv')

In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 24 columns):
Type             14993 non-null int64
Name             14993 non-null object
Age              14993 non-null int64
Breed1           14993 non-null int64
Breed2           14993 non-null int64
Gender           14993 non-null int64
Color1           14993 non-null int64
Color2           14993 non-null int64
Color3           14993 non-null int64
MaturitySize     14993 non-null int64
FurLength        14993 non-null int64
Vaccinated       14993 non-null int64
Dewormed         14993 non-null int64
Sterilized       14993 non-null int64
Health           14993 non-null int64
Quantity         14993 non-null int64
Fee              14993 non-null int64
State            14993 non-null int64
RescuerID        14993 non-null object
VideoAmt         14993 non-null int64
Description      14993 non-null object
PetID            14993 non-null object
PhotoAmt         14993 non-null float64
AdoptionSpe

In [42]:
train.Color1.value_counts()

1    7427
2    3750
3     947
5     884
6     684
7     667
4     634
Name: Color1, dtype: int64

In [43]:
train.Color2.value_counts()

0    4471
7    3438
2    3313
5    1128
6    1063
4     870
3     710
Name: Color2, dtype: int64

In [44]:
train.Color3.value_counts()

0    10604
7     3221
5      417
6      378
4      198
3      175
Name: Color3, dtype: int64

In [None]:
test.info()

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

In [None]:
train.Name.fillna('None', inplace=True)
train.Description.fillna('None', inplace=True)
test.Name.fillna('None', inplace=True)
test.Description.fillna('None', inplace=True)

In [None]:
labels = train.drop(['Name', 'RescuerID', 'Description', 'PetID', 'AdoptionSpeed'], axis=1)
test_labels = test.drop(['Name', 'RescuerID', 'Description', 'PetID'], axis=1)

In [None]:
labels = pd.get_dummies(labels, columns = ['Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
                                 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
                                 'State', 'Type'
                                ])
test_labels = pd.get_dummies(test_labels, columns = ['Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
                                 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
                                 'State', 'Type'
                                ])

In [None]:
labels.columns

In [None]:
test_labels.columns

In [None]:
diff_columns = set(labels.columns).difference(set(test_labels.columns))

In [None]:
for i in diff_columns:
    test_labels[i] = test_labels.apply(lambda _: 0, axis=1)

In [None]:
diff_columns2 = set(test_labels.columns).difference(set(labels.columns))

In [None]:
for i in diff_columns2:
    labels[i] = labels.apply(lambda _: 0, axis=1)

In [None]:
target = train['AdoptionSpeed']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(labels, target, test_size=0.2)

In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
pred = pd.DataFrame(clf.predict(test_labels))

In [None]:
pred.to_csv("submission.csv", index=True, index_label="PetID", header=["AdoptionSpeed"])

In [106]:
breeds = pd.read_csv('breed_labels.csv')

In [107]:
breeds

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita
5,6,1,Alaskan Malamute
6,7,1,American Bulldog
7,8,1,American Eskimo Dog
8,9,1,American Hairless Terrier
9,10,1,American Staffordshire Terrier


In [108]:
groups = pd.read_csv('dog_groups.csv')

In [109]:
groups

Unnamed: 0,Herding,Hound,Toy,Non-Sporting,Sporting,Terrier,Working,Misc,FSS
0,Australian Cattle Dog,Afghan Hound,Affenpinscher,American Eskimo Dog,American Water Spaniel,Airedale Terrier,Akita,Barbet,American Leopard Hound
1,Australian Shepherd,American English Coonhound,Brussels Griffon,Bichon Frise,Boykin Spaniel,American Hairless Terrier,Alaskan Malamute,Belgian Laekenois,Appenzeller Sennenhund
2,Bearded Collie,American Foxhound,Cavalier King Charles Spaniel,Boston Terrier,Brittany,American Staffordshire Terrier,Anatolian Shepherd Dog,Dogo Argentino,Australian Kelpie
3,Beauceron,Azawakh,Chihuahua,Bulldog,Chesapeake Bay Retriever,Australian Terrier,Bernese Mountain Dog,Dutch Shepherd,Australian Stumpy Tail Cattle Dog
4,Belgian Malinois,Basenji,Chinese Crested,Chinese Shar-Pei,Clumber Spaniel,Bedlington Terrier,Black Russian Terrier,Lancashire Heeler,Basset Fauve de Bretagne
5,Belgian Sheepdog,Basset Hound,English Toy Spaniel,Chow Chow,Cocker Spaniel,Border Terrier,Boerboel,Mudi,Bavarian Mountain Scent Hound
6,Belgian Tervuren,Beagle,Havanese,Coton De Tulear,Curly-Coated Retriever,Bull Terrier,Boxer,Norrbottenspets,Biewer Terrier
7,Bergamasco,Black and Tan Coonhound,Italian Greyhound,Dalmatian,English Cocker Spaniel,Cairn Terrier,Bullmastiff,Peruvian Inca Orchid,Bolognese
8,Berger Picard,Bloodhound,Japanese Chin,Finish Spitz,English Setter,Cesky Terrier,Cane Corso,Portuguese Podengo,Bracco Italiano
9,Border Collie,Bluetick Coonhound,Maltese,French Bulldog,English Springer Spaniel,Dandie Dinmont Terrier,Chinook,Russian Toy,Braque de Bourbonnais


In [110]:
breeds['Group'] = breeds.apply(lambda _: 'i', axis=1)

In [111]:
dogs = breeds[breeds['Type'] == 1].copy()

In [112]:
for index, row in dogs.iterrows():
    for j in groups.columns:
        for h in groups[j]:
            if row['BreedName'] == h:
                dogs.set_value(index, 'Group', j)

  """


In [113]:
dogs['Group'].value_counts()

i               88
Working         25
Hound           24
Terrier         22
Sporting        22
Toy             17
Herding         15
Non-Sporting    14
FSS             10
Misc             4
Name: Group, dtype: int64

In [114]:
dogs[dogs['Group'] == 'i']

Unnamed: 0,BreedID,Type,BreedName,Group
3,4,1,Akbash,i
6,7,1,American Bulldog,i
11,12,1,Anatolian Shepherd,i
12,13,1,Appenzell Mountain Dog,i
13,14,1,Australian Cattle Dog/Blue Heeler,i
23,24,1,Belgian Shepherd Dog Sheepdog,i
24,25,1,Belgian Shepherd Laekenois,i
25,26,1,Belgian Shepherd Malinois,i
26,27,1,Belgian Shepherd Tervuren,i
30,31,1,Black Labrador Retriever,i


In [126]:
for index, row in dogs[dogs['Group']=='i'].iterrows():
    if 'TERRIER' in row['BreedName'].upper():
        dogs.set_value(index, 'Group', 'Terrier')
    if 'BULLDOG' in row['BreedName'].upper():
        dogs.set_value(index, 'Group', 'Non-Sporting')
    if 'POODLE' in row['BreedName'].upper():
        dogs.set_value(index, 'Group', 'Non-Sporting')
    if 'CORGI' in row['BreedName'].upper():
        dogs.set_value(index, 'Group', 'Herding')
    if 'RETRIEVER' in row['BreedName'].upper():
        dogs.set_value(index, 'Group', 'Sporting')
    if 'SCHNAUZER' in row['BreedName'].upper():
        dogs.set_value(index, 'Group', 'Working')

  del sys.path[0]


In [127]:
dogs[dogs['Group'] == 'i']

Unnamed: 0,BreedID,Type,BreedName,Group
3,4,1,Akbash,i
11,12,1,Anatolian Shepherd,i
12,13,1,Appenzell Mountain Dog,i
13,14,1,Australian Cattle Dog/Blue Heeler,i
23,24,1,Belgian Shepherd Dog Sheepdog,i
24,25,1,Belgian Shepherd Laekenois,i
25,26,1,Belgian Shepherd Malinois,i
26,27,1,Belgian Shepherd Tervuren,i
31,32,1,Black Mouth Cur,i
34,35,1,Blue Lacy,i


In [129]:
dogs.to_csv('dog_breeds.csv')

In [130]:
dogs = pd.read_csv('dog_breeds.csv')

In [136]:
joined = train.join(dogs, lsuffix='Breed1', rsuffix='BreedID')

In [139]:
joined[joined['TypeBreed1'] == 1]['Group'].value_counts()

Working         23
Hound           21
Sporting        20
Terrier         20
Herding         16
Non-Sporting    12
Toy             11
FSS              9
Misc             5
Hunting          3
Name: Group, dtype: int64

In [140]:
joined.head()

Unnamed: 0.1,TypeBreed1,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,Unnamed: 0,BreedID,TypeBreedID,BreedName,Group
0,2,Nibble,3,299,0,1,1,7,0,1,...,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0.0,1.0,1.0,Affenpinscher,Toy
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,1.0,2.0,1.0,Afghan Hound,Hound
2,1,Brisco,1,307,0,1,2,7,0,2,...,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,2.0,3.0,1.0,Airedale Terrier,Terrier
3,1,Miko,4,307,0,2,1,2,0,2,...,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,3.0,4.0,1.0,Akbash,Working
4,1,Hunter,1,307,0,1,1,0,0,2,...,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,4.0,5.0,1.0,Akita,Working


In [146]:
joined = joined.apply(lambda x: 'Misc' if x == None)

SyntaxError: invalid syntax (<ipython-input-146-dd7fa922b0d4>, line 1)

In [147]:
joined.isna().sum()

TypeBreed1           0
Name                 0
Age                  0
Breed1               0
Breed2               0
Gender               0
Color1               0
Color2               0
Color3               0
MaturitySize         0
FurLength            0
Vaccinated           0
Dewormed             0
Sterilized           0
Health               0
Quantity             0
Fee                  0
State                0
RescuerID            0
VideoAmt             0
Description          0
PetID                0
PhotoAmt             0
AdoptionSpeed        0
Unnamed: 0       14752
BreedID          14752
TypeBreedID      14752
BreedName        14752
Group            14752
dtype: int64

In [148]:
train.columns

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')

In [149]:
train['Breed1']

0        299
1        265
2        307
3        307
4        307
5        266
6        264
7        307
8        265
9        265
10       307
11       264
12       307
13       265
14       307
15       218
16       266
17       307
18       307
19       266
20       307
21       307
22       114
23       307
24       266
25       285
26       285
27       266
28       189
29       266
        ... 
14963    276
14964    265
14965    266
14966    265
14967    265
14968    266
14969    266
14970    307
14971    265
14972    266
14973    307
14974    307
14975    307
14976    264
14977    254
14978    307
14979    307
14980    307
14981    307
14982    266
14983    307
14984    307
14985    179
14986    266
14987    195
14988    266
14989    265
14990    265
14991    266
14992    307
Name: Breed1, Length: 14993, dtype: int64

In [154]:
train['Group'] = breeds.apply(lambda _: 'i', axis=1)
              
for index, row in train.iterrows():
    for i, r in dogs.iterrows():
        if row['Breed1'] == r['BreedID']:
            train.set_value(index, 'Group', r['BreedID'])

  


In [None]:
dogs.head()