In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

# Import Decision Tree Classifier model from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

import json
import graphviz

In [None]:
# Read the training table from the working directory and preview its first rows
animalData = pd.read_csv(filepath_or_buffer = 'train.csv')
animalData.head(5)

In [None]:
# Collect the document-level sentiment (magnitude and score) for every pet.
# Each PetID is looked up as ./train_sentiment/<PetID>.json; pets without a
# usable file get 0.0 for both values so the lists stay aligned with the rows.
petIDs = np.array(animalData["PetID"])
magnitude = []
score = []

for pet_id in petIDs:
  try:
    with open("./train_sentiment/" + pet_id + '.json') as json_file:
      itemData = json.load(json_file)
    # Read BOTH values before appending anything: if the second lookup fails
    # we must not end up with one list longer than the other.
    sentiment = itemData["documentSentiment"]
    pet_magnitude = sentiment["magnitude"]
    pet_score = sentiment["score"]
  except (OSError, ValueError, KeyError, TypeError):
    # OSError: file missing/unreadable; ValueError: invalid JSON or encoding;
    # KeyError/TypeError: unexpected structure (or a non-string PetID).
    pet_magnitude = 0.0
    pet_score = 0.0
  magnitude.append(pet_magnitude)
  score.append(pet_score)

jsonData = {"Magnitude" : magnitude, "Score" : score}

# Drop any Magnitude/Score columns left by a previous run of this cell so that
# re-running it does not fail with "columns overlap" in join().
animalData = (
  animalData
  .drop(columns = list(jsonData), errors = "ignore")
  .join(pd.DataFrame(data = jsonData))
)

In [None]:
# Columns left out of the feature matrix (includes the target, AdoptionSpeed)
excluded_columns = [
    "Name", "State", "RescuerID", "Description",
    "PetID", "AdoptionSpeed", "VideoAmt", "PhotoAmt",
]
predictors = animalData.drop(columns = excluded_columns)

# Keep the target as a one-column DataFrame
response = animalData["AdoptionSpeed"].to_frame()

In [None]:
# Hold out 25% of the rows for testing. A fixed seed makes the split (and
# therefore every number reported below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, response, test_size = 0.25, random_state = 42
)

# Linear support-vector classifier; seeded for the same reason as the split
svcModel = LinearSVC(random_state = 42)

# `response` is a one-column DataFrame; scikit-learn expects a 1-D target, so
# flatten it (same approach as the random-forest cell) to avoid the
# column-vector DataConversionWarning.
svcModel.fit(X_train, y_train.values.ravel())

# Predictions on both splits, used by the confusion matrices below
y_train_pred = svcModel.predict(X_train)
y_test_pred = svcModel.predict(X_test)

In [None]:
# Report the SVC's classification accuracy, first on the train split and then
# on the test split (one header line, one accuracy line, one blank line each).
for header, features, labels in (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train),
    ("Goodness of Fit of Model \tTest Dataset", X_test, y_test),
):
    print(header)
    print("Classification Accuracy \t:", svcModel.score(features, labels))
    print()

In [None]:
# Confusion matrix of the true vs. predicted labels on the train split
matrix = confusion_matrix(y_true = y_train, y_pred = y_train_pred)
sb.heatmap(data = matrix, annot = True, fmt = ".0f", annot_kws = dict(size = 18))

In [None]:
# Confusion matrix of the true vs. predicted labels on the test split
matrix = confusion_matrix(y_true = y_test, y_pred = y_test_pred)
sb.heatmap(data = matrix, annot = True, fmt = ".0f", annot_kws = dict(size = 18))

In [None]:
# Import essential models and functions from sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


# Target and feature matrix for the random forest
y = response
X = pd.DataFrame(data = predictors)

# 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# Fit a forest of 100 trees, each limited to depth 15; the target is flattened
# to 1-D before fitting
rforest = RandomForestClassifier(n_estimators = 100, max_depth = 15)
rforest.fit(X_train, y_train.values.ravel())

# Predicted classes for both splits
y_train_pred = rforest.predict(X_train)
y_test_pred = rforest.predict(X_test)

# Accuracy report: train split first, then test split
for header, features, labels in (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train),
    ("Goodness of Fit of Model \tTest Dataset", X_test, y_test),
):
    print(header)
    print("Classification Accuracy \t:", rforest.score(features, labels))
    print()

# Side-by-side confusion matrices: train on the left, test on the right
f, axes = plt.subplots(1, 2, figsize=(12, 4))
heatmap_style = dict(annot = True, fmt = ".0f", annot_kws = {"size": 18})
train_confusion = confusion_matrix(y_train, y_train_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
sb.heatmap(train_confusion, ax = axes[0], **heatmap_style)
sb.heatmap(test_confusion, ax = axes[1], **heatmap_style)

In [None]:
# Work on a separate frame holding only the free-text description and the target
animaldescription = animalData.loc[:, ["Description", "AdoptionSpeed"]].copy()
animaldescription.head(5)

In [None]:
# Imports for cleaning the description text and counting its most frequent words.

from collections import Counter
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize

# Clean every pet description and count how many "positive" stems it contains.
# Results: `description1` holds the cleaned text per row, `blist` the number of
# positive stems per row; both are added to `animaldescription` as columns.
description1 = []
blist = []

# Positive vocabulary, written as Porter stems (e.g. "friendli", "healthi")
# because the comparison happens after stemming.
alist = ["good","play","love","care","vaccin",'friendli',"healthi","activ","cute","rescu"]
positive_stems = set(alist)   # set membership is O(1) inside the loop

# Reduces inflected words to a common stem
stemmer = PorterStemmer()

# English stop words such as "it", "the", etc.
sw = stopwords.words("english")
stop_words = set(sw)

for raw_description in animaldescription['Description']:
    text = str(raw_description)
    if text == 'nan':
        # missing descriptions arrive as NaN -> 'nan'; use a neutral placeholder
        text = 'blank'
    # Lower-case BEFORE tokenizing. Previously the lower-cased text was computed
    # but the original string was tokenized, so capitalised stop words
    # ("The", "It", ...) were never filtered out.
    text = text.lower()
    words = [word for word in word_tokenize(text) if word not in stop_words]
    words = [stemmer.stem(word) for word in words]
    # discard very short tokens (punctuation, leftovers of 3 characters or fewer)
    words = [w for w in words if len(w) > 3]
    blist.append(sum(1 for word in words if word in positive_stems))
    description1.append(' '.join(words))


animaldescription['description1'] = description1

# Assign the column directly instead of join(): join() raises on overlapping
# column names, which made this cell fail whenever it was run a second time.
animaldescription['goodwords'] = np.array(blist)



In [None]:
# Pool the cleaned descriptions into one token list and keep the 30 most
# frequent tokens as (token, count) pairs
all_tokens = " ".join(animaldescription["description1"]).split()
count1 = Counter(all_tokens).most_common(30)

# Tabulate the pairs and give the two positional columns readable names
column_labels = {0: "words of description", 1 : "count"}
data1 = pd.DataFrame(count1).rename(columns = column_labels)

In [None]:
# Bar chart of the token counts; the bars are positioned 0..N-1, so the x ticks
# are relabelled with the corresponding tokens.
ax = data1.plot.bar(legend = False, color = 'purple', figsize = (20,15))
y_pos = np.arange(len(data1["words of description"]))
ax.set_xticks(y_pos)
ax.set_xticklabels(data1["words of description"])
ax.set_title('Top 30 words of description')
ax.set_xlabel('words')
ax.set_ylabel('number')
plt.show()

In [None]:
# Feature matrix = the earlier predictors plus the per-description count of
# positive stems; the target is unchanged
y = response
X = predictors.join(animaldescription["goodwords"])

# 75/25 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# Fit a decision tree limited to depth 8 on the train split
dectree = DecisionTreeClassifier(max_depth = 8)
dectree.fit(X_train, y_train)

# Render the fitted tree as Graphviz DOT source (returned as a string because
# out_file is None) and display it as the cell output
treedot = export_graphviz(
    dectree,
    out_file = None,
    feature_names = X_train.columns,
    filled = True,
    rounded = True,
    special_characters = True,
)

graphviz.Source(treedot)

In [None]:
# Predicted AdoptionSpeed classes from the decision tree for both splits
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Accuracy report: train split first, then test split
for header, features, labels in (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train),
    ("Goodness of Fit of Model \tTest Dataset", X_test, y_test),
):
    print(header)
    print("Classification Accuracy \t:", dectree.score(features, labels))
    print()

# Side-by-side confusion matrices: train on the left, test on the right
f, axes = plt.subplots(1, 2, figsize=(12, 4))
heatmap_style = dict(annot = True, fmt = ".0f", annot_kws = {"size": 18})
train_confusion = confusion_matrix(y_train, y_train_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)
sb.heatmap(train_confusion, ax = axes[0], **heatmap_style)
sb.heatmap(test_confusion, ax = axes[1], **heatmap_style)