In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# Import our animals data frame

In [2]:
# Read the animals dataset from the CSV file in the working directory, then
# show the resulting frame as the cell output.
animals_df = pd.read_csv("ANIMALS DATA - final.csv")
animals_df

Unnamed: 0,Scientific Name,Class,Order,Family,Trend,Status,Life Span (years),Length (cm),Weight (kg),Is Social,...,Europe,Oceania,South America,Central America,North America,Cold,Dry/Desert,Polar/Tundra,Temperate,Tropical
0,Canis lupus arctos,1,8,canidae,1,0,12.0,50.00,51.0000,1,...,0,0,0,0,0,0,0,1,0,0
1,Vulpes lagopus,1,8,canidae,1,0,8.5,53.50,5.0000,0,...,1,0,0,0,0,0,0,1,0,0
2,Tamiasciurus hudsonicus,1,49,sciuridae,1,0,7.5,32.50,0.2395,0,...,0,0,0,0,0,1,1,1,1,0
3,Canis lupus,1,8,canidae,1,0,15.0,132.50,38.0000,1,...,1,0,0,0,0,1,1,1,1,0
4,Mustela erminea,1,8,mustelidae,1,0,6.5,24.75,0.2190,0,...,1,1,0,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,Melanerpes erythrocephalus,0,42,picidae,0,0,10.5,22.00,0.0765,0,...,0,0,0,0,0,0,0,0,1,1
1544,Mniotilta varia,0,36,parulidae,0,0,11.0,12.00,0.0115,0,...,0,0,0,0,0,1,0,0,1,1
1545,Leuconotopicus albolarvatus,0,42,picidae,1,0,6.5,22.00,0.0600,0,...,0,0,0,0,0,0,0,0,1,0
1546,Dryobates borealis,0,42,picidae,0,0,14.0,20.50,0.0480,1,...,0,0,0,0,0,0,0,0,1,0


# We merged all the climate columns into a single "Climates" column

#### We found that climate data was missing for some animals, so we added it manually

In [3]:
# Build a separate working frame keyed by scientific name. The non-inplace
# `set_index` returns a new DataFrame, so `animals_df` itself is left unchanged.
Climate_edited_df = animals_df.set_index('Scientific Name')

In [4]:
# Collapse the five climate indicator columns into two coarse flags:
#   cold_climate = Cold OR Polar/Tundra
#   hot_climate  = Dry/Desert OR Temperate OR Tropical
Climate_edited_df['cold_climate'] = Climate_edited_df['Cold'] | Climate_edited_df['Polar/Tundra']
Climate_edited_df['hot_climate'] = (
    Climate_edited_df['Dry/Desert'] | Climate_edited_df['Temperate'] | Climate_edited_df['Tropical']
)
Climate_edited_df = Climate_edited_df.drop(
    columns=['Cold', 'Polar/Tundra', 'Dry/Desert', 'Temperate', 'Tropical']
)

# Integer codes stored in the 'Climates' column.
CLIMATE_COLD = 0      # cold climate only
CLIMATE_HOT = 1       # hot climate only
CLIMATE_ALL = 2       # both cold and hot climates
CLIMATE_UNKNOWN = 4   # neither flag set ("strange"); patched by hand below

# Vectorised replacement for the former row-by-row `iloc` loop. Plain NumPy
# boolean masks are used so the assignment is purely positional and does not
# depend on the (possibly non-unique) 'Scientific Name' index.
is_cold = (Climate_edited_df['cold_climate'] == 1).to_numpy()
is_hot = (Climate_edited_df['hot_climate'] == 1).to_numpy()

Climate_edited_df['Climates'] = CLIMATE_UNKNOWN
Climate_edited_df.loc[is_hot, 'Climates'] = CLIMATE_HOT
Climate_edited_df.loc[is_cold, 'Climates'] = CLIMATE_COLD
# Applied last so that rows with both flags end up as "all climates".
Climate_edited_df.loc[is_cold & is_hot, 'Climates'] = CLIMATE_ALL

# The automatically derived codes as a plain list, kept under the original
# name in case a later cell still refers to it.
Climate = Climate_edited_df['Climates'].tolist()

# Manual corrections for animals whose climate data was missing.
# NOTE(review): only 'Climates' is corrected here; the 'cold_climate' and
# 'hot_climate' flags of these animals are left as they were — confirm that
# this is intended, since those flags are still columns of the frame.
manual_climates = {
    'Mustela putorius': CLIMATE_ALL,
    'Glaucomys volans': CLIMATE_ALL,
    'Hystrix cristata': CLIMATE_HOT,
    'Callorhinus ursinus': CLIMATE_COLD,
    'Pseudomys novaehollandiae': CLIMATE_HOT,
    'Cercocebus chrysogaster': CLIMATE_HOT,
    'Nymphicus hollandicus': CLIMATE_HOT,
    'Sterna paradisaea': CLIMATE_COLD,
}

# `.loc[label, col] = value` with a label that is not in the index silently
# appends a new, otherwise-empty row. Fail loudly instead so a misspelled
# name cannot inject a junk record into the dataset.
unknown_names = [name for name in manual_climates if name not in Climate_edited_df.index]
if unknown_names:
    raise KeyError(f"Scientific names not found in the data frame: {unknown_names}")

for scientific_name, climate_code in manual_climates.items():
    Climate_edited_df.loc[scientific_name, 'Climates'] = climate_code

Climate_edited_df

Unnamed: 0_level_0,Class,Order,Family,Trend,Status,Life Span (years),Length (cm),Weight (kg),Is Social,Is Nocturnal,...,Antarctica,Asia,Europe,Oceania,South America,Central America,North America,cold_climate,hot_climate,Climates
Scientific Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canis lupus arctos,1,8,canidae,1,0,12.0,50.00,51.0000,1,0,...,0,0,0,0,0,0,0,1,0,0
Vulpes lagopus,1,8,canidae,1,0,8.5,53.50,5.0000,0,0,...,0,1,1,0,0,0,0,1,0,0
Tamiasciurus hudsonicus,1,49,sciuridae,1,0,7.5,32.50,0.2395,0,0,...,0,0,0,0,0,0,0,1,1,2
Canis lupus,1,8,canidae,1,0,15.0,132.50,38.0000,1,1,...,0,1,1,0,0,0,0,1,1,2
Mustela erminea,1,8,mustelidae,1,0,6.5,24.75,0.2190,0,1,...,0,1,1,1,0,0,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Melanerpes erythrocephalus,0,42,picidae,0,0,10.5,22.00,0.0765,0,0,...,0,0,0,0,0,0,0,0,1,1
Mniotilta varia,0,36,parulidae,0,0,11.0,12.00,0.0115,0,0,...,0,0,0,0,0,0,0,1,1,2
Leuconotopicus albolarvatus,0,42,picidae,1,0,6.5,22.00,0.0600,0,0,...,0,0,0,0,0,0,0,0,1,1
Dryobates borealis,0,42,picidae,0,0,14.0,20.50,0.0480,1,0,...,0,0,0,0,0,0,0,0,1,1


## The Family column is a string variable, so we had to drop it from our DataFrame.

In [5]:
# Build the modelling frame: remove the text-valued 'Family' column and throw
# away the 'Scientific Name' index in favour of a default integer index.
ML_animals_df = (
    Climate_edited_df
    .drop(columns=['Family'])
    .reset_index(drop=True)
)
ML_animals_df

Unnamed: 0,Class,Order,Trend,Status,Life Span (years),Length (cm),Weight (kg),Is Social,Is Nocturnal,Diet,...,Antarctica,Asia,Europe,Oceania,South America,Central America,North America,cold_climate,hot_climate,Climates
0,1,8,1,0,12.0,50.00,51.0000,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,8,1,0,8.5,53.50,5.0000,0,0,4,...,0,1,1,0,0,0,0,1,0,0
2,1,49,1,0,7.5,32.50,0.2395,0,0,2,...,0,0,0,0,0,0,0,1,1,2
3,1,8,1,0,15.0,132.50,38.0000,1,1,0,...,0,1,1,0,0,0,0,1,1,2
4,1,8,1,0,6.5,24.75,0.2190,0,1,1,...,0,1,1,1,0,0,0,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,0,42,0,0,10.5,22.00,0.0765,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1544,0,36,0,0,11.0,12.00,0.0115,0,0,0,...,0,0,0,0,0,0,0,1,1,2
1545,0,42,1,0,6.5,22.00,0.0600,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1546,0,42,0,0,14.0,20.50,0.0480,1,0,0,...,0,0,0,0,0,0,0,0,1,1


In [6]:
# Save the modelling frame to CSV. With `index_label=False` pandas still writes
# the index values but leaves out the header field for the index column.
# NOTE(review): if the goal was to omit the index altogether, `index=False`
# is the option for that — confirm against whatever reads this file.
ML_animals_df.to_csv("df_for_ML.csv", index_label=False)

# Here we applied feature engineering

In [7]:
# Two interaction features, each the product of a pair of raw columns.
incubation_times_life_span = (
    ML_animals_df['Incubation Period (days)'] * ML_animals_df['Life Span (years)']
)
independent_age_times_carrying = (
    ML_animals_df['Independent Age (days)'] * ML_animals_df['Baby Carrying']
)

# The raw inputs are replaced by the engineered features, which are appended
# as the last two columns.
raw_input_columns = [
    'Incubation Period (days)',
    'Life Span (years)',
    'Baby Carrying',
    'Independent Age (days)',
]
ML_animals_df = (
    ML_animals_df
    .drop(columns=raw_input_columns)
    .assign(**{
        'life_span_incubation_period': incubation_times_life_span,
        'Baby Carrying_independent_Age_days': independent_age_times_carrying,
    })
)

In [8]:
# Remove the per-continent indicator columns from the modelling frame.
continent_columns = [
    'Africa',
    'Antarctica',
    'Asia',
    'Europe',
    'Oceania',
    'South America',
    'Central America',
    'North America',
]
ML_animals_df = ML_animals_df.drop(columns=continent_columns)

In [9]:
# Pairwise correlation between the columns of the modelling frame
# (including the 'Status' column).
ML_animals_df.corr()

Unnamed: 0,Class,Order,Trend,Status,Length (cm),Weight (kg),Is Social,Is Nocturnal,Diet,Mating Behavior,cold_climate,hot_climate,Climates,life_span_incubation_period,Baby Carrying_independent_Age_days
Class,1.0,0.27482,-0.032077,0.074743,0.142365,0.024634,-0.328113,0.27566,0.08167,0.679555,-0.209423,0.042074,-0.168769,0.210438,0.22473
Order,0.27482,1.0,-0.010691,-0.009478,-0.214233,-0.090208,-0.124738,-0.044975,-0.032842,0.065859,-0.162868,0.079885,-0.0766,-0.073015,-0.031168
Trend,-0.032077,-0.010691,1.0,-0.370875,0.097261,0.092146,0.000856,-0.046894,-0.056945,-0.077245,0.189719,-0.053527,0.106312,0.029858,-0.002408
Status,0.074743,-0.009478,-0.370875,1.0,0.140997,0.058881,0.081273,0.001599,0.125749,0.10123,-0.153245,0.026315,-0.126563,0.241511,0.117681
Length (cm),0.142365,-0.214233,0.097261,0.140997,1.0,0.768615,0.068088,-0.045406,0.070544,0.157077,0.143028,-0.047039,0.072568,0.622118,0.134449
Weight (kg),0.024634,-0.090208,0.092146,0.058881,0.768615,1.0,0.021591,-0.050337,-0.001817,0.026502,0.12368,-0.025964,0.085837,0.314313,-0.0058
Is Social,-0.328113,-0.124738,0.000856,0.081273,0.068088,0.021591,1.0,-0.290515,0.078498,-0.230888,0.074937,-0.006024,0.060459,0.120847,-0.071629
Is Nocturnal,0.27566,-0.044975,-0.046894,0.001599,-0.045406,-0.050337,-0.290515,1.0,0.046228,0.205845,-0.109936,0.016605,-0.068778,-0.034023,0.081603
Diet,0.08167,-0.032842,-0.056945,0.125749,0.070544,-0.001817,0.078498,0.046228,1.0,0.134077,-0.010212,0.01703,-0.019717,0.127208,0.011521
Mating Behavior,0.679555,0.065859,-0.077245,0.10123,0.157077,0.026502,-0.230888,0.205845,0.134077,1.0,-0.123228,0.017146,-0.105914,0.259789,0.094629


In [10]:
# Print a summary of the modelling frame: column names, non-null counts and dtypes.
ML_animals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Class                               1548 non-null   int64  
 1   Order                               1548 non-null   int64  
 2   Trend                               1548 non-null   int64  
 3   Status                              1548 non-null   int64  
 4   Length (cm)                         1548 non-null   float64
 5   Weight (kg)                         1548 non-null   float64
 6   Is Social                           1548 non-null   int64  
 7   Is Nocturnal                        1548 non-null   int64  
 8   Diet                                1548 non-null   int64  
 9   Mating Behavior                     1548 non-null   int64  
 10  cold_climate                        1548 non-null   int64  
 11  hot_climate                         1548 no

# Machine learning process

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, Perceptron 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

# Evaluate the model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


# ---------------------------------------------------------------------------
# Compare several classifiers on the task of predicting 'Status'.
# ---------------------------------------------------------------------------

# Shared seed so the split and every stochastic estimator give the same
# numbers on each Restart & Run All.
SEED = 42

# 'Status' is the target; all other remaining columns are used as features.
# `columns=` replaces the positional axis argument (`drop('Status', 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
X = ML_animals_df.drop(columns='Status')
y = ML_animals_df['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# NOTE(review): the features are passed to the models unscaled, although
# StandardScaler is imported at the top of the notebook. Scale-sensitive models
# (SVM, LinearSVC, KNN, Perceptron, MLP) may benefit from scaling — confirm.
# MultinomialNB needs non-negative inputs, so it could not share a
# standard-scaled feature matrix.
ml_models = {
    'LogReg'        : LogisticRegression(solver='liblinear'),
    'DecisionTree'  : DecisionTreeClassifier(random_state=SEED),
    'RandomForest'  : RandomForestClassifier(random_state=SEED),
    # `gamma` has no effect with a linear kernel; kept as originally written.
    'SVM'           : SVC(kernel="linear", gamma=0.001),
    'LinearSVC'     : LinearSVC(random_state=SEED),
    'KNN'           : KNeighborsClassifier(n_neighbors=9, metric='manhattan'),
    'AdaBoost'      : AdaBoostClassifier(random_state=SEED),
    'Naive Bayes'   : MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
    'Perceptron'    : Perceptron(),
    'MLPClassifier' : MLPClassifier(random_state=SEED),
}

metric_names = ['accuracy', 'precision', 'recall', 'f1']
metric_rows = []  # one dict of scores per model, in `ml_models` order

for model_name, model in ml_models.items():
    clf_model = model.fit(X_train, y_train)
    y_pred = clf_model.predict(X_test)

    # NOTE(review): precision and recall use scikit-learn's default binary
    # averaging, while F1 is macro-averaged, so the 'f1' column is not the
    # harmonic mean of the 'precision' and 'recall' columns — confirm intent.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred, average='macro'),
    }
    metric_rows.append(metrics)

# Build the table once instead of growing it with `pd.concat` on every
# iteration. As before, `df_metrics` has metrics as rows and models as columns.
df_metrics = pd.DataFrame(metric_rows, index=list(ml_models), columns=metric_names).T

# One row per model, best value of each metric highlighted, 3 decimals.
# `Styler.set_precision` was removed in pandas 2.0; an explicit format string
# works on old and new pandas versions alike.
df_metrics.T.style.highlight_max(color='lightgreen').format("{:.3f}")

Unnamed: 0,accuracy,precision,recall,f1
LogReg,0.794,0.643,0.45,0.699
DecisionTree,0.761,0.536,0.562,0.693
RandomForest,0.803,0.656,0.5,0.72
SVM,0.823,0.705,0.537,0.748
LinearSVC,0.565,0.356,0.85,0.558
KNN,0.745,0.51,0.325,0.618
AdaBoost,0.813,0.677,0.525,0.735
Naive Bayes,0.745,0.556,0.062,0.482
Perceptron,0.739,0.474,0.113,0.513
MLPClassifier,0.748,0.75,0.037,0.463
