In [127]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sn

#Scalers
from sklearn.preprocessing import StandardScaler

In [128]:
# loading the data
df = pd.read_csv('train.csv')

In [129]:
df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [130]:
df.shape

(26729, 10)

In [131]:
# drop outcomesubtype - not relevant for model, teh same as Name, DateTIme
# df = df.drop(['OutcomeSubtype'], axis=1)

# DATA CLEANING

## Missing Data



In [132]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent * 100], axis=1, keys=['Total', 'Percent(%)'])
missing_data.head(10)

Unnamed: 0,Total,Percent(%)
OutcomeSubtype,13612,50.925961
Name,7691,28.773991
AgeuponOutcome,18,0.067343
SexuponOutcome,1,0.003741
AnimalID,0,0.0
DateTime,0,0.0
OutcomeType,0,0.0
AnimalType,0,0.0
Breed,0,0.0
Color,0,0.0


In [133]:
df = df.loc[df['AgeuponOutcome'].notnull()]
df = df.loc[df['SexuponOutcome'].notnull()]

## AnimalType

In [134]:
df['AnimalType'] = df['AnimalType'].replace({'Dog': 0, 'Cat': 1})

## SexuponOutcome

In [135]:
temp = pd.get_dummies(df['SexuponOutcome'])
df = pd.concat([df, temp], axis=1)
df.drop("SexuponOutcome", axis=1, inplace=True)
df = df.rename(columns={'Unknown': 'Unknown Sex'})

## AgeuponOutcome

In [136]:
age_dict = {'1 year':12, '2 years':24, '3 weeks':0.75, '1 month':1, '5 months':5, '4 years':48,
       '3 months':3, '2 weeks':0.5, '2 months':2, '10 months':10, '6 months':6,
       '5 years':60, '7 years':84, '3 years':36, '4 months':4, '12 years':144, '9 years':108,
       '6 years':72, '1 weeks':0.25, '11 years':132, '4 weeks':1, '7 months':7, '8 years':12,
       '11 months':11, '4 days':0, '9 months':9, '8 months':8, '15 years':180,
       '10 years':120, '1 week':0.25, '0 years':0, '14 years':168, '3 days':0, '6 days':0,
       '5 days':0, '5 weeks':1.25, '2 days':0, '16 years':192, '1 day':0, '13 years':156,
       '17 years':204, '18 years':216, '19 years':228, '20 years':240}

df['AgeuponOutcome'] = df['AgeuponOutcome'].map(age_dict)

## Breed

In [137]:
# utworzenie nowej kolumny MixOrNot- przypisanie dla Mix wartości 0, pozostałe - 1
df['MixOrNot'] = df['Breed'].apply(lambda x: 0 if 'Mix' in x else 1)

## Color

In [138]:
unique_colors = df['Color'].unique()
df['Color'].sum()
df['numberOfColor'] = df['Color'].str.count('/') + 1

## Dropping

In [139]:
df = df.drop(['Name'], axis=1)
df = df.drop(['OutcomeSubtype'], axis=1)
df = df.drop(['DateTime'], axis=1)
df = df.drop(['Breed'], axis=1)
#df = df.drop(['Color'], axis=1)

## Data after cleaning


In [140]:
df.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,AgeuponOutcome,Color,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown Sex,MixOrNot,numberOfColor
0,A671945,Return_to_owner,0,12.0,Brown/White,0,0,1,0,0,0,2
1,A656520,Euthanasia,1,12.0,Cream Tabby,0,0,0,1,0,0,1
2,A686464,Adoption,0,24.0,Blue/White,0,0,1,0,0,0,2
3,A683430,Transfer,1,0.75,Blue Cream,0,1,0,0,0,0,1
4,A667013,Transfer,0,24.0,Tan,0,0,1,0,0,1,1


In [141]:
df['OutcomeType'].value_counts()

Adoption           10769
Transfer            9406
Return_to_owner     4785
Euthanasia          1553
Died                 197
Name: OutcomeType, dtype: int64

In [142]:
from scipy.stats import chi2_contingency
# Przykładowe dane
cecha1 = df['Color']
cecha2 = df['OutcomeType']

# Tworzenie tablicy kontyngencji (cross-tabulation)
tablica = pd.crosstab(cecha1, cecha2)

# Przeprowadzenie testu chi-kwadrat
statystyka, p_value, _, _ = chi2_contingency(tablica)

# Wyświetlenie wyników
print("Statystyka testowa:", statystyka)
print("Wartość p:", p_value)

Statystyka testowa: 3926.1844089353926
Wartość p: 1.0885513390980872e-224


# Value we are looking for.. y

In [143]:
df['OutcomeType'] = df['OutcomeType'].replace({'Euthanasia': 0,'Transfer': 0,'Return_to_owner': 0,'Died': 0, 'Adoption': 1})
# Adpoted = 1
# Not adopted = 0

In [144]:
df.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,AgeuponOutcome,Color,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown Sex,MixOrNot,numberOfColor
0,A671945,0,0,12.0,Brown/White,0,0,1,0,0,0,2
1,A656520,0,1,12.0,Cream Tabby,0,0,0,1,0,0,1
2,A686464,1,0,24.0,Blue/White,0,0,1,0,0,0,2
3,A683430,0,1,0.75,Blue Cream,0,1,0,0,0,0,1
4,A667013,0,0,24.0,Tan,0,0,1,0,0,1,1


# REGRESJA LOGISTYCZNA

In [145]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [146]:
X = df[['AnimalType', 'AgeuponOutcome','Intact Female', 'Intact Male', 'Neutered Male', 'Spayed Female', 'Unknown Sex', 'MixOrNot', 'numberOfColor']]
y = df['OutcomeType']

In [147]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [148]:
from sklearn.linear_model import LinearRegression, LogisticRegression

rl = LogisticRegression(solver='lbfgs', max_iter=1000)
rl.fit(X_train, y_train)

y_pred = rl.predict(X_test)

# Lub jeśli chcemy otrzymać prawdopodobieństwa:
y_pred_rl = rl.predict_proba(X_test)

In [149]:
## Regresja lasso:
rlr = LogisticRegression(C=0.01, solver = 'liblinear', penalty='l1')
rlr.fit(X_train, y_train)
y_pred_lasso = rlr.predict(X_test)

## Regresja grzbietowa:
rlr = LogisticRegression(random_state=30, solver = 'liblinear', penalty = 'l2', C = 0.5)
rlr.fit(X_train, y_train)
y_pred_grz = rlr.predict(X_test)

In [150]:
from sklearn.metrics import confusion_matrix, recall_score, f1_score, precision_score

In [151]:
tp, tn, fp, fn = confusion_matrix(y_test, y_pred_lasso).ravel()

In [152]:
print(f'TP:{tp}, TN:{tn}')
print(f'Ile obserwacji zaklasyfikowaliśmy poprawnie: {tp+tn}')
print(f'FP:{fp}, FN:{fn}')
print(f'Ile obserwacji zaklasyfikowaliśmy niepoprawnie: {fp+fn}')

TP:2188, TN:1007
Ile obserwacji zaklasyfikowaliśmy poprawnie: 3195
FP:391, FN:1756
Ile obserwacji zaklasyfikowaliśmy niepoprawnie: 2147


In [153]:
recall_score(y_test, y_pred_lasso) # better closer to 1

0.8178854215183977

In [154]:
precision_score(y_test, y_pred_lasso)  # im mniejsza precyzja tym więcej False Positives

0.6355410785378212

In [155]:
f1_score(y_test, y_pred_lasso)  # im wyższy F1 Score tym lepszy model

0.715274949083503