<a href="https://colab.research.google.com/github/mrj760/Data-Science-Assignments/blob/main/CS410_Assignment_6_Predictive_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set-up

###Imports

In [None]:
from google.colab import drive
from pathlib import Path
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import make_column_transformer as mkxform, make_column_selector as mksel
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import random

###Drive

In [None]:
drive.flush_and_unmount()
drive.mount("/content/drive")
drivepath = Path() / '/content' / 'drive' / 'MyDrive'

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


###Pandas Printing Options

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)

## Obtaining the Data

In [None]:
DONK = pd.read_csv(drivepath / 'donkeys.csv')
print(DONK)

     BCS    Age       Sex  Length  Girth  Height  Weight  WeightAlt
0    3.0     <2  stallion      78     90      90      77        NaN
1    2.5     <2  stallion      91     97      94     100        NaN
2    1.5     <2  stallion      74     93      95      74        NaN
3    3.0     <2    female      87    109      96     116        NaN
4    2.5     <2    female      79     98      91      91        NaN
..   ...    ...       ...     ...    ...     ...     ...        ...
539  3.0  10-15  stallion      98    115     101     145        NaN
540  3.0  10-15  stallion     102    126     110     183        NaN
541  2.5  10-15  stallion     103    118     103     174        NaN
542  3.0    2-5  stallion      91    112     100     139        NaN
543  3.0   5-10  stallion     104    124     110     189        NaN

[544 rows x 8 columns]


### Feature Descriptions


BCS : Body Condition Score: 1 (emaciated), 3 (healthy), 5 (obese)

Age : Age in years : (\<2), (2-5), (5-10), (10-15), (15-20), (\>20)

Sex : Sex categories: stallion, gelding, female 

Length : Body length (cm) from front leg elbow to back of pelvis 

Girth : Body circumference (cm), measured just behind front legs

Height : Body height (cm) up to point where neck connects to back 

Weight : Weight (kilogram)

WeightAlt : Second weight measurement taken on a small subset of donkeys



## Task 1 

### State the predictive question you would like to answer.

What is a donkey's weight, based on its : Body Condition; Age; Sex; Length; Girth; Height; Weight?

### Identify whether it is a classification or a regression problem and describe the dataset.


The output is numeric so this is a regression problem.

## Task 2 / Task 3

Perform at least one quality check. Clean if necessary.

Prepare the data for machine learning by transforming it.

In [None]:
donk = DONK

donk = donk.assign(
    Weight=donk[['Weight','WeightAlt']].mean(axis=1) # Turn weight into average if there were two measurements
    )
donk = donk.drop(columns='WeightAlt')

categs = mksel(dtype_include=object)
nums = mksel(dtype_include=np.number)
# print(f"Ordinal Columns   : {categs(donk)}")
# print(f"Numerical Columns : {nums(donk)}")

categ_linear_proc = OneHotEncoder(handle_unknown="ignore")
num_linear_proc = MinMaxScaler()

xform = mkxform((categ_linear_proc, categs),
                (num_linear_proc, nums), 
                remainder="passthrough")

categ_linear_proc = OneHotEncoder(handle_unknown="ignore")
num_linear_proc = MinMaxScaler()

donk = xform.fit_transform(donk)
donk = pd.DataFrame(donk, columns=xform.get_feature_names_out())
donk = donk.rename(columns=(lambda x: x.replace('minmaxscaler__','').replace('onehotencoder__','')))
# print(donk)

## Task 4

###Split into training/testing datasets

In [None]:
X = list(donk.columns)
X.remove('Weight')
X = donk[X]
y = donk['Weight']

rand = random.randint(0,69420)
print(f'Seed used for train/test split: {rand}')

Xtr, Xtest, ytr, ytest = train_test_split(X, y, test_size=.2, random_state=rand)


# print(len(Xtr))
# print(len(Xtest))
# print(Xtr,end='\n=================\n\n')
# print(ytr,end='\n=================\n\n')
# print(Xtest,end='\n=================\n\n')
# print(ytest,end='\n=================\n\n')

Seed used for train/test split: 55522


###Train via training set

In [None]:
skmodel = LinearRegression().fit(Xtr,ytr)

###Evaluate via test set

In [None]:
def mae(output, pred): # Mean Absolute Error
    return np.mean(abs(output-pred))

def mse(output, pred): # Mean Squared Error
    return np.mean((output-pred)**2)

skpred = skmodel.predict(Xtest)

mae_nofilter = mae(ytest, skpred)
mse_nofilter = mse(ytest, skpred)

print(f'Absolute Error of theta in Test Set: {mae_nofilter}')
print(f'Mean-Squared Error of theta in Test Set: {mse_nofilter}')

Absolute Error of theta in Test Set: 0.03087682133959644
Mean-Squared Error of theta in Test Set: 0.001484612850322486


## Finding the Best Z-Score to filter by

### Iterating

In [None]:
bestz = 9000
# minmae = mae_nofilter
# minmse = mse_nofilter
minmae = 9000
minmse = 9000
z = 1.14

while z < 4:
    donk = DONK

    donk = donk.assign(
        Weight=donk[['Weight','WeightAlt']].mean(axis=1) # Turn weight into average if there were two measurements
        )
    donk = donk.drop(columns='WeightAlt')

    categs = mksel(dtype_include=object)
    nums = mksel(dtype_include=np.number)
    # print(f"Ordinal Columns   : {categs(donk)}")
    # print(f"Numerical Columns : {nums(donk)}")

    categ_linear_proc = OneHotEncoder(handle_unknown="ignore")
    num_linear_proc = MinMaxScaler()

    xform = mkxform((categ_linear_proc, categs),
                    (num_linear_proc, nums), 
                    remainder="passthrough")

    categ_linear_proc = OneHotEncoder(handle_unknown="ignore")
    num_linear_proc = MinMaxScaler()

    donk = xform.fit_transform(donk)
    donk = pd.DataFrame(donk, columns=xform.get_feature_names_out())
    donk = donk.rename(columns=(lambda x: x.replace('minmaxscaler__','').replace('onehotencoder__','')))
    # print(donk)

    donk_filtered = donk[(np.abs(stats.zscore(donk)) < z).all(axis=1)] # get rid of donkeys who fall outside of 99% of measurements
    # donk_filtered = donk
    # print(donk_filtered)

    X = list(donk_filtered.columns)
    X.remove('Weight')
    X = donk_filtered[X]
    y = donk_filtered['Weight']

    Xtr, Xtest, ytr, ytest = train_test_split(X, y, test_size=.2, random_state=6)


    # print(len(Xtr))
    # print(len(Xtest))
    # print(Xtr,end='\n=================\n\n')
    # print(ytr,end='\n=================\n\n')
    # print(Xtest,end='\n=================\n\n')
    # print(ytest,end='\n=================\n\n')

    skmodel = LinearRegression().fit(Xtr,ytr)

    def mae(output, pred): # Mean Absolute Error
        return np.mean(abs(output-pred))

    def mse(output, pred): # Mean Squared Error
        return np.mean((output-pred)**2)

    X = list(donk.columns)
    X.remove('Weight')
    X = donk[X]
    y = donk['Weight']
    Xtr, Xtest, ytr, ytest = train_test_split(X, y, test_size=.2, random_state=rand)

    skpred = skmodel.predict(Xtest)

    curmae = mae(ytest, skpred)
    curmse = mse(ytest, skpred)
    if np.mean([curmse, curmae]) < np.mean([minmae,minmse]):
        bestz = z
        minmae = curmae
        minmse = curmse
    z += .01

### Displaying

In [None]:
print(f'Best z score to filter by: {bestz}')
print(f'Absolute Error of theta in Test Set: {minmae} (compared to {mae_nofilter})')
print(f'Mean-Squared Error of theta in Test Set: {minmse} (compared to {mse_nofilter})')

filtered_mae_diff = minmae / mae_nofilter
filtered_mse_diff = minmse / mse_nofilter

if filtered_mae_diff > 1 :
    print(f'mae is {round((filtered_mae_diff-1)*100,2)}% worse')
elif filtered_mae_diff < 1 :
    print(f'mae is {round((1-filtered_mae_diff)*100,2)}% better')
else:
    print('No difference in mae values')

if filtered_mse_diff > 1 :
    print(f'mse is {round((filtered_mse_diff-1)*100,2)}% worse')
elif filtered_mse_diff < 1 :
    print(f'mse is {round((1-filtered_mse_diff)*100,2)}% better')
else:
    print('No difference in mse values')

Best z score to filter by: 3.5999999999999663
Absolute Error of theta in Test Set: 0.02997430753205772 (compared to 0.03087682133959644)
Mean-Squared Error of theta in Test Set: 0.0014099519621956736 (compared to 0.001484612850322486)
mae is 2.92% better
mse is 5.03% better


## Task 5 : Present Conclusions

uhh