## LOAD LIBRARIES!

In [15]:
import pandas as pd
import numpy as np
import sklearn.impute
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
import acquire
import env


### Q1:

Iris Data

- Use the function defined in acquire.py to load the iris data.

- Drop the species_id and measurement_id columns.

- Rename the species_name column to just species.

- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

- Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [11]:
# Load the untransformed iris data through the project's acquire module,
# then display the frame to inspect its columns before any cleaning.

iris_df = acquire.get_iris_data()
iris_df

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...,...
145,3,virginica,6.7,3.0,5.2,2.3
146,3,virginica,6.3,2.5,5.0,1.9
147,3,virginica,6.5,3.0,5.2,2.0
148,3,virginica,6.2,3.4,5.4,2.3


In [12]:
# Drop the species_id and measurement_id columns.
#
# DataFrame.drop returns a new frame, so the result is assigned back to iris_df
# to make the change stick. errors="ignore" skips any listed column that is not
# present, which also lets this cell be re-run without raising a KeyError.

iris_df = iris_df.drop(columns=["species_id", "measurement_id"], errors="ignore")
iris_df

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [13]:
# Rename the 'species_name' column to 'species'.
# rename returns a new frame, so the result is bound back to iris_df.

iris_df = iris_df.rename({"species_name": "species"}, axis="columns")
iris_df

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2
...,...,...,...,...,...
145,virginica,6.7,3.0,5.2,2.3
146,virginica,6.3,2.5,5.0,1.9
147,virginica,6.5,3.0,5.2,2.0
148,virginica,6.2,3.4,5.4,2.3


In [26]:
# Encode the species name using a sklearn label encoder.
# Research the inverse_transform method of the label encoder.
# How might this be useful?

train, test = train_test_split(iris_df, train_size=.75, random_state=123)

# LabelEncoder maps each distinct species name to an integer code.
# Species names have no inherent order, so the codes are only identifiers; the
# OneHotEncoder step further down turns them into one 0/1 column per species
# (1 means the observation HAS that species, 0 means it does not).

encoder = LabelEncoder()
encoder.fit(train.species)

# Store the codes as a real column. Assigning through attribute access
# (train.encoded = ...) only sets a Python attribute on the DataFrame object and
# does not create a column. assign() returns a new frame, so the frames
# returned by train_test_split are not modified in place.
train = train.assign(encoded=encoder.transform(train.species))
test = test.assign(encoded=encoder.transform(test.species))

# .values exposes the column's underlying numpy array
print(type(train.encoded.values))
train.encoded.values[0:10]


<class 'numpy.ndarray'>


  
  from ipykernel import kernelapp as app


array([0, 1, 1, 0, 2, 2, 2, 2, 2, 1])

#### Above, the encoded 'species' values are 1-D. OneHotEncoder needs a 2-D input, so convert the dimensions with `reshape(observations, variables)`:

In [27]:
# reshape(-1, 1) keeps a single column and lets numpy infer the number of rows
train_array = np.array(train.encoded).reshape(-1, 1)
test_array = np.array(test.encoded).reshape(-1, 1)

In [30]:
# OneHotEncoder takes the 2-D array of integer codes and returns an array of
# shape n x d, where n = number of observations and
# d = number of distinct values in the variable being encoded.
# Recall: 1 means the observation HAS the attribute, 0 means it DOES NOT.

ohe = OneHotEncoder(sparse=False, categories="auto")

# Fit on train only, then reuse the fitted encoder for test. Refitting on test
# would rebuild the category list from the test rows alone, so the columns of
# test_ohe would not be guaranteed to line up with those of train_ohe.
train_ohe = ohe.fit_transform(train_array)
test_ohe = ohe.transform(test_array)

#### To answer the question: `inverse_transform` maps the encoded integers back to the original string values, which is useful for reading encoded results (for example, predictions) as species names again.

In [None]:
# Create a function named prep_iris that accepts the untransformed iris data, 
# and returns the data with the transformations above applied.

def prep_iris(iris_df=None):
    """Return the iris data with the cleaning steps from this notebook applied.

    Parameters
    ----------
    iris_df : pandas.DataFrame, optional
        Untransformed iris data. When omitted, the data is loaded with
        ``acquire.get_iris_data()``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``species_id`` and ``measurement_id`` are removed
        (when present), ``species_name`` is renamed to ``species``, and an
        integer ``encoded`` column holds the ``LabelEncoder`` codes for
        ``species``. The frame passed in is not modified.
    """
    if iris_df is None:
        iris_df = acquire.get_iris_data()

    # drop/rename return new frames, so chain them and keep the result
    prepped = (
        iris_df
        .drop(columns=["species_id", "measurement_id"], errors="ignore")
        .rename(columns={"species_name": "species"})
    )

    # A fresh encoder is fit here so the function does not depend on notebook globals
    species_encoder = LabelEncoder()
    return prepped.assign(encoded=species_encoder.fit_transform(prepped["species"]))



### Q2:

Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.

- Handle the missing values in the embark_town and embarked columns.

- Remove the deck column.

- Use a label encoder to transform the embarked column.

- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?

- Fill the missing values in age. The way you fill these values is up to you. Consider the tradeoffs of different methods.

- Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [9]:
# Load the untransformed Titanic data through the project's acquire module
# and preview the first few rows.

df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [10]:
# Summarize dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
deck            203 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [11]:
# Both embarked and embark_town have 2 nulls. If the two columns carry the same
# information, one of them can be dropped.
# (The module already showed that they match; the crosstab re-checks it here.)

pd.crosstab(index=df.embarked, columns=df.embark_town)

embark_town,Cherbourg,Queenstown,Southampton
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,168,0,0
Q,0,77,0
S,0,0,644


In [21]:
# Split the Titanic data into train (75%) and test sets; the fixed random_state
# makes the split repeatable.
# NOTE: this rebinds `train` / `test`, which held the iris splits earlier in the notebook.
train, test = train_test_split(df, train_size=.75, random_state=123)

# function to fill nulls in train and test:
def fill_na(train, test, fill_value):
    """Fill missing values in both splits with the same ``fill_value``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        The two splits to fill. Neither is modified; ``fillna`` returns new frames.
    fill_value
        Value passed straight to ``DataFrame.fillna``.

    Returns
    -------
    tuple
        ``(filled_train, filled_test)``.
    """
    filled_train, filled_test = (split.fillna(fill_value) for split in (train, test))
    return filled_train, filled_test

# NOTE(review): the fill value is NaN itself, so missing values stay missing
# after this call — presumably it only normalizes other null markers (e.g. None)
# to NaN; confirm that is the intent.
train, test = fill_na(train, test, np.nan)

# function to drop columns in train and test:
def drop_columns(train, test, drop_cols):
    """Remove ``drop_cols`` from both splits and return the trimmed frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        The two splits. They are left untouched; new frames are returned, so
        calling this again on the same inputs does not fail on already-removed
        columns.
    drop_cols : list
        Column labels to remove from each frame.

    Returns
    -------
    tuple
        ``(train_without_cols, test_without_cols)``.

    Raises
    ------
    KeyError
        If any label in ``drop_cols`` is missing from either frame.
    """
    return train.drop(columns=drop_cols), test.drop(columns=drop_cols)

train, test = drop_columns(train, test, drop_cols=["embarked", "deck"])

# Wherever 'embark_town' is null, replace it with the most frequent value found
# in that column of the training split. In this case, the most frequent value
# is Southampton.

# fit() returns the imputer itself, so construction and fitting can be chained
imputer = SimpleImputer(strategy="most_frequent").fit(train[["embark_town"]])

train[["embark_town"]] = imputer.transform(train[["embark_town"]])

# function that will go thru all these imputing steps when I provide a train and test df,
# a strategy, and a list of columns

def impute(train, test, my_strategy, column_list):
    """Impute missing values in ``column_list`` for both splits.

    The imputer learns its fill values from ``train`` only and the same fitted
    imputer is then applied to ``test``, so no statistics are taken from the
    test split.

    Parameters
    ----------
    train, test : pandas.DataFrame
        The two splits. Copies are imputed and returned; the frames passed in
        are not modified.
    my_strategy : str
        Passed to ``SimpleImputer(strategy=...)``.
    column_list : list
        Columns to impute.

    Returns
    -------
    tuple
        ``(imputed_train, imputed_test)``.
    """
    imputer = SimpleImputer(strategy=my_strategy)
    # work on copies so the caller's frames are not changed behind its back
    train = train.copy()
    test = test.copy()
    train[column_list] = imputer.fit_transform(train[column_list])
    # transform only: reuse the values learned from train instead of refitting on test
    test[column_list] = imputer.transform(test[column_list])
    return train, test

train, test = impute(train, test, my_strategy="most_frequent", column_list=["embark_town"])

# Confirm that no nulls remain in embark_town in either split:

print(sum(split["embark_town"].isna().sum() for split in (train, test)))

0
