# Classifier for Noxious Plant Species in North America

* Student names: Natasha Kacoroski, Jacob Crabb
* Student pace: full time
* Scheduled project review date/time: 
* Instructor name: Miles Erickson, Greg Damico


## Load Libraries and Data

In [145]:
# Import necessary libraries
from sklearn_pandas import DataFrameMapper, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np

In [143]:
# Import data. Manually removed backslash symbols while troubleshooting data import (write function?)

plant_data = pd.read_csv("plants.csv", skiprows=9, low_memory=False)
plant_data.head()

Unnamed: 0,"outl0strokewidth0 strokec2 ""Accepted Symbol""",Synonym Symbol,Scientific Name,Common Name,Category,Duration,Growth Habit,Native Status,State Noxious Status,Active Growth Period,...,Propogated by Cuttings,Propogated by Seed,Propogated by Sod,Propogated by Sprigs,Propogated by Tubers,Seeds per Pound,Seed Spread Rate,Seedling Vigor,Small Grain,Vegetative Spread Rate
0,ABELI,,Abelia,abelia,Dicot,,,L48(I),,,...,,,,,,,,,,
1,ABGR4,,Abelia 'd7grandiflora,glossy abelia,Dicot,Perennial,Shrub,L48(I),,"Spring, Summer, Fall",...,Yes,No,No,No,No,,,,No,
2,ABELM,,Abelmoschus,okra,Dicot,,,L48(I),,,...,,,,,,,,,,
3,ABES,,Abelmoschus esculentus,okra,Dicot,"Annual, Perennial","Subshrub, Forb/herb",L48(I)PR(I)VI(I),,,...,,,,,,,,,,
4,ABIES,,Abies,fir,Gymnosperm,,,"L48(I,N)CAN(N)SPM(N)",,,...,,,,,,,,,,


In [104]:
plant_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38186 entries, 0 to 38185
Data columns (total 78 columns):
outl0strokewidth0 strokec2 "Accepted Symbol"    38186 non-null object
Synonym Symbol                                  66 non-null object
Scientific Name                                 38185 non-null object
Common Name                                     36115 non-null object
Category                                        38046 non-null object
Duration                                        27808 non-null object
Growth Habit                                    34269 non-null object
Native Status                                   38093 non-null object
State Noxious Status                            461 non-null object
Active Growth Period                            2027 non-null object
After Harvest Regrowth Rate                     1039 non-null object
Bloat                                           2063 non-null object
C:N Ratio                                       1840 non-nu

Column information found here https://plants.usda.gov/charinfo.html

## Clean Data

From looking at data, many columns have the same 2,063 entries. To address null values, decide to take the 2,063 entry subset with a value for Growth Habit because that value is important for determining how to fill remaining nulls.

In [105]:
plant_data.dropna(subset=['Bloat'], inplace=True) # Bloat is first column with 2,063 entries
plant_data.reset_index(drop=True, inplace=True) # Reset index

Target data is the State Noxious Status column. Convert to 1 if noxious and 0 if not.

In [195]:
plant_data['invasives'] = plant_data['State Noxious Status'].notnull().astype('int')

Select most commom growth habit for each entry.

In [115]:
def common_growth_habit(x):
    """Return most common growth habit"""
    return x.split(',')[0]

def common_duration(x):
    """Return most common duration"""
    return x.split(',')[0]

def convert_to_str(x):
    """Convert to string"""
    return str(x)

In [183]:
mapper = DataFrameMapper([
    (['Category'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"),  {'alias': 'category'}),
    (['Duration'], [SimpleImputer(missing_values=np.nan, strategy="most_frequent"), FunctionTransformer(common_duration)], {'alias': 'duration'}),
    (["Growth Habit"], [SimpleImputer(missing_values=np.nan, strategy="most_frequent"), FunctionTransformer(common_growth_habit)], {'alias': 'growth_habit'}),
    (["Active Growth Period"], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'growth_period'}),   
    (['Bloat'], None, {'alias': 'bloat'}),
    (['C:N Ratio'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'cn_ratio'}),
    (['Coppice Potential'], None, {'alias': 'coppice_potential'}),
    (['Fall Conspicuous'], None, {'alias': 'fall_conspicuous'}),
    (['Fire Resistance'], None,  {'alias': 'fire_resistance'}),
    (['Flower Color'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'flower_color'}),
    (['Flower Conspicuous'], None, {'alias': 'flower_conspicuous'}),
    (['Foliage Color'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'foliage_color'}),
    (['Foliage Porosity Summer'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'f_summer_porosity'}),
    (['Foliage Porosity Winter'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'f_winter_porosity'}),
    (['Foliage Texture'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'foliage_texture'}),
    (['Fruit Color'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'fruit_color'}),
    (['Fruit Conspicuous'], None, {'alias': 'fruit_conspicuous'}),
    (['Growth Form'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'growth_form'}),
    (['Growth Rate'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'growth_rate'}),
    (['Height, Mature (feet)'], SimpleImputer(missing_values=np.nan, strategy="median"), {'alias': 'height_in_ft'}),
    (['Known Allelopath'], None, {'alias': 'allelopath'}),
    (['Leaf Retention'], None, {'alias': 'leaf_retention'}),
    (['Lifespan'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'lifespan'}),
    (['Low Growing Grass'], None, {'alias': 'low_growing_grass'}),
    (['Nitrogen Fixation'], None, {'alias': 'nitrogen_fixation'}),
    (['Resprout Ability'], None, {'alias': 'resprout_ability'}),
    (['Shape and Orientation'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'shape_and_or'}),
    (['Toxicity'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'toxicity'}),
    (['Adapted to Coarse Textured Soils'], None,  {'alias': 'coarse_soil'}),
    (['Adapted to Medium Textured Soils'], None, {'alias': 'med_soil'}),
    (['Adapted to Fine Textured Soils'], None, {'alias': 'fine_soil'}),
    (['Anaerobic Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'anaerobic_tolerance'}),
    (['CaCO<SUB>3</SUB> Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'caco_tolerance'}),
    (['Cold Stratification Required'], None, {'alias': 'cold_strat'}),
    (['Drought Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'drought_tolerance'}),
    (['Fertility Requirement'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'fertility'}),
    (['Fire Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'fire_tolerance'}),
    (['Frost Free Days, Minimum'], SimpleImputer(missing_values=np.nan, strategy="median"), {'alias':'min_frost_free_days'}),
    (['Hedge Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'hedge_tolerance'}),
    (['Moisture Use'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'moisture_use'}),
    (['pH (Minimum)'],  SimpleImputer(missing_values=np.nan, strategy="median"), {'alias': 'ph_min'}),
    (['pH (Maximum)'], SimpleImputer(missing_values=np.nan, strategy="median"), {'alias': 'ph_max'}),
    (['Precipitation (Minimum)'], SimpleImputer(missing_values=np.nan, strategy="median"), {'alias': 'precip_min'}),
    (['Precipitation (Maximum)'], SimpleImputer(missing_values=np.nan, strategy="median"), {'alias': 'precip_max'}),
    (['Root Depth, Minimum (inches)'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'root_depth_min'}),
    (['Salinity Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'salinity'}),
    (['Shade Tolerance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'shade_tolerance'}),
    (["Temperature, Minimum ('b0F)"], SimpleImputer(missing_values=np.nan, strategy="median"), {'alias': 'min_temp'}),
    (['Bloom Period'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'bloom_period'}),
    (['Fruit/Seed Abundance'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'fruit_abundance'}),
    (['Fruit/Seed Period Begin'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"),  {'alias': 'fruit_period_start'}),
    (['Fruit/Seed Period End'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'fruit_period_end'}),
    (['Fruit/Seed Persistence'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'fruit_seed_persistence'}),
    (['Propogated by Bare Root'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'bare_root'}),
    (['Propogated by Bulbs'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'bulb'}),
    (['Propogated by Container'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'container'}),
    (['Propogated by Corms'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'corms'}),
    (['Propogated by Cuttings'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'cuttings'}),
    (['Propogated by Seed'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'seed'}),
    (['Propogated by Sod'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'sod'}),
    (['Propogated by Sprigs'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'sprigs'}),
    (['Propogated by Tubers'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'tubers'}),
    (['Seed Spread Rate'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'seed_spread_rate'}),
    (['Seedling Vigor'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'seed_vigor'}),
    (['Small Grain'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'small_grain'}),
    (['Vegetative Spread Rate'], SimpleImputer(missing_values=np.nan, strategy="most_frequent"), {'alias': 'veg_spread_rate'}),
     ], df_out=True)
    

In [184]:
mapper.fit(plant_data)
test1 = mapper.transform(plant_data)
test.head()

Unnamed: 0,category,duration,growth_habit,growth_period,bloat,cn_ratio,coppice_potential,fall_conspicuous,fire_resistance,flower_color,...,corms,cuttings,seed,sod,sprigs,tubers,seed_spread_rate,seed_vigor,small_grain,veg_spread_rate
0,Dicot,Perennial,Shrub,"Spring, Summer, Fall",,High,No,Yes,No,Purple,...,No,Yes,No,No,No,No,,Medium,No,
1,Gymnosperm,Perennial,Tree,Spring and Summer,,High,No,Yes,No,Yellow,...,No,No,Yes,No,No,No,Slow,Low,No,
2,Gymnosperm,Perennial,Tree,Spring and Summer,,High,No,No,No,Yellow,...,No,No,Yes,No,No,No,Slow,Low,No,
3,Gymnosperm,Perennial,Tree,Spring and Summer,,High,No,No,No,Red,...,No,Yes,Yes,No,No,No,Moderate,Medium,No,
4,Gymnosperm,Perennial,Tree,Spring and Summer,,High,No,No,No,Purple,...,No,Yes,Yes,No,No,No,Slow,Medium,No,


In [185]:
from sklearn.linear_model import LogisticRegression

In [186]:
mapper1 = DataFrameMapper([
    (['Category'], [SimpleImputer(missing_values=np.nan, strategy="most_frequent")], {'alias': 'category'})], df_out=True)

In [187]:
mapper1.fit(plant_data)
test1 = mapper1.transform(plant_data)
test1.head()

Unnamed: 0,category
0,Dicot
1,Dicot
2,Dicot
3,Dicot
4,Gymnosperm


In [132]:
enc = OneHotEncoder(categories='auto')
enc.fit(test[['category']])

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [196]:
numeric_features = ['Height, Mature (feet)', 'Frost Free Days, Minimum', 'pH (Minimum)', 'pH (Maximum)', 'Precipitation (Minimum)', 'Precipitation (Maximum)',
                    'Root Depth, Minimum (inches)', "Temperature, Minimum ('b0F)"]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler',  StandardScaler())])


categorical_features = ['Category']

categorical_transformer = Pipeline(steps=[
    ('mapper', mapper1),
    ('onehot', OneHotEncoder(categories='auto'))
])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])

logreg = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression(solver='lbfgs'))])

X = plant_data.drop('State Noxious Status', axis=1)
y = plant_data['Invasives']

y_enc = enc.fit(plant_data[['invasives']])

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2)


logreg.fit(X_train, y_train)

TypeError: Singleton array array(OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True), dtype=object) cannot be considered a valid collection.