In [1]:
import pandas as pd
import numpy as np
# import tensorflow as tf

# Essemble
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.naive_bayes import GaussianNB #Naive Bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

#ML steps structure
from sklearn.pipeline import FeatureUnion, Pipeline

#Preprocessing
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import GridSearchCV

#Metrics
from sklearn.model_selection import cross_val_score

In [2]:
# Load the training data, using the 'Unnamed: 0' column (probably a row id) as the index.
url = 'https://s3.amazonaws.com/drivendata/data/4/public/81e8f2de-9915-4934-b9ae-9705685c9d50.csv'
training = pd.read_csv(url, index_col='Unnamed: 0')

# Label columns (dropped from the feature frame later in the notebook).
labels = [
    'Function',
    'Object_Type',
    'Operating_Status',
    'Position_Type',
    'Pre_K',
    'Reporting',
    'Sharing',
    'Student_Type',
    'Use',
]

# Numeric feature columns.
numeric = ['FTE', 'Total']

# Text feature columns; they are lower-cased and joined into one string per row downstream.
categoric = [
    'Facility_or_Department',
    'Function_Description',
    'Fund_Description',
    'Job_Title_Description',
    'Location_Description',
    'Object_Description',
    'Position_Extra',
    'Program_Description',
    'SubFund_Description',
    'Sub_Object_Description',
    'Text_1',
    'Text_2',
    'Text_3',
    'Text_4',
]

KeyboardInterrupt: 

# Preprocessing

In [None]:
# Imputing data in Total column.
# Fill value for a missing `Total`, keyed by the row's `Object_Type`.
# NOTE(review): 'Contracted Services' carries the same value as
# 'Base Salary/Compensation' (24146) - possibly a copy-paste slip; confirm.
# NOTE(review): the lookup key is a label column, so this imputation cannot be
# applied to rows that have no `Object_Type` - confirm that is intended.
_TOTAL_FILL_BY_OBJECT_TYPE = {
    'Base Salary/Compensation': 24146,
    'Benefits': 38163,
    'Contracted Services': 24146,
    'Equipment & Equipment Lease': 11257,
    'NO_LABEL': 58545,
    'Other Compensation/Stipend': 1605,
    'Other Non-Compensation': 10646,
    'Rent/Utilities': 46611,
    'Substitute Compensation': 1090,
    'Supplies/Materials': 7745,
    'Travel & Conferences': 1659,
}


def impute_func_total(data):
    """Return the row's ``Total``, substituting a per-``Object_Type`` constant when it is null.

    Parameters
    ----------
    data : mapping-like row
        For example a ``pandas.Series`` handed over by ``DataFrame.apply(axis=1)``.
        ``data['Total']`` is always read; ``data['Object_Type']`` is read only
        when ``Total`` is null.

    Returns
    -------
    The original ``Total`` when it is not null. Otherwise the constant
    registered for the row's ``Object_Type``, or ``NaN`` when that type is not
    one of the known categories (the former if-chain fell through and returned
    ``None`` implicitly in that case).
    """
    if pd.isnull(data['Total']):
        return _TOTAL_FILL_BY_OBJECT_TYPE.get(data['Object_Type'], np.nan)
    return data['Total']




# Imputing data in FTE column.
# Fill value for a missing `FTE`, keyed by the row's `Object_Type`.
# NOTE(review): the lookup key is a label column, so this imputation cannot be
# applied to rows that have no `Object_Type` - confirm that is intended.
_FTE_FILL_BY_OBJECT_TYPE = {
    'Base Salary/Compensation': 0.45,
    'Benefits': 0.0,
    'Contracted Services': 0.0,
    'Equipment & Equipment Lease': 0.0,
    'NO_LABEL': 0.75,
    'Other Compensation/Stipend': 0.000107,
    'Other Non-Compensation': 0.0,
    'Rent/Utilities': 0.0,
    'Substitute Compensation': 0.000059,
    'Supplies/Materials': 0.0,
    'Travel & Conferences': 0.0,
}


def impute_func_FTE(data):
    """Return the row's ``FTE``, substituting a per-``Object_Type`` constant when it is null.

    Parameters
    ----------
    data : mapping-like row
        For example a ``pandas.Series`` handed over by ``DataFrame.apply(axis=1)``.
        ``data['FTE']`` is always read; ``data['Object_Type']`` is read only
        when ``FTE`` is null.

    Returns
    -------
    The original ``FTE`` when it is not null (a genuine ``0.0`` is kept as is).
    Otherwise the constant registered for the row's ``Object_Type``, or ``NaN``
    when that type is not one of the known categories (the former if-chain
    fell through and returned ``None`` implicitly in that case).
    """
    if pd.isnull(data['FTE']):
        return _FTE_FILL_BY_OBJECT_TYPE.get(data['Object_Type'], np.nan)
    return data['FTE']



def preProcessing(training):
    """Clean the frame in place and return it.

    Steps, in order:
      1. set implausible values to NaN (``FTE`` below 0 or above 1, negative ``Total``);
      2. fill null ``Total`` / ``FTE`` row by row through ``impute_func_total``
         and ``impute_func_FTE``;
      3. lower-case every column listed in ``categoric`` and replace missing
         text with the empty string.

    The frame passed in is modified; the returned object is that same frame.
    """
    fte_out_of_range = (training['FTE'] < 0) | (training['FTE'] > 1)
    total_is_negative = training['Total'] < 0
    training.loc[fte_out_of_range, 'FTE'] = np.nan
    training.loc[total_is_negative, 'Total'] = np.nan

    # Row-wise imputation; both helpers also read the row's Object_Type.
    training['Total'] = training.apply(impute_func_total, axis=1)
    training['FTE'] = training.apply(impute_func_FTE, axis=1)

    # Normalise the text columns: lower-case first, then blank out the gaps.
    lowered = training[categoric].apply(lambda column: column.str.lower())
    training[categoric] = lowered.fillna("")

    return training



In [None]:
# Clean the frame, then swap its index for a fresh 0..n-1 range (the old index is discarded).
df_training = preProcessing(training).reset_index(drop=True)

In [None]:
# Feature frame: every column except the label columns.
DataSet = df_training.drop(labels, axis=1)
# One indicator column per distinct Object_Type value.
labels_data = pd.get_dummies(df_training['Object_Type'])

In [None]:
# Rename the indicator columns to integer class ids 0..k-1. The ids are sized
# from the actual number of columns instead of a hard-coded 11, so the rename
# cannot fail with a length mismatch when the number of classes differs.
col_names = list(range(labels_data.shape[1]))
labels_data.columns = col_names
# For each row, idxmax returns the (now integer) name of the column holding the row maximum.
labels_data = labels_data.idxmax(axis=1)

## Ensemble

In [None]:
def combine_text_columns(dataset):
    """Join the ``categoric`` columns of each row into one space-separated string."""
    text_frame = dataset[categoric]
    return text_frame.apply(" ".join, axis=1)


def combine_numeric_columns(dataset):
    """Select the ``numeric`` columns of ``dataset``."""
    return dataset[numeric]


# Wrap both selectors as transformers so they can be used as pipeline steps.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(combine_numeric_columns, validate=False)

In [None]:
# Feature pipeline:
#   * numeric branch - selects the `numeric` columns as they are;
#   * text branch    - joins the `categoric` columns into one string per row and
#                      hashes uni- and bi-grams of alphanumeric tokens;
#   * both branches are concatenated and reduced to 100 components.
# NOTE(review): the token pattern only matches a token that is followed by
# whitespace, so a token at the very end of the combined string is never
# matched - confirm this is intended.
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', HashingVectorizer(
                    token_pattern="[A-Za-z0-9]+(?=\\s+)",
                    norm=None,
                    binary=False,
                    ngram_range=(1, 2),
                    stop_words='english',
                )),
            ])),
        ]
    )),
    # random_state pins the SVD solver's random draws so the reduced features
    # are identical from one run to the next.
    ('reduce_dim', TruncatedSVD(n_components=100, random_state=42)),
])



In [None]:
# Fit the feature pipeline on the cleaned frame and keep the transformed feature matrix.
X_train = pl.fit_transform(df_training, labels_data)

In [None]:
# Randomly under-sample (with replacement) using the 'not majority' strategy.
# The targets are `labels_data`, the integer class ids built from the same rows
# as `X_train`; the name used before, `labels_true`, is never assigned in the
# visible notebook and raised a NameError.
rus = RandomUnderSampler(replacement=True, sampling_strategy='not majority')
X_resampled, y_resampled = rus.fit_resample(X_train, labels_data)

In [2]:
# Load the feature matrix and the targets from CSV. This rebinds `X_train`,
# replacing the value produced by the pipeline earlier in the notebook.
# NOTE(review): no index_col is given - if these files were written together
# with their index, an 'Unnamed: 0' column would be read in as a feature; confirm.
X_train = pd.read_csv('../src/data/processed/X_.csv')
y_= pd.read_csv('../src/data/processed/y_.csv')

In [3]:
# Encode the target column '0' as integer class ids 0..k-1.
labels_data = pd.get_dummies(y_['0'])
# Size the id list from the data rather than hard-coding 11 classes, so the
# rename cannot fail with a length mismatch when the number of classes differs.
col_names = list(range(labels_data.shape[1]))
labels_data.columns = col_names
# For each row, idxmax returns the (now integer) name of the column holding the row maximum.
labels_data = labels_data.idxmax(axis=1)

### Naive Bayes

In [4]:
# Base learner handed to the ensemble in the next cell: Gaussian Naive Bayes.
base_estimator = GaussianNB()

In [5]:
# Collects the 10-fold cross-validation scores of every configuration,
# keyed '<model>_<n_estimators>'. Re-running this cell resets the dictionary.
scores_dt = {}
for n_estimator in [10, 15, 20]:
    print(n_estimator)  # progress marker
    # random_state fixes the ensemble's random draws so the scores can be
    # reproduced on a re-run.
    eec = EasyEnsembleClassifier(n_estimators=n_estimator,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=42)
    scores = cross_val_score(eec, X_train.values, labels_data.values, cv=10)
    scores_dt['nb_'+str(n_estimator)] = scores

10
15
20


### K-NN

In [6]:
# Base learner handed to the ensemble in the next cell: k-nearest neighbours with k = 7.
base_estimator = KNeighborsClassifier(n_neighbors=7)

In [7]:
# 10-fold cross-validation of the ensemble for each ensemble size; results
# are stored in `scores_dt` under 'knn_<n_estimators>'.
for n_estimator in [10, 15, 20]:
    print(n_estimator)  # progress marker
    # random_state fixes the ensemble's random draws so the scores can be
    # reproduced on a re-run.
    eec = EasyEnsembleClassifier(n_estimators=n_estimator,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=42)
    scores = cross_val_score(eec, X_train.values, labels_data.values, cv=10)
    scores_dt['knn_'+str(n_estimator)] = scores

10
15
20


### Decision tree

In [8]:
# Base learner handed to the ensemble in the next cell: a decision tree capped at depth 25.
base_estimator = DecisionTreeClassifier(max_depth=25)

In [9]:
# 10-fold cross-validation of the ensemble for each ensemble size; results
# are stored in `scores_dt` under 'dt_<n_estimators>'.
for n_estimator in [10, 15, 20]:
    print(n_estimator)  # progress marker
    # random_state fixes the ensemble's random draws so the scores can be
    # reproduced on a re-run.
    eec = EasyEnsembleClassifier(n_estimators=n_estimator,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=42)
    scores = cross_val_score(eec, X_train.values, labels_data.values, cv=10)
    scores_dt['dt_'+str(n_estimator)] = scores

10
15
20


In [10]:
# `scores` holds only the fold scores of the most recently executed loop
# iteration (in cell order, the decision-tree run with 20 estimators).
scores

array([0.87679988, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 0.51166551])

In [11]:
# Show every score array collected so far.
scores_dt

{'nb_10': array([0.88911989, 0.93285593, 0.97751598, 0.998845  , 0.998614  ,
        0.99214599, 0.98544699, 0.97928698, 0.95772696, 0.79448679]),
 'nb_15': array([0.88911989, 0.93285593, 0.97751598, 0.998845  , 0.998614  ,
        0.99206899, 0.98544699, 0.97928698, 0.95772696, 0.79440979]),
 'nb_20': array([0.88911989, 0.93285593, 0.97751598, 0.998845  , 0.998537  ,
        0.99214599, 0.98544699, 0.97928698, 0.95772696, 0.79348579]),
 'knn_10': array([0.91206591, 0.996304  , 0.99769   , 0.997074  , 0.997459  ,
        0.99692   , 0.997613  , 0.99692   , 0.99499499, 0.53707554]),
 'knn_15': array([0.91214291, 0.996304  , 0.997613  , 0.997074  , 0.997382  ,
        0.99692   , 0.997613  , 0.99692   , 0.99499499, 0.53499653]),
 'knn_20': array([0.91214291, 0.996304  , 0.997613  , 0.997151  , 0.997382  ,
        0.99692   , 0.997613  , 0.99692   , 0.99499499, 0.53707554]),
 'dt_10': array([0.87649188, 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.    

### MLP

In [12]:
# Base learner handed to the ensemble in the next cell: a multi-layer
# perceptron with one hidden layer of 20 units, trained by SGD with early stopping.
base_estimator = MLPClassifier(
    hidden_layer_sizes=(20,),
    solver='sgd',
    learning_rate_init=.1,
    momentum=0.8,
    alpha=1e-4,
    max_iter=100,
    tol=1e-4,
    early_stopping=True,
    validation_fraction=0.3,
    n_iter_no_change=10,
    random_state=42,
    verbose=True,
)

In [13]:
# 10-fold cross-validation of the ensemble for each ensemble size; results
# are stored in `scores_dt` under 'mlp_<n_estimators>'.
for n_estimator in [10, 15, 20]:
    print(n_estimator)  # progress marker
    # random_state fixes the ensemble's random draws so the scores can be
    # reproduced on a re-run.
    eec = EasyEnsembleClassifier(n_estimators=n_estimator,
                                 base_estimator=base_estimator,
                                 n_jobs=-1,
                                 random_state=42)
    scores = cross_val_score(eec, X_train.values, labels_data.values, cv=10)
    scores_dt['mlp_'+str(n_estimator)] = scores

10
15
20


In [14]:
# Show every score array collected so far.
scores_dt

{'nb_10': array([0.88911989, 0.93285593, 0.97751598, 0.998845  , 0.998614  ,
        0.99214599, 0.98544699, 0.97928698, 0.95772696, 0.79448679]),
 'nb_15': array([0.88911989, 0.93285593, 0.97751598, 0.998845  , 0.998614  ,
        0.99206899, 0.98544699, 0.97928698, 0.95772696, 0.79440979]),
 'nb_20': array([0.88911989, 0.93285593, 0.97751598, 0.998845  , 0.998537  ,
        0.99214599, 0.98544699, 0.97928698, 0.95772696, 0.79348579]),
 'knn_10': array([0.91206591, 0.996304  , 0.99769   , 0.997074  , 0.997459  ,
        0.99692   , 0.997613  , 0.99692   , 0.99499499, 0.53707554]),
 'knn_15': array([0.91214291, 0.996304  , 0.997613  , 0.997074  , 0.997382  ,
        0.99692   , 0.997613  , 0.99692   , 0.99499499, 0.53499653]),
 'knn_20': array([0.91214291, 0.996304  , 0.997613  , 0.997151  , 0.997382  ,
        0.99692   , 0.997613  , 0.99692   , 0.99499499, 0.53707554]),
 'dt_10': array([0.87649188, 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.    

In [15]:
# Persist the scores: one column per `scores_dt` key, one row per fold.
pd.DataFrame(scores_dt).to_csv('../reports/scores.csv')