In [2]:
import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd; pd.set_option('display.max_columns', 50)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

import Classifiers
import Classifiers_Ashlyn
import Feature_Importance_Funs
import Splitting_Funs

## Connecting to the s3 bucket that holds the project files
bucket_name = 'data-454'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Keys of the two order files inside the bucket
file_key_train, file_key_test = 'Project_3/orders_train.txt', 'Project_3/orders_class.txt'

## Handles to both objects (train first, then test)
bucket_object_train, bucket_object_test = bucket.Object(file_key_train), bucket.Object(file_key_test)

## Fetching each object and pulling out its 'Body' entry
file_object_train = bucket_object_train.get()
file_object_test = bucket_object_test.get()
file_content_stream_train = file_object_train.get('Body')
file_content_stream_test = file_object_test.get('Body')

## Parsing both files with the same settings: ';' separates fields and '?' marks a missing value
csv_options = {'sep': ';', 'na_values': '?'}
train = pd.read_csv(file_content_stream_train, **csv_options)
test = pd.read_csv(file_content_stream_test, **csv_options)

## Features on train
## Manufacturer IDs that each get an indicator column; the i-th ID (1-based) produces 'feature_i'
MRS_MANUFACTURER_IDS = (113, 20, 49, 21, 24, 54, 25, 45, 18, 97, 55, 26, 53, 56, 44, 75, 64, 16)


def add_manufacturer_salutation_features(data, manufacturer_ids = MRS_MANUFACTURER_IDS, salutation = 'Mrs'):
    """Add one 0/1 indicator column per manufacturer ID to `data`.

    Column 'feature_<i>' is 1 on rows where 'manufacturerID' equals the i-th
    entry of `manufacturer_ids` (counting from 1) and 'salutation' equals
    `salutation`, and 0 everywhere else.

    Parameters
    ----------
    data : DataFrame with 'manufacturerID' and 'salutation' columns.
    manufacturer_ids : ordered sequence of manufacturer IDs; the order fixes the column numbering.
    salutation : salutation value that must match for the indicator to be 1.

    Returns
    -------
    The same `data` object, with the indicator columns assigned in place.
    """
    ## The salutation mask is identical for every feature, so it is computed once
    has_salutation = data['salutation'] == salutation

    for position, manufacturer_id in enumerate(manufacturer_ids, start = 1):
        data['feature_' + str(position)] = np.where((data['manufacturerID'] == manufacturer_id) & has_salutation, 1, 0)

    return data


## NOTE(review): only `train` receives these columns here; the helper can be called on the
## scoring frame as well if a model using these features is later applied to it
train = add_manufacturer_salutation_features(train)

## Splitting Data 

In [3]:
## Separating the target column from the input columns
target_column = 'returnShipment'
Y = train[target_column]
X = train.drop(columns = [target_column])

## Splitting the data into train / validation / test pieces with the project helper
(X_train, X_val, X_test,
 Y_train, Y_val, Y_test) = Splitting_Funs.train_validation_test(X, Y)

In [4]:
## Keeping only the inputs picked from the feature importance ranking (same columns, same order, in every split)
selected_features = ['feature_1', 'feature_2', 'feature_7', 'feature_8', 'feature_3']
X_train, X_val, X_test = (split[selected_features] for split in (X_train, X_val, X_test))

In [None]:
## Summary statistics of the selected training inputs
X_train.describe()

In [None]:
## Number of training rows per value of the target
Y_train.value_counts()

In [4]:
## Running the project Classifier helper for the 'Ada' model on the train and validation splits
## NOTE(review): this rebinds `test`, which until now held the frame read from orders_class.txt --
## confirm that frame is no longer needed, or store this result under a different name
test = Classifiers.Classifier(
    X_train,
    Y_train,
    X_val,
    Y_val,
    model = 'Ada',
)

   n_estimators  max_features  max_depth  learning_rate  evaluation
0           100             3          3          0.001         NaN
1           100             3          3          0.010         NaN
2           100             3          3          0.100         NaN
3           100             3          3          1.000         NaN
4           100             3          5          0.001         NaN
Working on job 1 out of  48
Working on job 2 out of  48
Working on job 3 out of  48
Working on job 4 out of  48
Working on job 5 out of  48
Working on job 6 out of  48
Working on job 7 out of  48
Working on job 8 out of  48
Working on job 9 out of  48
Working on job 10 out of  48
Working on job 11 out of  48
Working on job 12 out of  48
Working on job 13 out of  48
Working on job 14 out of  48
Working on job 15 out of  48
Working on job 16 out of  48
Working on job 17 out of  48
Working on job 18 out of  48
Working on job 19 out of  48
Working on job 20 out of  48
Working on job 21 out

In [5]:
## Ordering the Classifier output by ascending 'evaluation' and previewing the first five rows
test = test.sort_values('evaluation')
test.head(5)

Unnamed: 0,n_estimators,max_features,max_depth,learning_rate,evaluation
4,100,3,5,0.001,23861.373256
12,100,5,5,0.001,23861.373256
20,300,3,5,0.001,23878.677493
28,300,5,5,0.001,23878.677493
36,500,3,5,0.001,23893.987709


In [5]:
## Grid of 40 evenly spaced values from 0.05 to 0.95 (both ends included), rounded to two decimals
a = np.linspace(0.05, 0.95, 40).round(2)
a

array([0.05, 0.07, 0.1 , 0.12, 0.14, 0.17, 0.19, 0.21, 0.23, 0.26, 0.28,
       0.3 , 0.33, 0.35, 0.37, 0.4 , 0.42, 0.44, 0.47, 0.49, 0.51, 0.53,
       0.56, 0.58, 0.6 , 0.63, 0.65, 0.67, 0.7 , 0.72, 0.74, 0.77, 0.79,
       0.81, 0.83, 0.86, 0.88, 0.9 , 0.93, 0.95])

In [11]:
## Sanity check of how confusion_matrix indexes its cells
## (confusion_matrix is imported with the other sklearn imports in the notebook's import cell)
x = np.array([1, 1, 0])   ## passed first, so used as the true labels
y = np.array([0, 1, 0])   ## passed second, so used as the predicted labels

## Rows follow the first argument and columns the second, so a[1, 0] counts
## the observations with x == 1 and y == 0 (exactly one in this example)
a = confusion_matrix(x, y)
a[1, 0]

1