In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## S3 location of the project data: bucket name and the object keys of the two files
bucket_name = 'data-454'
file_key_train = 'Project_3/orders_train.txt'
file_key_test = 'Project_3/orders_class.txt'

## Handle on the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Train file: look up the object, request it, and take the 'Body' entry of the response
bucket_object_train = bucket.Object(file_key_train)
file_object_train = bucket_object_train.get()
file_content_stream_train = file_object_train.get('Body')

## Test file: same three steps
bucket_object_test = bucket.Object(file_key_test)
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

## Both files are parsed with ';' as the separator and '?' treated as a missing value
csv_options = {'sep': ';', 'na_values': '?'}
train = pd.read_csv(file_content_stream_train, **csv_options)
test = pd.read_csv(file_content_stream_test, **csv_options)

## Feature Engineering on Train

In [4]:
# Manufacturer IDs that each get their own indicator column on `train`.
# The 1-based position in this list is the numeric suffix of the column name:
# 113 -> feature_1, 20 -> feature_2, ..., 16 -> feature_18.
MRS_MANUFACTURER_IDS = [113, 20, 49, 21, 24, 54, 25, 45, 18, 97, 55, 26, 53, 56, 44, 75, 64, 16]

# The salutation condition is identical for every feature, so evaluate it once.
train_is_mrs = train['salutation'] == 'Mrs'

# feature_<n> is 1 on rows whose salutation is 'Mrs' and whose manufacturerID equals
# the n-th ID above, and 0 on every other row (including rows where either value is missing,
# because comparisons against NaN evaluate to False).
for feature_number, manufacturer_id in enumerate(MRS_MANUFACTURER_IDS, start=1):
    train['feature_{}'.format(feature_number)] = np.where(
        (train['manufacturerID'] == manufacturer_id) & train_is_mrs, 1, 0
    )


train.head()

Unnamed: 0,orderItemID,orderDate,deliveryDate,itemID,size,color,manufacturerID,price,customerID,salutation,...,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18
0,1,2012-04-01,2012-04-03,186,m,denim,25,69.9,794,Mrs,...,0,0,0,0,0,0,0,0,0,0
1,2,2012-04-01,2012-04-03,71,9+,ocher,21,69.95,794,Mrs,...,0,0,0,0,0,0,0,0,0,0
2,3,2012-04-01,2012-04-03,71,9+,curry,21,69.95,794,Mrs,...,0,0,0,0,0,0,0,0,0,0
3,4,2012-04-02,,22,m,green,14,39.9,808,Mrs,...,0,0,0,0,0,0,0,0,0,0
4,5,2012-04-02,1990-12-31,151,39,black,53,29.9,825,Mrs,...,0,0,0,0,1,0,0,0,0,0


## Feature Engineering on Test

In [5]:
# Manufacturer IDs that each get their own indicator column on `test`.
# The 1-based position in this list is the numeric suffix of the column name:
# 113 -> feature_1, 20 -> feature_2, ..., 16 -> feature_18.
# IDs and ordering are the same as in the train feature cell, so feature_<n>
# has the same meaning in both frames.
MRS_MANUFACTURER_IDS = [113, 20, 49, 21, 24, 54, 25, 45, 18, 97, 55, 26, 53, 56, 44, 75, 64, 16]

# The salutation condition is identical for every feature, so evaluate it once.
test_is_mrs = test['salutation'] == 'Mrs'

# feature_<n> is 1 on rows whose salutation is 'Mrs' and whose manufacturerID equals
# the n-th ID above, and 0 on every other row (including rows where either value is missing,
# because comparisons against NaN evaluate to False).
for feature_number, manufacturer_id in enumerate(MRS_MANUFACTURER_IDS, start=1):
    test['feature_{}'.format(feature_number)] = np.where(
        (test['manufacturerID'] == manufacturer_id) & test_is_mrs, 1, 0
    )


test.head()

Unnamed: 0,orderItemID,orderDate,deliveryDate,itemID,size,color,manufacturerID,price,customerID,salutation,...,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18
0,1,2013-04-01,2013-04-03,2347,43,magenta,1,89.9,12489,Mrs,...,0,0,0,0,0,0,0,0,0,0
1,2,2013-04-01,2013-04-03,2741,43,grey,1,99.9,12489,Mrs,...,0,0,0,0,0,0,0,0,0,0
2,3,2013-04-01,2013-04-03,2514,9,ecru,19,79.9,12489,Mrs,...,0,0,0,0,0,0,0,0,0,0
3,4,2013-04-01,2013-05-06,2347,42,brown,1,89.9,12489,Mrs,...,0,0,0,0,0,0,0,0,0,0
4,5,2013-04-01,,2690,43,grey,1,119.9,12489,Mrs,...,0,0,0,0,0,0,0,0,0,0
