In [6]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50) ## to display more columns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Handle on the S3 bucket that stores the project files
bucket_name = 'data-454'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Keys of the two pipe-delimited files inside the bucket
file_key_train, file_key_test = (
    'Project_2/transact_train.txt',
    'Project_2/transact_class.txt',
)

## Request the train object and keep a reference to its body stream
bucket_object_train = bucket.Object(file_key_train)
file_object_train = bucket_object_train.get()
file_content_stream_train = file_object_train.get('Body')

## Request the test object as well; its body stream is read in a later cell
bucket_object_test = bucket.Object(file_key_test)
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

## Parse the train stream: fields are separated by '|' and '?' marks a missing value
train = pd.read_csv(
    file_content_stream_train,
    sep='|',
    na_values='?',
)
train.head(10)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
0,1,6,5,0.0,1,59.99,59.99,59.99,1,59.99,59.99,59.99,,,,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
1,1,6,5,11.94,1,59.99,59.99,59.99,1,59.99,59.99,59.99,2.0,y,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
2,1,6,5,39.887,1,59.99,59.99,59.99,1,59.99,59.99,59.99,,y,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
3,2,6,5,0.0,0,,,,0,,,,2.0,y,completely orderable,,,,,,,,,y
4,2,6,5,15.633,0,,,,0,,,,,y,completely orderable,,,,,,,,,y
5,2,6,5,26.235,0,,,,0,,,,4.0,y,completely orderable,,,,,,,,,y
6,2,6,5,71.2,0,,,,0,,,,4.0,y,completely orderable,,,,,,,,,y
7,2,6,5,94.469,0,,,,0,,,,,y,completely orderable,,,,,,,,,y
8,3,6,5,181.477,9,29.99,29.99,89.97,1,29.99,29.99,29.99,,,,3.0,1800.0,475.0,302.0,12.0,45.0,1.0,11.0,y
9,3,6,5,297.018,11,9.99,29.99,109.95,2,9.99,29.99,39.98,,,,3.0,1800.0,475.0,302.0,12.0,45.0,1.0,11.0,y


In [11]:
## Show every row recorded for session number 7
train.loc[train['sessionNo'] == 7]

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
26,7,6,5,249.844,6,3.0,20.0,73.0,1,3.0,3.0,3.0,,,,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
27,7,6,5,268.713,6,3.0,20.0,73.0,1,3.0,3.0,3.0,1.0,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
28,7,6,5,274.297,6,3.0,20.0,73.0,1,3.0,3.0,3.0,2.0,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
29,7,6,5,286.562,6,3.0,20.0,73.0,1,3.0,3.0,3.0,2.0,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
30,7,6,5,300.32,6,3.0,20.0,73.0,1,3.0,3.0,3.0,,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
31,7,6,5,304.672,6,3.0,20.0,73.0,1,3.0,3.0,3.0,4.0,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
32,7,6,5,310.456,6,3.0,20.0,73.0,1,3.0,3.0,3.0,,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
33,7,6,5,510.161,9,3.0,20.0,82.0,2,3.0,3.0,6.0,,,,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
34,7,6,5,610.566,12,3.0,20.0,91.0,2,3.0,3.0,6.0,1.0,y,completely orderable,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y
35,7,6,5,619.672,13,3.0,20.0,94.0,3,3.0,3.0,9.0,,,,5.0,900.0,575.0,35.0,10.0,31.0,2.0,10.0,y


In [4]:
## Number of distinct session numbers (a missing value, if any, counts as one)
train['sessionNo'].nunique(dropna=False)

50000

In [7]:
## Reading the test csv file
## An S3 body stream can only be consumed once, so request the object again here.
## Without this, re-running the cell hands pandas an already-drained stream and
## the read fails; with it, the cell can be executed any number of times.
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

## Same layout as the train file: '|' separated, '?' marks a missing value
test = pd.read_csv(file_content_stream_test, sep = '|', na_values = '?')
test.head(10)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
0,1,18,7,136.833,3,39.99,39.99,79.98,1,39.99,39.99,39.99,2.0,y,completely orderable,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
1,1,18,7,189.984,3,39.99,39.99,79.98,1,39.99,39.99,39.99,,y,completely orderable,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
2,1,18,7,342.894,6,16.99,39.99,113.96,2,16.99,39.99,56.98,,,,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
3,1,18,7,411.051,8,16.99,39.99,149.94,3,16.99,39.99,74.97,,,,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
4,1,18,7,460.049,10,16.99,39.99,189.92,4,16.99,39.99,94.96,,,,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
5,1,18,7,471.502,10,16.99,39.99,189.92,4,16.99,39.99,94.96,1.0,y,completely orderable,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
6,1,18,7,560.026,11,16.99,39.99,207.91,5,16.99,39.99,112.95,,,,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
7,1,18,7,564.597,11,16.99,39.99,207.91,5,16.99,39.99,112.95,1.0,y,completely orderable,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
8,1,18,7,624.606,11,16.99,39.99,207.91,5,16.99,39.99,112.95,,y,completely orderable,25039.0,1300.0,489.0,188.0,5.0,49.0,1.0,65.0
9,2,18,7,133.321,7,34.99,34.99,69.98,1,34.99,34.99,34.99,,,,25040.0,1200.0,543.0,43.0,5.0,29.0,2.0,184.0


## Initial Exploration

In [3]:
## Share of rows per value of 'order', relative to the total number of rows
train['order'].value_counts().div(len(train))

y    0.67604
n    0.32396
Name: order, dtype: float64

In [5]:
## Summary statistics (count, mean, std, min, quartiles, max) of the train columns
train.describe()

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,...,bSumPrice,bStep,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
count,429013.0,429013.0,429013.0,429013.0,429013.0,426248.0,426248.0,426248.0,429013.0,423883.0,...,423883.0,237680.0,277915.0,275273.0,275273.0,275273.0,277915.0,277617.0,277915.0,277915.0
mean,25274.631293,14.617061,5.924839,1573.90164,24.140317,55.289127,146.663005,1189.248209,4.135168,67.625341,...,213.260809,2.292393,12184.130921,2486.35827,485.298449,135.557403,15.218016,44.919861,1.734556,79.883975
std,14441.366146,4.485914,0.79093,2427.123356,30.398164,148.879937,283.217841,3371.173815,4.451778,174.986371,...,459.389852,1.306963,7297.774184,3038.425813,131.02763,109.577139,34.892917,11.935945,0.443903,113.201967
min,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,17.0,1.0,3.0
25%,12731.0,11.0,5.0,225.07,5.0,6.99,25.95,137.96,1.0,8.99,...,39.99,1.0,5793.0,600.0,481.0,43.0,3.0,36.0,1.0,14.0
50%,25470.0,15.0,6.0,738.199,13.0,12.0,49.99,388.0,3.0,14.99,...,87.97,2.0,12045.0,1500.0,520.0,109.0,8.0,45.0,2.0,34.0
75%,37542.0,18.0,7.0,1880.265,31.0,29.99,99.99,1046.43,5.0,34.99,...,205.38,3.0,18350.0,4000.0,555.0,219.0,15.0,53.0,2.0,86.0
max,50000.0,23.0,7.0,21580.092,200.0,5999.99,6999.99,115742.0,108.0,6999.99,...,23116.88,5.0,25038.0,50000.0,638.0,600.0,868.0,99.0,3.0,738.0
