In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Defining the s3 bucket
## (no credentials are passed explicitly to boto3 in this cell)
s3 = boto3.resource('s3')
bucket_name = 'data-454'
bucket = s3.Bucket(bucket_name)

## Defining the files to be read from the s3 bucket (object keys inside the bucket)
file_key_train = 'Project_3/orders_train.txt'
file_key_test = 'Project_3/orders_class.txt'

## Requesting the train object and keeping the 'Body' entry of the response,
## which is handed to pd.read_csv at the end of this cell
bucket_object_train = bucket.Object(file_key_train)
file_object_train = bucket_object_train.get()
file_content_stream_train = file_object_train.get('Body')

## Same steps for the test object; its 'Body' is not read in this cell
## (the "Reading test file" cell consumes file_content_stream_test)
bucket_object_test = bucket.Object(file_key_test)
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

## Reading train file: fields are separated by ';' and '?' is parsed as a missing value
train = pd.read_csv(file_content_stream_train, sep = ';', na_values = '?')
## Previewing the first rows
train.head()

Unnamed: 0,orderItemID,orderDate,deliveryDate,itemID,size,color,manufacturerID,price,customerID,salutation,dateOfBirth,state,creationDate,returnShipment
0,1,2012-04-01,2012-04-03,186,m,denim,25,69.9,794,Mrs,1965-01-06,Baden-Wuerttemberg,2011-04-25,0
1,2,2012-04-01,2012-04-03,71,9+,ocher,21,69.95,794,Mrs,1965-01-06,Baden-Wuerttemberg,2011-04-25,1
2,3,2012-04-01,2012-04-03,71,9+,curry,21,69.95,794,Mrs,1965-01-06,Baden-Wuerttemberg,2011-04-25,1
3,4,2012-04-02,,22,m,green,14,39.9,808,Mrs,1959-11-09,Saxony,2012-01-04,0
4,5,2012-04-02,1990-12-31,151,39,black,53,29.9,825,Mrs,1964-07-11,Rhineland-Palatinate,2011-02-16,0


In [None]:
## Reading test file (same ';' separator and '?' missing-value marker as the train file)
## NOTE(review): the S3 body opened in the loading cell is presumably single-use, so
## re-running this cell alone would find the stream already consumed -- TODO confirm
test = pd.read_csv(file_content_stream_test, sep = ';', na_values = '?')

## Previewing the first rows only, consistent with the train preview (train.head())
test.head()

## Initial Exploration

### Returns by Size

In [33]:
## Cross-tabulating item size against the return flag
## (the two-label assignment assumes returnShipment has exactly two values,
##  non-return first and return second)
size_return = pd.crosstab(train['size'], train['returnShipment'])
size_return.columns = ['No_Return', 'Return']

## Building the summary in one chain
size_return = (
    size_return
    ## Orders per size
    .assign(Row_Tot = lambda tab: tab['No_Return'] + tab['Return'])
    ## Share of each outcome within the size
    .assign(
        No_Return_pct = lambda tab: tab['No_Return'] / tab['Row_Tot'],
        Return_pct = lambda tab: tab['Return'] / tab['Row_Tot'],
    )
    ## Highest return share first
    .sort_values(by = 'Return_pct', ascending = False)
    ## Copying the size labels from the index into a trailing column,
    ## then replacing the index with a fresh 0..n-1 range
    .assign(size = lambda tab: tab.index)
    .reset_index(drop = True)
)

## Selecting sizes with at least 20 orders and a return share above 50%
size_return.query('Row_Tot >= 20 and Return_pct > 0.5')

Unnamed: 0,No_Return,Return,Row_Tot,No_Return_pct,Return_pct,size
6,11,20,31,0.354839,0.645161,96
9,177,248,425,0.416471,0.583529,23
11,286,388,674,0.424332,0.575668,50
12,17,23,40,0.425,0.575,44+
13,1601,2150,3751,0.42682,0.57318,48
14,900,1192,2092,0.43021,0.56979,8
15,302,396,698,0.432665,0.567335,40+
16,457,598,1055,0.433175,0.566825,10
18,972,1238,2210,0.439819,0.560181,6
19,762,970,1732,0.439954,0.560046,7


### Returns by Manufacturer

In [39]:
## Cross-tabulating manufacturer against the return flag
## (the two-label assignment assumes returnShipment has exactly two values,
##  non-return first and return second)
manufacturer_return = pd.crosstab(train['manufacturerID'], train['returnShipment'])
manufacturer_return.columns = ['No_Return', 'Return']

## Building the summary in one chain
manufacturer_return = (
    manufacturer_return
    ## Orders per manufacturer
    .assign(Row_Tot = lambda tab: tab['No_Return'] + tab['Return'])
    ## Share of each outcome within the manufacturer
    .assign(
        No_Return_pct = lambda tab: tab['No_Return'] / tab['Row_Tot'],
        Return_pct = lambda tab: tab['Return'] / tab['Row_Tot'],
    )
    ## Highest return share first
    .sort_values(by = 'Return_pct', ascending = False)
    ## Copying the manufacturer ids from the index into a trailing column,
    ## then replacing the index with a fresh 0..n-1 range
    .assign(manufacturerID = lambda tab: tab.index)
    .reset_index(drop = True)
)

## Selecting manufacturers with at least 20 orders and a return share above 50%
manufacturer_return.query('Row_Tot >= 20 and Return_pct > 0.5')

Unnamed: 0,No_Return,Return,Row_Tot,No_Return_pct,Return_pct,manufacturerID
3,23,118,141,0.163121,0.836879,96
4,35,152,187,0.187166,0.812834,101
5,39,158,197,0.197970,0.802030,89
6,67,150,217,0.308756,0.691244,100
7,94,210,304,0.309211,0.690789,152
...,...,...,...,...,...,...
64,1220,1269,2489,0.490157,0.509843,23
65,2114,2182,4296,0.492086,0.507914,19
66,2486,2557,5043,0.492961,0.507039,31
67,4144,4260,8404,0.493099,0.506901,28
