In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50) ## to display more columns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-454'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key_train = 'Project_2/transact_train.txt'
file_key_test = 'Project_2/transact_class.txt'

bucket_object_train = bucket.Object(file_key_train)
file_object_train = bucket_object_train.get()
file_content_stream_train = file_object_train.get('Body')

bucket_object_test = bucket.Object(file_key_test)
file_object_test = bucket_object_test.get()
file_content_stream_test = file_object_test.get('Body')

## Reading the train csv file
train = pd.read_csv(file_content_stream_train, sep = '|', na_values = '?')
train.head(10)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
0,1,6,5,0.0,1,59.99,59.99,59.99,1,59.99,59.99,59.99,,,,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
1,1,6,5,11.94,1,59.99,59.99,59.99,1,59.99,59.99,59.99,2.0,y,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
2,1,6,5,39.887,1,59.99,59.99,59.99,1,59.99,59.99,59.99,,y,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
3,2,6,5,0.0,0,,,,0,,,,2.0,y,completely orderable,,,,,,,,,y
4,2,6,5,15.633,0,,,,0,,,,,y,completely orderable,,,,,,,,,y
5,2,6,5,26.235,0,,,,0,,,,4.0,y,completely orderable,,,,,,,,,y
6,2,6,5,71.2,0,,,,0,,,,4.0,y,completely orderable,,,,,,,,,y
7,2,6,5,94.469,0,,,,0,,,,,y,completely orderable,,,,,,,,,y
8,3,6,5,181.477,9,29.99,29.99,89.97,1,29.99,29.99,29.99,,,,3.0,1800.0,475.0,302.0,12.0,45.0,1.0,11.0,y
9,3,6,5,297.018,11,9.99,29.99,109.95,2,9.99,29.99,39.98,,,,3.0,1800.0,475.0,302.0,12.0,45.0,1.0,11.0,y


In [2]:
## Changing labels to numbers
train['order_numb'] = np.where(train['order'] == 'y', 1, 0)
train.head()

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order,order_numb
0,1,6,5,0.0,1,59.99,59.99,59.99,1,59.99,59.99,59.99,,,,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y,1
1,1,6,5,11.94,1,59.99,59.99,59.99,1,59.99,59.99,59.99,2.0,y,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y,1
2,1,6,5,39.887,1,59.99,59.99,59.99,1,59.99,59.99,59.99,,y,completely orderable,1.0,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y,1
3,2,6,5,0.0,0,,,,0,,,,2.0,y,completely orderable,,,,,,,,,y,1
4,2,6,5,15.633,0,,,,0,,,,,y,completely orderable,,,,,,,,,y,1


In [34]:
def extracting_duration(data):
    
    ## Extracting sessions
    sessions = data['sessionNo'].unique()
    
    ## Defining data-frame to store results
    data_out = pd.DataFrame({'sessionNo': sessions})
    data_out['Duration'] = np.nan
    data_out['Buy'] = np.nan
    
    for i in range(0, data_out.shape[0]):
        
        temp = data[data['sessionNo'] == sessions[i]].reset_index(drop = True)
        
        ## Appending data 
        data_out.loc[i, 'Duration'] = temp['duration'].tail(1).values[0]
        data_out.loc[i, 'Buy'] = temp['order_numb'].tail(1).values[0]
    
    return data_out

In [18]:
a = train[train['sessionNo'] == 3].reset_index(drop = True)
a['duration'].tail(1).values[0]

341.61300000000006

In [None]:
test = extracting_duration(train)

In [30]:
test

Unnamed: 0,sessionNo,Duration,Buy
0,1,39.887,1.0
1,2,94.469,1.0
2,3,341.613,1.0
3,4,42.812,0.0
4,5,2816.046,1.0
...,...,...,...
49995,49996,,
49996,49997,,
49997,49998,,
49998,49999,,


In [4]:
len(train['sessionNo'].unique())

50000

In [None]:
train[train['sessionNo'] == 7]

In [None]:
len(train['sessionNo'].unique())

In [None]:
## Reading the test csv file
test = pd.read_csv(file_content_stream_test, sep = '|', na_values = '?')
test.head(10)

## Initial Exploration

In [None]:
train['order'].value_counts() / train['order'].shape[0]

In [None]:
train.describe()