In [4]:
# Import necessary libraries
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.client import Config

# Set sample size
n = 100000

In [5]:
# Read in kdd training data from S3
bucket = 'mlds-final-project-bucket'
prefix = 'kdd12/train/training_shuffled/'

## Create an S3 client
s3_client = boto3.client('s3', region_name='eu-west-2', config=Config(signature_version=UNSIGNED))

## List objects within a S3 bucket prefix and read to pandas for the first n rows
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
kdd12_data = pd.DataFrame()
rows = n
if 'Contents' in response:
    keys = [obj['Key'] for obj in response['Contents']][1:]
    for key in keys:
        if rows == 0:
            break
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        df = pd.read_csv(
            obj['Body'],
            nrows = rows,
            header=0,
            names=['Click','Impression','DisplayURL','AdID','AdvertiserID','Depth','Position','QueryID','KeywordID','TitleID','DescriptionID','UserID']
            )
        kdd12_data = pd.concat([kdd12_data, df])
        rows -= df.shape[0]
else:
    print("Folder is empty.")

## Save the data to a csv file
kdd12_data.to_csv('../../data/kdd12/kdd12_training.csv', index=False)

ClientError: An error occurred (AccessDenied) when calling the ListObjectsV2 operation: Access Denied

In [None]:
https://mlds-final-project-bucket.s3.eu-west-2.amazonaws.com/kdd12/train/training_shuffled/

In [6]:
import requests

In [12]:
page = requests.get("https://mlds-final-project-bucket.s3.eu-west-2.amazonaws.com/kdd12/train/training_shuffled")  

In [13]:
page.content

b'<?xml version="1.0" encoding="UTF-8"?>\n<Error><Code>AccessDenied</Code><Message>Access Denied</Message><RequestId>P7AMDH9CGAH54H3T</RequestId><HostId>AbtSiozWygYWNEvQLteNlOoyJOOxClewcan4sYGvxspAAw3mfcvSzTyDI/DmqyHZQat/ALHssAE=</HostId></Error>'

In [10]:
kdd12_subset = pd.read_csv('https://mlds-final-project-bucket.s3.eu-west-2.amazonaws.com/kdd12/train/training_shuffled/run-1718449016919-part-r-00000')

In [11]:
kdd12_subset.head()

Unnamed: 0,click,impression,display_url,ad_id,advertiser_id,depth,position,keyword_id,title_id,description_id,query_id,user_id
0,0,1,12057878999086460853,20157098,27961,1,1,75606,15055,12391,13532,1350148
1,0,1,12057878999086460853,20221208,27961,2,1,2977,1278,3054,4561,1350148
2,0,1,12057878999086460853,20183701,27961,1,1,18594855,227,543,642,1350148
3,0,1,12057878999086460853,20183690,27961,1,1,4260473,34048,175983,155050,1350148
4,0,1,3029113635936639912,10397010,24973,2,2,2977,1274,2570,26091,1350148


In [8]:
# Read in avazu training data from S3
bucket = 'mlds-final-project-bucket'
prefix = 'avazu/train_shuffled/'

## Create an S3 client
s3_client = boto3.client('s3')

## List objects within a S3 bucket prefix and read to pandas for the first n rows
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
avazu_training_data = pd.DataFrame()
rows = n
if 'Contents' in response:
    keys = [obj['Key'] for obj in response['Contents']][1:]
    for key in keys:
        if rows == 0:
            break
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        df = pd.read_csv(
            obj['Body'],
            nrows = rows
            )
        avazu_training_data = pd.concat([avazu_training_data, df])
        rows -= df.shape[0]
else:
    print("Folder is empty.")

## Save the data to a csv file
avazu_training_data.to_csv('../../data/avazu/avazu_train.csv', index=False)

In [10]:
# Read in criteo training data from S3
bucket = 'mlds-final-project-bucket'
prefix = 'dac/train_shuffled/'

## Create column name list as per the README file
criteo_names = ['click']
for i in range(1, 14):
    criteo_names.append('int_{0}'.format(i))

for i in range(1, 27):
    criteo_names.append('cat_{0}'.format(i))

## Create an S3 client
s3_client = boto3.client('s3')

## List objects within a S3 bucket prefix and read to pandas for the first n rows
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
criteo_training_data = pd.DataFrame()
rows = n
if 'Contents' in response:
    keys = [obj['Key'] for obj in response['Contents']][1:]
    for key in keys:
        if rows == 0:
            break
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        df = pd.read_csv(
            obj['Body'],
            nrows = rows,
            header=0,
            names=criteo_names
            )
        criteo_training_data = pd.concat([criteo_training_data, df])
        rows -= df.shape[0]
else:
    print("Folder is empty.")

## Save the data to a csv file
criteo_training_data.to_csv('../../data/criteo/criteo_train.csv', index=False)