In [3]:
import os
import numpy as np
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import pickle
import csv
import time
import sys
import math
import random
import seaborn as sns
from collections import defaultdict
from IPython.display import display, clear_output

from sklearn.decomposition import PCA

# Some numbers that take a while to compute; they are hard-coded here so the
# computation does not have to be run again.
nrows_train = 37670294  # presumably the number of rows of the train file -- TODO confirm
nrows_test = 2528244  # presumably the number of rows of the test file -- TODO confirm
nuser_ids = 1198787  # presumably the number of distinct user_id values -- TODO confirm
nbookings_train = 3000694  # presumably the number of booking rows in train -- TODO confirm

# Paths are built from the directory the notebook is run from.
base_path = os.getcwd()
data_path = os.path.join(base_path, 'data/')

In [4]:
# 1 million lines of train require 192MB of memory

def read_csv_in_gzip(filename, beginning=0, end=1000000):
    """Read a slice of data rows from a gzip-compressed csv file.

    The first line of the file is always used as the header. `beginning`
    and `end` are 0-based positions among the data rows (`end` excluded),
    so at most `end - beginning` rows are returned.

    Parameters
    ----------
    filename : path of the gzip-compressed csv file.
    beginning : number of data rows to skip after the header.
    end : position of the first data row that is NOT read.

    Returns
    -------
    pandas.DataFrame with the requested rows.
    """
    nrows = end - beginning
    with gzip.open(filename) as fzip:
        # An integer `skiprows` would discard the header line as well, so the
        # rows to skip are listed explicitly, starting right after the header.
        df = pd.read_csv(fzip, skiprows=range(1, beginning + 1), nrows=nrows)

    return df


def update_userid_keys(id_list, df):
    """Merge the user_id values found in `df` into `id_list`.

    Returns a new list holding every distinct id from `id_list` and from
    the 'user_id' column of `df`; the order of the result is whatever the
    intermediate set yields.
    """
    incoming = list(df['user_id'].values)
    distinct = set(id_list + incoming)

    return list(distinct)


def find_all_id(filename):
    """Return a list with all the distinct user_id values in the gzip csv `filename`.

    The file is read in chunks of 10**5 rows and a progress line is printed
    every 50 chunks. The order of the returned list is arbitrary.
    """
    chunk_size = 10 ** 5
    # A single running set avoids rebuilding the whole id list on every chunk.
    seen_ids = set()
    with gzip.open(filename) as fzip:
        for chunk_counter, chunk in enumerate(pd.read_csv(fzip, chunksize=chunk_size)):
            if chunk_counter % 50 == 0:
                print('Processing chunk n. {} (chunk size equal to {})'.format(chunk_counter, chunk_size))
            seen_ids.update(chunk['user_id'].values)

    return list(seen_ids)


def write_list_to_txt(lista, filename, append=True):
    """Write the items of `lista` to the text file `filename`, one per line.

    If `append` is true and the file already exists, the items are appended
    at the bottom of the file; otherwise the file is created or overwritten.
    """
    # The flag used to be ignored: the file was always opened in append mode.
    mode = 'a' if append else 'w'
    with open(filename, mode) as w_file:
        for item in lista:
            w_file.write("%s\n" % item)


def count_lines(filename, is_zip=False, print_exponent=20):
    """Return the number of lines in `filename`.

    Parameters
    ----------
    filename : path of the file to scan.
    is_zip : if true the file is opened with gzip, otherwise as plain text.
    print_exponent : a progress line is printed every 2 ** print_exponent
        lines (starting at line 0); pass None to disable progress output.

    Returns
    -------
    int, 0 for an empty file.
    """
    opener = gzip.open if is_zip else open
    # Initialised up front so that an empty file yields 0 instead of failing
    # on an unbound loop variable.
    n_lines = 0
    with opener(filename) as handle:
        for i, _ in enumerate(handle):
            if print_exponent is not None and i % (2 ** print_exponent) == 0:
                clear_output(wait=True)
                print('got to line', i)
                sys.stdout.flush()
            n_lines = i + 1
    return n_lines


def count_files(directory_path):
    """Return how many entries os.listdir reports for `directory_path`."""
    entries = os.listdir(directory_path)
    return len(entries)

In [5]:
def get_magnitude(number):
    """Pick a display unit for `number`.

    Returns a pair `(unit, etodivide)`: `unit` is the suffix to print
    ('', 'K', 'M' or 'G') and `etodivide` the power of ten the number has
    to be divided by before printing it with that suffix.

    Non-positive numbers have no logarithm and are reported as ('', 0).
    Numbers outside the '' .. 'G' range fall back to an 'E<log10>' suffix
    with the (float) base-10 logarithm as exponent.
    """
    if number <= 0:
        return '', 0

    # math.log(1000, 10) evaluates to 2.9999999999999996, which used to put
    # exact powers of 1000 in the unit below; log10 is exact for them.
    magnitude = math.log10(number)
    n_unit = int(magnitude / 3)

    known_units = {0: ('', 0), 1: ('K', 3), 2: ('M', 6), 3: ('G', 9)}
    if n_unit in known_units:
        return known_units[n_unit]

    return 'E' + str(magnitude), magnitude


def print_processing_status(current_line, tot_line, s_time, frequency=None, pre_message=None):
    """Print a one-line progress report, replacing the previous cell output.

    Parameters
    ----------
    current_line : index of the line being processed.
    tot_line : total number of lines to process.
    s_time : time.time() value taken when the processing started.
    frequency : a report is printed only when `current_line` is a multiple
        of this value (and greater than 1). Defaults to 1% of `tot_line`,
        but never less than 1.
    pre_message : optional text printed above the progress line.
    """
    if not frequency:
        # int(tot_line / 100) is 0 for fewer than 100 lines, which used to
        # make the modulo below raise ZeroDivisionError.
        frequency = max(1, int(tot_line / 100))

    if current_line <= 1 or current_line % frequency != 0:
        return

    message = 'Processing line {0:.1f}{4} (of {1:.1f}{4}). \t Elapsed time: {2:.0f} secs, \t ETA: {3:.0f} secs.'
    unit, etodivide = get_magnitude(tot_line)
    loc_time = time.time() - s_time

    clear_output(wait=True)
    if pre_message is not None:
        print(pre_message)
    # ETA assumes the remaining lines take as long per line as the ones done so far.
    print(message.format(current_line / (10 ** etodivide), tot_line / (10 ** etodivide),
          loc_time, (tot_line / current_line - 1) * loc_time, unit))
    sys.stdout.flush()
        
        
def convert_time(seconds):
    """Format a duration in seconds as '<h>h<m>m', or '<m>m<s>s' below one hour.

    Fractional seconds are truncated. All parts are integers, so the result
    reads '1h2m' rather than '1.0h2.0m'.
    """
    total_mins, secs = divmod(int(seconds), 60)
    hours, mins = divmod(total_mins, 60)
    if hours != 0:
        return str(hours) + 'h' + str(mins) + 'm'
    else:
        return str(mins) + 'm' + str(secs) + 's'

In [5]:
def write_in_csv(filename, df, columns=None, replace=False, print_status=True):
    """
    Writes a dataframe to a csv file.

    Parameters
    ----------
    filename : path of the csv file to write.
    df : the dataframe whose rows are written (all of its columns).
    columns : the names written as header line when a new file is started;
        defaults to `df.columns`.
    replace : if true the file is overwritten, otherwise the rows are
        appended to the end of it. The header line is written only when
        the file is new, empty or being replaced.
    print_status : if true a progress line is printed while writing.
    """
    if columns is None:
        columns = df.columns

    if replace:
        write_mode = 'w'
        already_exists = False
    else:
        write_mode = 'a'
        # An existing but empty file still needs its header line.
        already_exists = os.path.isfile(filename) and os.path.getsize(filename) > 0

    nlines = len(df)
    # Report roughly every 1% of the rows; never 0, which would break the
    # modulo inside the progress helper for small dataframes.
    frequency = max(1, nlines // 100)

    start_time = time.time()
    # newline='' is what the csv module expects from the file it writes to.
    with open(filename, write_mode, newline='') as f:
        scrivi = csv.writer(f, delimiter=',')
        if not already_exists:
            scrivi.writerow(columns)
        # Positional iteration over the values replaces the removed `.ix`
        # label lookup and also copes with duplicated index labels.
        for count, row in enumerate(df.values):
            scrivi.writerow(row)
            if print_status:
                print_processing_status(count, nlines, start_time, frequency=frequency)

In [6]:
def split_csv_for_userid(df):
    """
    Splits a dataframe according to the user_id key.

    Returns a dictionary whose keys are the user_id values and whose values
    are the sub-dataframes (original index and columns preserved) holding
    the rows of that user.
    """
    # A single groupby pass replaces one boolean-mask scan per user and the
    # `.ix` indexer, which no longer exists in current pandas.
    return {user_id: rows for user_id, rows in df.groupby('user_id')}


def create_users_file(bigzipfile, basename='user_', location=os.path.join(os.getcwd(), 'data/'), n_tot_lines=0):
    """
    Process the bigzipfile, and creates a csv file for each user_id key, with the corresponding lines.

    Parameters
    ----------
    bigzipfile : path of the gzip-compressed csv file to split.
    basename : prefix of the per-user files, named '<basename><user_id>.csv'.
    location : directory the per-user files are written to.
    n_tot_lines : total number of lines of the input, used only to print
        how many chunks are expected.

    The input is read in chunks of 10**5 rows; rows of a user found in a
    later chunk are appended to that user's file.
    """
    chunk_size = 10 ** 5
    n_chunks = int(n_tot_lines / chunk_size) + 1
    print('Beginning to read:')
    with gzip.open(bigzipfile) as fzip:
        for chunk_counter, chunk in enumerate(pd.read_csv(fzip, chunksize=chunk_size), start=1):
            print('Processing chunk n. {} (of {})'.format(chunk_counter, n_chunks))
            local_splitted = split_csv_for_userid(chunk)
            for user_id, user_rows in local_splitted.items():
                # os.path.join works whether or not `location` ends with a separator.
                filename = os.path.join(location, basename + str(user_id) + '.csv')
                write_in_csv(filename, user_rows, user_rows.columns)

In [6]:
def sample_users(nusers, nsamples):
    """
    Selects a random sample of user_ids.

    `nsamples` distinct positions are drawn among the first `nusers - 1`
    entries of the directory listing of `udata_path`; each user id is parsed
    from the corresponding file name by dropping its first 5 and last 4
    characters. Returns a numpy integer array of length `nsamples`.
    """
    # NOTE(review): `udata_path` is not defined in the visible part of the
    # notebook -- it has to exist at module level before this is called.
    # NOTE(review): range(nusers - 1) can never pick position nusers - 1;
    # confirm whether leaving the last entry out is intended.
    positions = list(range(nusers - 1))
    random.shuffle(positions)
    chosen = [positions.pop() for _ in range(nsamples)]

    all_filenames = np.array(os.listdir(udata_path))
    # presumably the names look like 'user_<id>.csv' -- verify against the files on disk
    sample_ids = np.array([int(all_filenames[pos][5:-4]) for pos in chosen], dtype=int)

    return sample_ids


def write_sample_train(sample_ids, train_filename='train.csv.gz', out_filename='data/train_sample.csv'):
    """
    Copies to `out_filename` the header of the gzip csv `train_filename` and
    every row whose user id belongs to `sample_ids`.

    Parameters
    ----------
    sample_ids : iterable of integer user ids to keep.
    train_filename : path of the gzip-compressed csv file to read.
    out_filename : path of the (uncompressed) csv file to write.

    The user id is read from the column named 'user_id' in the header; if
    the header has no such name, column 7 is used.
    """
    # A set makes each membership test constant-time, so there is no need to
    # remember whether the previous row of the same user was written.
    wanted_ids = set(int(uid) for uid in sample_ids)
    user_id_col = 7

    with gzip.open(train_filename, 'rt', newline='') as infile:
        with open(out_filename, 'w', newline='') as outfile:
            legge = csv.reader(infile)
            scrive = csv.writer(outfile, delimiter=',')

            start_time = time.time()
            for i, row in enumerate(legge):
                if i > 2:
                    print_processing_status(i, nrows_train, start_time, frequency=50000, pre_message='Writing...')

                if i == 0:
                    print('Initializing...')
                    scrive.writerow(row)
                    if 'user_id' in row:
                        user_id_col = row.index('user_id')
                elif int(row[user_id_col]) in wanted_ids:
                    scrive.writerow(row)

            print('Done!')

In [9]:
def split_df_onfile(amount, amount2=0, infilename='data/train_booking_lines.csv', printfile=False, 
                    printcomplement=False, filename1='data/cv_data.csv', filename2='data/train_data.csv'):
    """
    Selects at random, from the dataframe stored in the csv file infilename,
    a subset of `amount` rows and a second, disjoint subset of `amount2` rows.

    If printfile, the first subset is written to the csv file filename1 and
    nothing is returned; otherwise the sorted row positions of the first
    subset and the corresponding dataframe are returned.
    If printfile and printcomplement, the second subset is also written, to
    filename2.
    NOTE(review): the second subset holds `amount2` rows, not every row left
    out of the first one -- confirm that this is what "complement" means here.
    """
    
    print('Loading file...')
    df_original = pd.read_csv(infilename)
    nlines0 = len(df_original)
    print('Done!')
    
    print('Indices selection...')
    indici_s = list(range(nlines0))
    random.shuffle(indici_s)
    
    # Positions are consumed from the end of the shuffled list: the first
    # `amount` go to the first subset, the next `amount2` to the second one.
    draw_order = indici_s[::-1]
    indici1 = sorted(draw_order[:amount])
    indici2 = draw_order[amount:amount + amount2]
    if printcomplement:
        indici2 = sorted(indici2)
    print('Done!')
    
    print('Dataframe splittin...')
    # The positions index a freshly read file, so they are positional (`iloc`).
    df_split1 = df_original.iloc[indici1]
    if printcomplement:
        df_split2 = df_original.iloc[indici2]
    print('Done!')
    
    if printfile:
        print('Writing file 1...')
        write_in_csv(filename=filename1, df=df_split1, replace=True, columns=df_split1.columns)
        print('Done!')
        
        if printcomplement:
            print('Writing file 2...')
            write_in_csv(filename=filename2, df=df_split2, replace=True, 
                         columns=df_split2.columns)
            print('Done!')
    else:
        # These used to be the undefined names `indici` and `cv_df`.
        return indici1, df_split1

In [11]:
def print_cluster_dict_to_file(cdict, filename):
    """
    Writes a dictionary of dictionaries to a text file.

    cdict maps a first-level key (the label you want to have an idea of) to
    a second-level dictionary, whose keys are the hotel clusters and whose
    values tell how many hotels of that cluster appear with the first label.

    For each first-level key a row '<key1>,<number of second-level keys>' is
    written, followed by one row '<key2>,<value>' per second-level entry.
    Keys of any type are accepted; they are written as text.
    """
    
    with open(filename, 'w', newline='') as ofile:
        # The csv writer converts non-string keys (string concatenation used
        # to raise on them) and quotes any key that contains a comma.
        scrive = csv.writer(ofile, lineterminator='\n')
        for k1, inner in cdict.items():
            scrive.writerow([k1, len(inner)])
            for k2, count in inner.items():
                scrive.writerow([k2, count])
    

def read_cluster_dict_from_file(filename):
    """
    Reads back a two-level dictionary from a csv text file.

    The file is a sequence of groups: a row '<key1>,<n>' followed by n rows
    '<key2>,<value>'. Keys are kept as strings, values are converted to int.
    Returns a defaultdict of defaultdicts whose missing entries read as 0.
    If the file ends in the middle of a group, what was read so far is returned.
    """
    clusters = defaultdict(lambda: defaultdict(lambda: 0))
    with open(filename) as ifile:
        rows = csv.reader(ifile)
        try:
            for group_header in rows:
                outer_key = group_header[0]
                n_entries = group_header[1]
                for _ in range(int(n_entries)):
                    entry = next(rows)
                    inner_key = entry[0]
                    clusters[outer_key][inner_key] = int(entry[1])
        except StopIteration:
            # the rows ran out while a group was still being read
            pass

    return clusters

In [14]:
def create_dest_features(n_feat=3):
    """
    Compresses the columns d1 .. d149 of the file destinations.csv.gz into
    `n_feat` PCA components.

    Returns a dataframe with one column per component (named 0 .. n_feat-1)
    plus the "srch_destination_id" column of the original file.
    """
    destinations = read_csv_in_gzip('destinations.csv.gz')
    latent_columns = ["d{0}".format(k) for k in range(1, 150)]

    reducer = PCA(n_components=n_feat)
    components = reducer.fit_transform(destinations[latent_columns])

    dest_small = pd.DataFrame(components)
    dest_small["srch_destination_id"] = destinations["srch_destination_id"]
    return dest_small


def augment_df(df, dest_features=None):
    """
    Returns a new dataframe with the columns of df plus additional features.

    Parameters
    ----------
    df : dataframe with at least the columns "date_time", "srch_ci",
        "srch_co" and "srch_destination_id". It is not modified.
    dest_features : dataframe with a "srch_destination_id" column and the
        per-destination features to attach; defaults to the module-level
        `dest_small`.

    The result contains:
    - month, day, hour, minute, dayofweek, quarter of "date_time";
    - every column of df except "date_time", "srch_ci", "srch_co";
    - ci_* / co_* : month, day, dayofweek, quarter of check-in / check-out
      (dates that cannot be parsed become missing values);
    - stay_span : hours between check-in and check-out;
    - the columns of dest_features, matched on "srch_destination_id"
      (missing for destinations that are not in dest_features).
    """
    if dest_features is None:
        dest_features = dest_small

    # Parsed into local series so that the caller's dataframe is left untouched.
    date_time = pd.to_datetime(df["date_time"])
    srch_ci = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    srch_co = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(date_time.dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(srch_ci.dt, prop)
        props["co_{0}".format(prop)] = getattr(srch_co.dt, prop)
    # Casting to 'timedelta64[h]' is not supported by current pandas; the
    # number of hours is derived from the total seconds instead.
    props["stay_span"] = (srch_co - srch_ci).dt.total_seconds() / 3600
        
    ret = pd.DataFrame(props)
    
    # DataFrame.join matches `on` against the INDEX of the other frame, so the
    # destination ids have to be the index; joining on the default row index
    # would attach the features of unrelated destinations.
    dest_by_id = dest_features.set_index("srch_destination_id")
    ret = ret.join(dest_by_id, on="srch_destination_id", how='left')
    return ret

# Module-level destination features: augment_df reads this global when it
# joins the per-destination columns, so it must be defined before augment_df is called.
dest_small = create_dest_features()


def create_augmented_train_file(in_filename='train.csv.gz',
                                out_filename='data/train_augmented.csv',
                                tot_lines=nrows_train
                               ):
    """
    Creates the csv file out_filename with the rows of the gzip csv file
    in_filename augmented by augment_df.

    Parameters
    ----------
    in_filename : path of the gzip-compressed csv file to read.
    out_filename : path of the csv file to create; an existing file is
        overwritten.
    tot_lines : number of lines of the input, used only for the progress report.

    The input is processed in chunks of 2 ** 10 rows.
    """    
    
    chunk_size = 2 ** 10
    n_chunks = int(tot_lines / chunk_size) + 1
    start_time = time.time()
    
    with gzip.open(in_filename) as infile:
        reader = pd.read_csv(infile, chunksize=chunk_size)
        for i, chunk in enumerate(reader):
            print_processing_status(i, n_chunks, start_time, frequency=1)
            augmented = augment_df(chunk) 
            # The first chunk starts the file afresh, so running the function
            # again does not append a second copy of the data to an old output.
            write_in_csv(filename=out_filename, df=augmented, columns=augmented.columns,
                         replace=(i == 0))
    

In [None]:
# Augment the train set. The output name and the line count used to be the
# ones of the test set, which mixed train rows into the test output file.
create_augmented_train_file(in_filename='train.csv.gz',
                           out_filename= os.path.join(data_path, 'train_augmented.csv'),
                           tot_lines=nrows_train)

In [16]:
# Augment the test set: reads test.csv.gz and writes test_augmented.csv under data_path.
create_augmented_train_file(in_filename='test.csv.gz',
                           out_filename= os.path.join(data_path, 'test_augmented.csv'),
                           tot_lines=nrows_test)

Processing line 2.5K (of 2.5K). 	 Elapsed time: 1012 secs, 	 ETA: 0 secs.
