In [1]:
import sys
from xgboost import XGBClassifier, cv
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import xgboost
from joblib import dump, load
from datetime import datetime


In [None]:
# Load the train / test GeoJSON files as GeoDataFrames.
# `index_col` is a pandas.read_csv option, not a geopandas.read_file one: it was only
# forwarded to the I/O engine as an unknown driver option and never set the index,
# so it is dropped here.
train_df = gpd.read_file('train.geojson')
test_df = gpd.read_file('test.geojson')

In [None]:
# adding time differences
def dates2diff(df, date_format="%d-%m-%Y"):
    """Add the elapsed time between consecutive observation dates.

    Reads the string columns ``date1`` .. ``date5`` and writes ``diff1`` ..
    ``diff4``, where ``diffK`` is ``date(K+1) - dateK`` expressed in seconds
    (float).

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``date1`` .. ``date5``.
    date_format : str, optional
        ``strptime``-style format of the date strings.

    Returns
    -------
    DataFrame
        The same object that was passed in; the ``diff`` columns are added
        in place.

    Raises
    ------
    ValueError
        If a non-missing date does not match ``date_format``.

    Notes
    -----
    A missing date (None / NaN) yields NaN for every difference it takes
    part in, instead of failing on the whole frame.
    """
    # Parse every date column exactly once; missing values become NaT.
    parsed = [pd.to_datetime(df['date' + str(i)], format=date_format) for i in range(1, 6)]
    for k in range(1, 5):
        # NaT propagates through the subtraction and ends up as NaN seconds.
        df['diff' + str(k)] = (parsed[k] - parsed[k - 1]).dt.total_seconds()
    return df

# NOTE: dates2diff writes the diff1..diff4 columns into train_df itself and returns
# that same object, so df_aug1 is another name for train_df, not a copy.
df_aug1 = dates2diff(train_df)

In [None]:
# Integer code for each change-type label; codes follow the listing order, starting at 0.
change_type_map = {
    label: code
    for code, label in enumerate(
        ('Demolition', 'Road', 'Residential', 'Commercial', 'Industrial', 'Mega Projects')
    )
}

# strptime pattern of the date strings (day-month-year).
format_date = "%d-%m-%Y"

def extract_features_date(df, date_format=None, verbose=False):
    """Build per-row construction-progress features from the dated statuses.

    Each of the five ``change_status_dateK`` values is mapped to an integer
    advancement level (0 .. 5, see ``steps_tuple``). Whenever the level rises
    between two consecutive dates, every level crossed in that interval gets:

    * ``duration_to_reach<level>``: ``diffK`` floor-divided by the number of
      levels crossed in the interval;
    * ``month_of_advancement<level>``: the calendar month of the midpoint
      between the two dates.

    Levels that are never reached keep NaN. ``is_constructed`` is 1 when the
    status at ``date5`` is ``'Construction Done'`` and 0 otherwise.

    Parameters
    ----------
    df : DataFrame
        Needs ``change_status_date1..5``, ``date1..5`` (strings) and
        ``diff1..4`` (as produced by ``dates2diff``).
    date_format : str, optional
        ``strptime`` format of the date strings. Defaults to the module-level
        ``format_date``.
    verbose : bool, optional
        When True, emit the per-row debug prints. Off by default because they
        produce several lines for every row.

    Returns
    -------
    DataFrame
        Float frame indexed like ``df`` with the 11 feature columns.

    Side effects
    ------------
    Resets and updates the module-level counters ``n_na_found`` (number of
    unrecognised / missing statuses) and ``n_na_only_found`` (rows with no
    recognised status at all), then prints a one-line summary.

    Notes
    -----
    Missing statuses are forward-filled from the previous date; leading
    missing statuses take the first recognised status of the row. Only rows
    with no recognised status at all fall back to level 0 everywhere.
    """
    global n_na_found, n_na_only_found
    n_na_found = 0
    n_na_only_found = 0

    fmt = format_date if date_format is None else date_format
    nan = float("nan")

    # Statuses inside one tuple are considered the same level; the order of
    # the tuples is the real-world order of the construction steps.
    steps_tuple = [('Greenland', 'Land Cleared'),
                   ('Prior Construction',),
                   ('Materials Dumped',),
                   ('Construction Started', 'Excavation'),
                   ('Construction Midway',),
                   ('Construction Done',),
                   ]
    n_transitions = len(steps_tuple) - 1
    status_columns = ['change_status_date' + str(i) for i in range(1, 6)]

    def step2av(step):
        """Return the advancement level of a status string, or None if unknown."""
        for level, names in enumerate(steps_tuple):
            if step in names:
                return level
        if verbose:
            print("NA DETECTED")
        return None

    def fill_missing(levels):
        """Forward-fill None levels; return None when no level is known."""
        known = [level for level in levels if level is not None]
        if not known:
            return None
        filled = []
        previous = known[0]  # leading gaps take the first known level
        for level in levels:
            if level is not None:
                previous = level
            filled.append(previous)
        return filled

    def midpoint_month(start, end):
        """Month of the instant halfway between two date strings (NaN if a date is missing)."""
        if not (isinstance(start, str) and isinstance(end, str)):
            return nan
        begin = datetime.strptime(start, fmt)
        finish = datetime.strptime(end, fmt)
        # Plain datetime arithmetic: no dependency on the local timezone / DST.
        return (begin + (finish - begin) / 2).month

    def augment_date(row):
        """Return (features, n_missing_statuses, all_statuses_missing) for one row."""
        # Whether construction is done at date5.
        is_constructed = int(row['change_status_date5'] == 'Construction Done')

        # Slot u holds the data about reaching level u + 1.
        duration_for_reaching = [nan] * n_transitions
        month_where_was = [nan] * n_transitions

        raw_levels = [step2av(row[column]) for column in status_columns]
        n_missing = raw_levels.count(None)

        levels = fill_missing(raw_levels)
        all_missing = levels is None
        if all_missing:
            if verbose:
                print("FULL NA DETECTED")
            # No usable status: treat the whole row as the first level.
            levels = [0] * len(raw_levels)

        # TODO: weight each crossed level by its mean duration (measured on
        # single-level advancements) instead of splitting the interval evenly.
        for k in range(len(levels) - 1):
            current, following = levels[k], levels[k + 1]
            if following <= current:
                continue
            if verbose:
                print("Nouvelle step détecté: de", current, 'à', following)
            # Several levels crossed between two dates share the interval evenly.
            share = row['diff' + str(k + 1)] // (following - current)
            # TODO: all levels crossed in one interval get the same month.
            month = midpoint_month(row['date' + str(k + 1)], row['date' + str(k + 2)])
            for u in range(current, following):
                duration_for_reaching[u] = share
                month_where_was[u] = month

        if verbose:
            print(duration_for_reaching)
            print(month_where_was)

        features = [is_constructed] + duration_for_reaching + month_where_was
        return features, n_missing, all_missing

    # Feature names.
    columns_names = (['is_constructed']
                     + ['duration_to_reach' + str(level) for level in range(1, n_transitions + 1)]
                     + ['month_of_advancement' + str(level) for level in range(1, n_transitions + 1)])

    # Augmented features: collect plain lists and build the frame once.
    rows = []
    for _, row in df.iterrows():
        features, n_missing, all_missing = augment_date(row)
        rows.append(features)
        n_na_found += n_missing
        if all_missing:
            n_na_only_found += 1

    df_augment = pd.DataFrame(rows, index=df.index, columns=columns_names, dtype=float)

    print(f"NA FOUND:{n_na_found}, NA ONLY FOUND: {n_na_only_found}")
    return df_augment

# Upper bound on the number of rows handed to the feature extraction below.
N_max = 50000
# The slice keeps only the first N_max rows of df_aug1, so df_aug2 has no
# features for any row beyond that.
df_aug2 = extract_features_date(df_aug1[:N_max])

