# Date Time Features

In [5]:
import pandas as pd
df = pd.DataFrame(pd.date_range('2020-01-06', '2020-01-10', freq='10H'), columns =['datetime_column'])
df

Unnamed: 0,datetime_column
0,2020-01-06 00:00:00
1,2020-01-06 10:00:00
2,2020-01-06 20:00:00
3,2020-01-07 06:00:00
4,2020-01-07 16:00:00
5,2020-01-08 02:00:00
6,2020-01-08 12:00:00
7,2020-01-08 22:00:00
8,2020-01-09 08:00:00
9,2020-01-09 18:00:00


In [7]:
# year 
df.loc[:, "year"] = df['datetime_column'].dt.year
df.loc[:, "weekofyear"] = df['datetime_column'].dt.weekofyear
df.loc[:, "month"] = df['datetime_column'].dt.month
df.loc[:, "dayofweek"] = df['datetime_column'].dt.month
df.loc[:, "hour"] = df['datetime_column'].dt.hour
df.loc[:, "weekend"] = (df.datetime_column.dt.weekday>=5).astype(int)
df




Unnamed: 0,datetime_column,year,weekofyear,month,dayofweek,hour,weekend
0,2020-01-06 00:00:00,2020,2,1,1,0,0
1,2020-01-06 10:00:00,2020,2,1,1,10,0
2,2020-01-06 20:00:00,2020,2,1,1,20,0
3,2020-01-07 06:00:00,2020,2,1,1,6,0
4,2020-01-07 16:00:00,2020,2,1,1,16,0
5,2020-01-08 02:00:00,2020,2,1,1,2,0
6,2020-01-08 12:00:00,2020,2,1,1,12,0
7,2020-01-08 22:00:00,2020,2,1,1,22,0
8,2020-01-09 08:00:00,2020,2,1,1,8,0
9,2020-01-09 18:00:00,2020,2,1,1,18,0


# Numerical aggregrated features

In [22]:
# Datetime Features and Group By Features
df = pd.DataFrame({'date': pd.date_range('2020-01-01', '2020-01-10', freq='24H'),
                  'customer_id': [1,1,2,3,4,1,2,3,4,5],
                  'cat1': [2,4,5,6,12,3,56,1,2,3],
                  'cat2': [1224,1243,133,11,123,453,12,54,12,12],
                  'num1': [0.4, 0.3, 0.6, 0.4, 0.2, 0.2, 1.1, -0.6, 1, 45]})

def generate_features(df):
    df.loc[:, "year"] = df.date.dt.year
    df.loc[:, "month"] = df.date.dt.month
    df.loc[:, "weekofyear"] = df.date.dt.weekofyear
    df.loc[:, "month"] = df.date.dt.month
    df.loc[:, "dayofweek"] = df.date.dt.dayofweek
    df.loc[:, "weekend"] = (df.date.dt.weekday >=5).astype(int)
    
    # create an aggregrate dictionary
    aggs = {}
    aggs['month'] = ['nunique', 'mean']
    aggs['weekofyear'] = ['nunique', 'mean']
    aggs['num1'] = ['sum', 'min', 'max']
    aggs['customer_id'] = ['nunique']
    
    # we group by customer_id and calculate the aggregrates
    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    
    return agg_df

generate_features(df)
    
    
    

Unnamed: 0_level_0,customer_id,month,month,weekofyear,weekofyear,num1,num1,num1,customer_id
Unnamed: 0_level_1,Unnamed: 1_level_1,nunique,mean,nunique,mean,sum,min,max,nunique
0,1,1,1,2,1.333333,0.9,0.2,0.4,1
1,2,1,1,2,1.5,1.7,0.6,1.1,1
2,3,1,1,2,1.5,-0.2,-0.6,0.4,1
3,4,1,1,2,1.5,1.2,0.2,1.0,1
4,5,1,1,1,2.0,45.0,45.0,45.0,1


# list of values 

In [None]:
# transactions for a customer in a given period of time
import numpy as np
feature_dict = {}

# calculate mean
feature_dict['mean'] = np.mean(x)
feature_dict['max'] = np.max(x)
feature_dict['std'] = np.std(x)
feature_dict['var'] = np.var(x)
feature_dict['ptp'] = np.ptp(x)

# percentile features
feature_dict['percentile_10'] = np.percentile(x, 10)
feature_dict['percentile_75'] = np.percentile(x, 75)

# quantile features
feature_dict['quantile_10'] = np.percentile(x, 10)
feature_dict['quantile_75'] = np.percentile(x, 75)




-  The time series data (list of values) can be converted to a lot of features
- "tsfresh" python library
- URL: https://tsfresh.readthedocs.io/en/latest/index.html

# Polynomial Features

In [43]:
import numpy as np
# 2 columns and 100 rows
df = pd.DataFrame(np.random.rand(100,2), columns = [f"f_{i}" for i in range(1,3)])
df

# two degree polynomial features
# f_1, f_2, f1_f2, f1**2, f2**2

from sklearn import preprocessing

# initialize polynomial feature class object
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
pf.fit(df)

# create polynomial features
poly_feats = pf.transform(df)

# create a dataframe with all the features
num_feats = poly_feats.shape[1]
df_transformed = pd.DataFrame(poly_feats, 
                             columns = [f"f_{i}" for i in range(1, num_feats+1)])
df_transformed

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.053207,0.592453,0.002831,0.031523,0.351000
1,0.797546,0.229906,0.636080,0.183361,0.052857
2,0.533244,0.367698,0.284349,0.196073,0.135202
3,0.576727,0.194098,0.332614,0.111941,0.037674
4,0.380079,0.878431,0.144460,0.333873,0.771641
...,...,...,...,...,...
95,0.229904,0.563553,0.052856,0.129563,0.317592
96,0.967859,0.747436,0.936751,0.723413,0.558661
97,0.109721,0.411140,0.012039,0.045111,0.169036
98,0.681556,0.065279,0.464519,0.044491,0.004261


# Binning

- Convert number to categories
- pandas cut function

In [45]:
# create 10 bins of the numerical columns
df["f_bin_10"] = pd.cut(df["f_1"], bins=10, labels=False)
df["f_bin_100"] = pd.cut(df["f_1"], bins=100, labels=False)
df.loc[:, "f_3"] = np.random.randint(1000, 10000, 100)
print(f"Variance of column - f_3 --> {(df.f_3.var())}")

Variance of column - f_3 --> 6220671.482323232


In [46]:
df.f_3.apply(lambda x: np.log(1+x)).var()

0.33876831151432896

# Missing value imputation

- when dealing with real world data, you might encounter missing values in both categorical and numerical columns

- For categorical columns, keep it super simple. missing values --> NEW CATEGORY ("NONE")

- For Numerical column, replace with 0, mean, median, mode, consider group by

- Fancy way is to do using k-nearest neighbours


## KNN Imputer

In [1]:
import numpy as np
from sklearn import impute

X = np.random.randint(1, 15, (10, 6))
# convert the array to float
X = X.astype(float)

# randomly assign 10 elements to NaN (missing)
X.ravel()[np.random.choice(X.size, 10, replace=False)] = np.nan
print(X)
# use 2 nearest neighbours to fill values
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)
print(X)

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


[[ 9. nan nan  2.  8.  9.]
 [ 1.  6.  7.  2. 12. 10.]
 [ 1. 14.  3.  1. 12.  4.]
 [ 3. nan  9.  9.  8. nan]
 [14.  1.  6. nan  1. nan]
 [ 1. nan 11. 10.  2.  6.]
 [ 5.  8.  8.  6.  6.  5.]
 [11.  4. 10.  2. 10.  3.]
 [ 3.  3. 12.  8. 10. 14.]
 [ 4. nan nan  3. nan 12.]]
[[ 9. nan nan  2.  8.  9.]
 [ 1.  6.  7.  2. 12. 10.]
 [ 1. 14.  3.  1. 12.  4.]
 [ 3. nan  9.  9.  8. nan]
 [14.  1.  6. nan  1. nan]
 [ 1. nan 11. 10.  2.  6.]
 [ 5.  8.  8.  6.  6.  5.]
 [11.  4. 10.  2. 10.  3.]
 [ 3.  3. 12.  8. 10. 14.]
 [ 4. nan nan  3. nan 12.]]


  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
