# Multimodal feature generation

Example code for extracting features from several modalities: time, weather, coordinates (geo-semantic), text, and images.

For reference purposes.

In [None]:
# remove this, add to the designated section

import pandas as pd
import ast
from datetime import datetime, timedelta
import requests
import pandas as pd 
import numpy as np
import os
import gensim
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.linear_model import SGDClassifier
from string import punctuation
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm_notebook as tqdm
import math

# Load data

In [None]:
import pandas as pd

# Placeholder: load the raw records here (for example with pd.read_csv).
df = ...

# The feature cells below refer to the frame as `signal`, so bind that name
# to the loaded data as well.
signal = df

# Time
Create columns for time features

In [None]:

# Parse the timestamps, then derive one calendar feature per column.
signal['created_at'] = pd.to_datetime(signal['created_at'])
signal['weekday'] = signal['created_at'].map(lambda stamp: stamp.weekday())
for field in ('hour', 'week', 'dayofyear', 'month'):
    # Bind `field` as a default so each lambda reads its own attribute.
    signal[field] = signal['created_at'].map(
        lambda stamp, field=field: getattr(stamp, field)
    )

# Date as a YYYYMMDD string: the date part of the timestamp without dashes.
signal['date'] = pd.to_datetime(signal['created_at']).map(
    lambda stamp: str(stamp).split(' ')[0].replace('-', '')
)


Some classifiers require categorical features to be one-hot encoded. This can easily be done with pandas and scikit-learn, as shown in the code snippet below.

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [None]:
# Time features to one-hot encode; all other columns are kept unchanged.
one_hot = ['weekday', 'hour', 'month']

# Encode every listed column in a single pass on `signal`. Rebuilding the
# result from the untouched source frame once per column would keep only the
# encoding of the last column. The dummy columns are named
# 'time_<feature>_<value>' and are appended after the remaining columns.
signal = pd.get_dummies(
    signal,
    columns=one_hot,
    prefix=['time_' + c for c in one_hot],
)

# Weather

Look up the weather recorded at Schiphol Airport for the date and hour of each record.

Data source and information about columns can be found at: https://projects.knmi.nl/klimatologie/uurgegevens/selectie.cgi


In [None]:
# Read the KNMI export and remove the spaces from its column names so the
# columns can be addressed by their bare codes.
weather = pd.read_csv('knmi.csv')
weather.columns = [name.replace(' ', '') for name in weather.columns]

def get_weather_dict(date, hour, weather_df=None):
    """Return the weather measurements for one date and hour as a flat dict.

    Args:
        date: Date as a 'YYYYMMDD' string, compared against the 'YYYYMMDD'
            column after casting that column to str.
        hour: Hour of the day as an int; matched against 'HH' == hour + 1.
        weather_df: Weather table to search. Defaults to the module-level
            ``weather`` frame when omitted.

    Returns:
        A dict with one 'weather_<column>' entry per measurement column. The
        value comes from the first matching row, or is 0 when no row matches.
    """
    if weather_df is None:
        weather_df = weather

    columns = ['IX', 'M', 'R', 'S', 'O', 'Y', 'DD', 'FH', 'FF', 'FX',
               'T', 'TD', 'SQ', 'Q', 'DR', 'RH', 'P', 'U']

    # Combine both conditions into one mask; chaining two boolean selections
    # makes pandas re-align the second mask and emit a warning.
    matches = (
        (weather_df['YYYYMMDD'].astype(str) == date)
        & (weather_df['HH'].astype(int) == hour + 1)
    )
    weather_at_time = weather_df.loc[matches, columns].reset_index(drop=True)

    weather_dict = {}
    for key, values in weather_at_time.to_dict().items():
        # `values` maps row position -> value; position 0 is absent when no
        # row matched, in which case the feature falls back to 0.
        weather_dict['weather_' + key] = values.get(0, 0)
    return weather_dict

# Look up the weather for every record and add one column per measurement.
# The features are collected in a frame aligned on signal's own index, so
# this works for any index (not only one that contains the label 0) and adds
# nothing when `signal` is empty.
weather_features = pd.DataFrame(
    [get_weather_dict(date, hour) for date, hour in zip(signal['date'], signal['hour'])],
    index=signal.index,
)
for column in weather_features.columns:
    signal[column] = weather_features[column]

# Geo semantic

Creating a profile of the area around the given coordinates using data from https://maps.amsterdam.nl/open_geodata/?LANG=en

In [None]:

def haversine_est(lon1, lat1, lon2, lat2):
    """
    Approximate the distance in kilometres between two lon/lat points.

    Uses the equirectangular approximation instead of the full haversine
    formula: for short distances it is accurate enough and cheaper to
    compute. All four arguments are given in degrees and are converted to
    radians here. Note the argument order: longitude first, then latitude,
    for both the reference point (lon1/lat1) and the tested point
    (lon2/lat2).
    """
    lon_a = np.radians(lon1)
    lat_a = np.radians(lat1)
    lon_b = np.radians(lon2)
    lat_b = np.radians(lat2)

    # Scale the longitude difference by the cosine of the mean latitude.
    east_west = (lon_b - lon_a) * np.cos(0.5 * (lat_b + lat_a))
    north_south = lat_b - lat_a

    return 6371.0 * np.sqrt(east_west * east_west + north_south * north_south)

def generate_geo_dict(semantic_name, lat, lng, semantic_lats, semantic_lngs):
    """Summarise how close a set of reference points is to one location.

    Args:
        semantic_name: Label used in the generated feature names.
        lat: Latitude of the location, in degrees.
        lng: Longitude of the location, in degrees.
        semantic_lats: Latitudes of the reference points, in degrees.
        semantic_lngs: Longitudes of the reference points, in degrees.

    Returns:
        A dict with, for the prefix 'geo_<semantic_name>':
        '_nearest' (distance to the closest point, km), '_nearest_5_mean',
        '_nearest_10_mean' and '_nearest_100_mean' (mean distance of the
        closest 5 / 10 / 100 points, km) and '_within_25m', '_within_50m',
        '_within_100m', '_within_200m' (number of points closer than that
        radius). Without reference points the distances are NaN and the
        counts are 0.
    """
    prefix = 'geo_' + semantic_name
    point_lats = np.asarray(semantic_lats, dtype=float)
    point_lngs = np.asarray(semantic_lngs, dtype=float)

    # haversine_est takes (lon, lat) pairs, longitude first, for both points.
    # All reference points are evaluated in one vectorised call.
    distances = np.sort(np.atleast_1d(haversine_est(lng, lat, point_lngs, point_lats)))
    has_points = distances.size > 0

    # A nearest point more than 100 km away usually indicates bad coordinates.
    if has_points and distances[0] > 100:
        print('distance of ', distances[0])

    geo_dict = {}
    geo_dict[prefix + '_nearest'] = float(distances[0]) if has_points else float('nan')
    for k in (5, 10, 100):
        key = prefix + '_nearest_' + str(k) + '_mean'
        geo_dict[key] = float(np.mean(distances[:k])) if has_points else float('nan')

    # Radii are expressed in kilometres to match the distance unit.
    for label, radius_km in (('25m', 0.025), ('50m', 0.05), ('100m', 0.1), ('200m', 0.2)):
        geo_dict[prefix + '_within_' + label] = int(np.count_nonzero(distances < radius_km))

    return geo_dict
    
def _coordinates_as_floats(values):
    """Return a coordinate column as a float array, accepting decimal commas."""
    if pd.api.types.is_numeric_dtype(values):
        return values.to_numpy(dtype=float)
    as_text = values.astype(str).str.replace(',', '.', regex=False)
    return pd.to_numeric(as_text).to_numpy(dtype=float)


def load_geo_features(signal, description, column, csv, sep, minimal_number=100):
    """Add proximity features for every frequent category of a geo data set.

    Reads ``csv`` (which must contain 'LAT' and 'LNG' columns), keeps the
    values of ``column`` that occur more than ``minimal_number`` times and,
    for each of them, adds the features produced by ``generate_geo_dict`` to
    ``signal`` under the name '<description>_<category>'.

    Args:
        signal: DataFrame with 'lat' and 'lng' columns; modified in place.
        description: Name of the data set, lower-cased for the feature names.
        column: Column of the CSV holding the category of each point.
        csv: Path of the CSV file.
        sep: Field separator of the CSV file.
        minimal_number: Categories with this many rows or fewer are skipped.

    Returns:
        ``signal`` with the additional feature columns.
    """
    description = description.lower()
    df = pd.read_csv(csv, sep=sep)

    # add more kinds of data, trees, bars, benches, on water or not
    # Filter on the counts Series directly: the column name produced by
    # value_counts().to_frame() differs between pandas versions.
    counts = df[column].value_counts()
    keep = list(counts[counts > minimal_number].index)
    print(len(keep), 'loading types of ', csv)

    for t in keep:
        points = df[df[column] == t]
        print(t, len(points))
        lats = _coordinates_as_floats(points['LAT'])
        lngs = _coordinates_as_floats(points['LNG'])

        name = description + '_' + str(t).lower()
        # Build the features aligned on signal's own index so the result does
        # not depend on the index containing the label 0.
        features = pd.DataFrame(
            [generate_geo_dict(name, lat, lng, lats, lngs)
             for lat, lng in zip(signal['lat'], signal['lng'])],
            index=signal.index,
        )
        for c in features.columns:
            signal[c] = features[c]
    return signal

# change to a single file, and add file
# One entry per geo data set: (description, category column, csv file, separator).
GEO_SOURCES = (
    ('container', 'waste_name', 'containers_with_type.csv', ','),
    ('trees', 'Soortnaam_NL', 'soorten_bomen.csv', ','),
    ('business', 'FUNCTIE2_OMS', 'FUNCTIEKAART.csv', ';'),
    ('monument', 'Status', 'MONUMENTEN.csv', ';'),
)

# Add the proximity features of each data set in turn, keeping only
# categories that occur more than 100 times.
for geo_source in GEO_SOURCES:
    signal = load_geo_features(signal, *geo_source, 100)




# Textual features

Create word count features for the 5000 most frequent words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords


def get_top_n_words(corpus, n=None, language='dutch'):
    """Return the most frequent words of a corpus with their total counts.

    Args:
        corpus: Iterable of text documents.
        n: Number of words to return; None returns every word.
        language: NLTK stop-word list to exclude; defaults to 'dutch'.

    Returns:
        A list of (word, count) tuples sorted by descending count.
    """
    # CountVectorizer expects the stop words as a list, not a set.
    # A tf-idf vectorizer could be used here instead.
    vec = CountVectorizer(stop_words=list(stopwords.words(language)))
    bag_of_words = vec.fit_transform(corpus)

    # Column sums give the total number of occurrences of each word.
    sum_words = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, int(sum_words[idx])) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

words_count = get_top_n_words(signal['text'], 5000)
top_words = [word for word, _ in words_count]

# Count whole tokens with the same tokenisation that selected the words.
# Substring counting would also match inside longer words and inflate the
# counts of short words. CountVectorizer lower-cases the text itself.
token_counter = CountVectorizer(vocabulary=top_words)
token_counts = token_counter.transform(signal['text'])

# Build all count columns at once instead of inserting thousands of columns
# one by one, which fragments the frame.
text_features = pd.DataFrame(
    token_counts.toarray(),
    index=signal.index,
    columns=['text_' + word for word in top_words],
).astype(int)

# The raw text is replaced by its count features.
signal = pd.concat([signal.drop(columns='text'), text_features], axis=1)

# Visual features

Extracting the top 50 predicted visual concepts (ImageNet classes) for each image


In [None]:
from keras import backend as K
from keras.applications.resnet50 import ResNet50, decode_predictions, preprocess_input
from keras.preprocessing import image
from tqdm import tqdm
tqdm.pandas()


# ResNet50 with the ImageNet weights, including the classification layer.
model = ResNet50(weights='imagenet')

# Directory holding the images, stored as '<id>.jpg'. Fill in before running.
images_path = ''

# Ids of the available images. Only '<number>.jpg' files are considered, so
# other files in the directory do not break the int conversion.
images_ids = [
    int(stem)
    for stem, extension in map(os.path.splitext, os.listdir(images_path))
    if extension == '.jpg' and stem.isdigit()
]


def get_image_output_layer(x):
    """Return the top 50 decoded model predictions for one image.

    Args:
        x: Id of the image; the file is expected at '<images_path>/<x>.jpg'.

    Returns:
        The first (and only) entry of ``decode_predictions(preds, top=50)``
        for the image, or '' when the id is not in ``images_ids``.
    """
    if x not in images_ids:
        return ''

    # os.path.join adds the separator when images_path lacks a trailing one.
    image_path = os.path.join(images_path, str(x) + '.jpg')
    img = image.load_img(image_path, target_size=(224, 224))

    # The model expects a batch, so add a leading batch axis before the
    # model-specific preprocessing.
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    batch = preprocess_input(batch)

    preds = model.predict(batch)
    return decode_predictions(preds, top=50)[0]

# Store the predictions on `signal`, the frame that holds all other
# features. The former debug lookup of the second image id is dropped: it
# was unused and failed when fewer than two images were present.
signal['image_objects_50'] = signal['id'].progress_apply(get_image_output_layer)

# Visual 2048 features
This function converts an image to a 2048-dimensional representation using a ResNet50 model pre-trained on ImageNet

Getting last layer: https://keras.io/getting-started/faq/#how-can-i-obtain-the-output-of-an-intermediate-layer

The hidden representation of the last layer (before the softmax) is often taken as a feature vector, as it contains higher level features

https://github.com/keras-team/keras-applications/blob/master/keras_applications/resnet50.py

In [1]:

# Backend functions keyed by model identity, so the function is built once
# per model instead of on every call.
_PENULTIMATE_FN_CACHE = {}


def _penultimate_layer_fn():
    """Return the backend function mapping the model input to its second-to-last layer."""
    key = id(model)
    if key not in _PENULTIMATE_FN_CACHE:
        _PENULTIMATE_FN_CACHE[key] = K.function(
            [model.layers[0].input], [model.layers[-2].output]
        )
    return _PENULTIMATE_FN_CACHE[key]


def get_layer_before_softmax(x):
    '''
    Return the output of the layer before the softmax for one image.

    Inputs an image id; the file is expected at '<images_path>/<x>.jpg'.
    Outputs the activations of the model's second-to-last layer for that
    image, with a leading batch axis of size 1, or a 1 x 2048 list of zeros
    when the id is not in images_ids.
    '''

    if x not in images_ids:
        return [[0]*2048]

    # load image setting the image size to 224 x 224
    img = image.load_img(os.path.join(images_path, str(x) + ".jpg"), target_size=(224, 224))
    # convert image to numpy array
    # NOTE(review): presumably (224, 224, 3) with Keras' default channel
    # order - confirm against the backend configuration.
    pixels = image.img_to_array(img)
    # the model expects a batch, so add a leading batch axis
    pixels = np.expand_dims(pixels, axis=0)
    pixels = preprocess_input(pixels)

    layer_output = _penultimate_layer_fn()
    return layer_output([pixels])[0]
