In [155]:
from datetime import datetime
import numpy as np
import pandas as pd

import string
import nltk
from nltk.tokenize import word_tokenize

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [176]:
def tokenize_sentence(s):
    """Lower-case ``s``, strip ASCII punctuation and split it into word tokens.

    Args:
        s: Text to tokenize. Anything that is not a ``str`` (``None``, the
            float NaN pandas uses for a missing cell, ...) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` on the cleaned text,
        or ``[]`` when nothing is left to tokenize.
    """
    # Missing values have no ``.lower()``; treat them as "no text".
    if not isinstance(s, str):
        return []
    # Delete every character listed in ``string.punctuation`` in a single pass.
    cleaned = s.lower().translate(str.maketrans("", "", string.punctuation))
    if not cleaned.strip():
        return []
    # The text was lower-cased above, so the tokens need no second ``.lower()``.
    return list(word_tokenize(cleaned))

def vectorize(ts, wv=None):
    """Sum the word vectors of the tokens in ``ts``.

    Args:
        ts: Iterable of tokens to look up.
        wv: Object exposing ``get_vector(token)`` and ``vector_size``.
            Defaults to ``model.wv`` of the module-level ``model``, which is
            resolved when the function is called, not when it is defined.

    Returns:
        numpy.ndarray: Element-wise sum of the token vectors, or a zero
        vector of length ``wv.vector_size`` when ``ts`` is empty.

    Any exception raised by ``wv.get_vector`` for a token propagates unchanged.
    """
    if wv is None:
        wv = model.wv
    vectors = [wv.get_vector(token) for token in ts]
    if not vectors:
        # ``sum([])`` is the int 0, which is not a vector and breaks any
        # caller expecting one fixed-width array per row.
        return np.zeros(wv.vector_size, dtype=np.float32)
    return sum(vectors)

In [116]:
# Columns removed from the frame by the cleaning cell (`df.drop(columns=to_drop)`).
to_drop = [
    'number', 'street', 'side', 'city', 'county', 'state',
    'zipcode', 'country', 'timezone', 'airport_code', 'weather_timestamp',
]

# Columns the cleaning cell casts to np.uint8, assembled from two groups.
_flag_columns = [
    "amenity", "bump", "crossing", "give_way", "junction", "no_exit",
    "railway", "roundabout", "station", "stop", "traffic_calming",
    "traffic_signal", "turning_loop",
]
_daylight_columns = [
    "sunrise_sunset", "civil_twilight",
    "nautical_twilight", "astronomical_twilight",
]
to_cast = _flag_columns + _daylight_columns

# Integer codes for the Day/Night labels; the cleaning cell falls back to -1
# for any label that is not a key here.
sunset_twilight = dict(Day=1, Night=0)

In [115]:
# Load the raw training data; the two "Unnamed" columns are dropped immediately.
df = pd.read_csv("train.csv").drop(columns=['Unnamed: 0.1','Unnamed: 0'])
df_cpy = df.copy()  # snapshot taken before any of the cleaning below

df.columns = df.columns.str.lower()

df = df.drop(columns=to_drop)

# Parse both timestamps and expose month / day / hour of each as uint8 features.
# The `.dt` accessor replaces six copy-pasted per-row `apply(lambda ...)` calls.
for stamp in ('start', 'end'):
    df[f'{stamp}_time'] = pd.to_datetime(df[f'{stamp}_time'])
    for part in ('month', 'day', 'hour'):
        df[f'{stamp}_{part}'] = getattr(df[f'{stamp}_time'].dt, part).astype(np.uint8)

# Day/Night labels -> 1/0, anything else (including missing values) -> -1.
# These columns are stored as *signed* int8: casting them to uint8 together
# with the rest of `to_cast` would silently wrap the -1 sentinel to 255.
twilight_columns = ['sunrise_sunset','civil_twilight','nautical_twilight','astronomical_twilight']
for c in twilight_columns:
    df[c] = df[c].apply(lambda x: sunset_twilight.get(x, -1)).astype(np.int8)

for i in to_cast:
    if i in twilight_columns:
        continue  # already encoded as int8 above
    df[i] = df[i].astype(np.uint8)

df = df.join(pd.get_dummies(df.weather_condition, prefix='weather'))
# NOTE(review): the wind-direction dummies reuse the 'weather' prefix. A label
# shared by both columns would make this join raise on overlapping names. The
# prefix is left unchanged here because renaming alters the output schema.
df = df.join(pd.get_dummies(df.wind_direction, prefix='weather'))
df = df.drop(columns=['start_time', 'end_time', 'weather_condition', 'wind_direction'])

In [120]:
# Replace the "(unit)" suffixes in column names with plain snake_case suffixes.
unit_column_names = {
    "distance(mi)": "distance_mi",
    "temperature(f)": "temperature_f",
    "wind_chill(f)": "wind_chill_f",
    "humidity(%)": "humidity_perc",
    "pressure(in)": "pressure_in",
    "visibility(mi)": "visibility_mi",
    "wind_speed(mph)": "wind_speed_mph",
    "precipitation(in)": "precipitation_in",
}
df = df.rename(columns=unit_column_names)

In [None]:
# Tokenize every description; each token list becomes one training document.
sentences = df.description
tokenized_sent = [tokenize_sentence(text) for text in sentences]

# Tag each document with its position in the corpus.
tagged_data = [
    TaggedDocument(words, [position])
    for position, words in enumerate(tokenized_sent)
]

# Hyper-parameters gathered in one place, then unpacked into the constructor.
doc2vec_params = dict(vector_size=20, window=2, min_count=1, epochs=100)
model = Doc2Vec(tagged_data, **doc2vec_params)

In [None]:
# Tokenize and embed each description without parking scratch columns on `df`.
description_vectors = df.description.apply(tokenize_sentence).apply(vectorize)

# Build the feature matrix with a single array construction; `.apply(pd.Series)`
# creates one Series per row and is far slower. `broadcast_to` also expands a
# scalar result (e.g. the int 0 that `sum([])` yields for an empty token list)
# to a full-width row so every row has the same shape.
embedding_width = model.wv.vector_size
description_matrix = np.array(
    [np.broadcast_to(np.asarray(vec), (embedding_width,)) for vec in description_vectors]
).reshape(-1, embedding_width)

# Column names follow the model's actual vector width instead of a hard-coded 20.
_t_df = pd.DataFrame(
    description_matrix,
    index=df.index,
    columns=[f"desc_{i}" for i in range(embedding_width)],
)

df = df.join(_t_df)
df = df.drop(columns=['description'])

In [199]:
# Number of rows that contain no missing value in any column.
len(df.dropna())

45834

In [201]:
# `index=True` is the `to_csv` default, spelled out here because it writes the
# row index into the file as an extra, unnamed leading column.
df.to_csv("train_clean.csv", index=True)

Reference (using `infer_vector` with gensim Doc2Vec): https://stackoverflow.com/questions/44993240/how-to-use-the-infer-vector-in-gensim-doc2vec