In [16]:
from datetime import datetime
import numpy as np
import pandas as pd

import string
import nltk
from nltk.tokenize import word_tokenize

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

In [65]:
from sklearn.decomposition import PCA

In [10]:
def tokenize_sentence(s):
    """Lower-case ``s``, strip ASCII punctuation and split it into tokens.

    Every character listed in ``string.punctuation`` is removed from the
    lower-cased text before it is handed to ``word_tokenize``; each token
    that comes back is lower-cased once more.

    Args:
        s: The sentence to tokenize (a ``str``).

    Returns:
        A list of lower-case token strings.
    """
    # Translation table that deletes every punctuation character.
    strip_punct = str.maketrans("", "", string.punctuation)
    cleaned = s.lower().translate(strip_punct)

    tokens = []
    for tok in word_tokenize(cleaned):
        tokens.append(tok.lower())
    return tokens

def vectorize(model, ts):
    """Sum the word vectors of the tokens in ``ts``.

    Args:
        model: Object exposing ``model.wv.get_vector(token)`` and
            ``model.wv.vector_size`` (e.g. a trained gensim ``Doc2Vec``).
        ts: Iterable of token strings.

    Returns:
        A 1-D ``numpy`` array holding the element-wise sum of the token
        vectors. For an empty ``ts`` a zero vector of length
        ``model.wv.vector_size`` is returned, so callers always receive an
        array of the same shape (the plain ``sum([])`` would yield the
        integer ``0``).

    Raises:
        Whatever ``model.wv.get_vector`` raises for a token it cannot look
        up; the error is propagated unchanged so callers can fall back to
        another strategy.
    """
    vectors = [model.wv.get_vector(tok) for tok in ts]
    if not vectors:
        # Nothing to add up: keep the return type/shape stable.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.sum(vectors, axis=0)

In [23]:
# Columns that preprocess() discards right after lower-casing the headers.
TO_DROP = [
    # address components
    'number', 'street', 'side', 'city', 'county', 'state', 'zipcode', 'country',
    # remaining identifiers / timestamps
    'timezone', 'airport_code', 'weather_timestamp',
]

# Columns that preprocess() converts to a compact integer dtype.
TO_CAST = [
    # road-feature flags
    "amenity", "bump", "crossing", "give_way", "junction", "no_exit",
    "railway", "roundabout", "station", "stop", "traffic_calming",
    "traffic_signal", "turning_loop",
    # day/night indicators (encoded through SS_TW first)
    "sunrise_sunset", "civil_twilight", "nautical_twilight",
    "astronomical_twilight",
]

# Integer encoding for the day/night labels; preprocess() substitutes -1 for
# any label that is not a key here.
SS_TW = dict(Day=1, Night=0)

In [97]:
# Read the training CSV, discard its 'Unnamed: 0.1' / 'Unnamed: 0' columns and
# keep an independent copy of the loaded frame in df_cpy.
df = pd.read_csv("train.csv")
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
df_cpy = df.copy()

def preprocess(df, mode='train'):
    """Turn the raw accident frame into a numeric feature frame.

    Steps, in order:

    * lower-case the column names and drop the columns listed in ``TO_DROP``;
    * parse ``start_time`` / ``end_time`` and derive ``*_month``, ``*_day``
      and ``*_hour`` columns (``np.uint8``) from each;
    * encode the four day/night columns through ``SS_TW`` (labels that are
      missing or not in ``SS_TW`` become ``-1``) and store them as
      ``np.int8``; cast the remaining ``TO_CAST`` columns to ``np.uint8``;
    * one-hot encode ``weather_condition`` and ``wind_direction`` and project
      each onto 10 PCA components, written row-by-row into ``weather_0..9``
      and ``wind_0..9``;
    * drop the raw time, weather, wind and ``description`` columns and rename
      the unit-suffixed columns (e.g. ``distance(mi)`` -> ``distance_mi``).

    The frame passed in is left untouched; a new frame is returned.

    Args:
        df: Raw DataFrame as loaded from the CSV.
        mode: Only ``'train'`` is supported. ``'test'`` would need the PCA
            projections fitted on the training data, which this function does
            not keep, so it is rejected explicitly.

    Returns:
        A new DataFrame containing the engineered feature columns.

    Raises:
        NotImplementedError: If ``mode == 'test'``.
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    # Fail early and clearly instead of crashing later on an unbound PCA
    # result after part of the work has already been done.
    if mode == 'test':
        raise NotImplementedError(
            "preprocess(mode='test') is not implemented: the PCA projections "
            "fitted in 'train' mode are not kept for reuse."
        )
    if mode != 'train':
        raise ValueError(f"unknown mode {mode!r}; expected 'train' or 'test'")

    # rename() returns a new frame, so the caller's column labels stay as-is.
    df = df.rename(columns=str.lower).drop(columns=TO_DROP)

    # Calendar features, vectorised through the .dt accessor.
    for prefix in ('start', 'end'):
        stamps = pd.to_datetime(df[f'{prefix}_time'])
        df[f'{prefix}_month'] = stamps.dt.month.astype(np.uint8)
        df[f'{prefix}_day'] = stamps.dt.day.astype(np.uint8)
        df[f'{prefix}_hour'] = stamps.dt.hour.astype(np.uint8)

    twilight_cols = [
        'sunrise_sunset',
        'civil_twilight',
        'nautical_twilight',
        'astronomical_twilight',
    ]
    for col in twilight_cols:
        # Signed dtype on purpose: casting the -1 sentinel to uint8 would
        # silently wrap it around to 255.
        df[col] = df[col].map(SS_TW).fillna(-1).astype(np.int8)

    for col in TO_CAST:
        if col not in twilight_cols:
            df[col] = df[col].astype(np.uint8)

    # PCA over the one-hot encodings. fit_transform returns one row per input
    # row and one column per component, so each component column is assigned
    # as a whole; assigning row scalars would leave every row with the values
    # of the last row.
    for out_prefix, source_col in (
        ('weather', 'weather_condition'),
        ('wind', 'wind_direction'),
    ):
        dummies = pd.get_dummies(df[source_col], prefix=out_prefix)
        components = PCA(n_components=10).fit_transform(dummies)
        for j in range(components.shape[1]):
            df[f'{out_prefix}_{j}'] = components[:, j]

    df = df.drop(
        columns=[
            'start_time',
            'end_time',
            'weather_condition',
            'wind_direction',
            'description',
        ]
    )

    return df.rename(
        columns={
            "distance(mi)": "distance_mi",
            "temperature(f)": "temperature_f",
            "wind_chill(f)": "wind_chill_f",
            "humidity(%)": "humidity_perc",
            "pressure(in)": "pressure_in",
            "visibility(mi)": "visibility_mi",
            "wind_speed(mph)": "wind_speed_mph",
            "precipitation(in)": "precipitation_in",
        }
    )
    

In [84]:
# Reload the raw training data, discard its 'Unnamed: 0.1' / 'Unnamed: 0'
# columns, then replace df with the output of preprocess().
df = pd.read_csv("train.csv")
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
df = preprocess(df)

In [99]:
# Display one randomly chosen row of df as a quick look at the current columns.
df.sample()

Unnamed: 0,severity,start_lat,start_lng,end_lat,end_lng,distance_mi,temperature_f,wind_chill_f,humidity_perc,pressure_in,...,wind_0,wind_1,wind_2,wind_3,wind_4,wind_5,wind_6,wind_7,wind_8,wind_9
17501,0,38.527977,-120.442157,38.527977,-120.442157,0.0,45.0,45.0,100.0,27.13,...,-0.177995,-0.062717,-0.072588,-0.013068,-0.069591,-0.070359,-0.023696,0.069226,0.210654,-0.131019


# Split the frame into a train and a test partition, stratified on the
# severity label. Unpacking the DataFrame directly (`= df`) iterates over its
# column labels and therefore can never produce two frames.
df_train, df_test = train_test_split(df, stratify=df['severity'])

In [82]:

    # Tokenise every description into a list of lower-case words.
    df['description_tokenized'] = df['description'].apply(tokenize_sentence)

    # Separate the label from the features and make a stratified split.
    y = df['severity']
    X = df.drop(columns='severity')
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

    # Re-attach the label so each partition is a single frame again.
    df_train, df_test = X_train.join(y_train), X_test.join(y_test)

    # Train Doc2Vec on the training descriptions only; each document is
    # tagged with its position in the training partition.
    tokenized_sent = df_train['description_tokenized']
    tagged_data = [
        TaggedDocument(tokens, [position])
        for position, tokens in enumerate(tokenized_sent)
    ]
    model = Doc2Vec(
        tagged_data,
        vector_size=20,
        window=2,
        min_count=1,
        epochs=100,
    )

    def try_vectorize(model, w):
        """Vectorize the token list ``w``, with a fallback for unknown tokens.

        First tries ``vectorize(model, w)``. If that raises ``KeyError``
        (presumably because a token is not in the model's vocabulary), the
        vector is inferred for the whole token list with
        ``model.infer_vector`` instead.

        Args:
            model: Trained model exposing ``wv`` and ``infer_vector``.
            w: List of token strings.

        Returns:
            The vector produced by ``vectorize`` or, on fallback, by
            ``model.infer_vector``.
        """
        try:
            return vectorize(model, w)
        except KeyError:
            # Infer from the tokens that were passed in (`w`); the fallback
            # used to reference a name that does not exist in this scope.
            return model.infer_vector(doc_words=w, alpha=0.025)

    # model.infer_vector(doc_words=tokens_list, steps=20, alpha=0.025)

    # Training rows go straight through vectorize(); test rows go through
    # try_vectorize(), which has a fallback path.
    df_train['description_vectorized'] = df_train['description_tokenized'].apply(
        lambda tokens: vectorize(model, tokens)
    )
    df_test['description_vectorized'] = df_test['description_tokenized'].apply(
        lambda tokens: try_vectorize(model, tokens)
    )

    # Expand each vector into one column per component, named desc_0..desc_19.
    # NOTE(review): within the visible code these two frames are not merged
    # back into df_train / df_test -- confirm a later cell does so.
    _tt_df = df_train['description_vectorized'].apply(pd.Series).rename(
        columns={i: f"desc_{i}" for i in range(20)}
    )
    _ts_df = df_test['description_vectorized'].apply(pd.Series).rename(
        columns={i: f"desc_{i}" for i in range(20)}
    )

    # The text-derived helper columns are no longer needed on the partitions.
    df_train = df_train.drop(
        columns=['description_tokenized', 'description_vectorized', 'description']
    )
    df_test = df_test.drop(
        columns=['description_tokenized', 'description_vectorized', 'description']
    )
    