In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [2]:
# Integer encoding shared by the four day/night indicator columns
SS_TT = {
    "Night": 0,
    "Day": 1,
}

# Boolean point-of-interest flags that are cast to integers
_FLAG_COLS = [
    "amenity", "bump", "crossing", "give_way", "junction", "no_exit", "railway",
    "roundabout", "station", "stop", "traffic_calming", "traffic_signal", "turning_loop",
]

# Columns removed outright (raw timestamps, free text, location identifiers)
_DROPPED_COLS = [
    "start_time", "end_time", "weather_timestamp",
    "description",
    "number", "street", "side", "city", "county", "state", "zipcode",
    "country", "timezone", "airport_code",
]

# Unit-suffixed names -> identifier-friendly names
_RENAMED_COLS = {
    "distance(mi)": "distance_mi",
    "temperature(f)": "temperature_f",
    "wind_chill(f)": "wind_chill_f",
    "humidity(%)": "humidity_perc",
    "pressure(in)": "pressure_in",
    "visibility(mi)": "visibility_mi",
    "wind_speed(mph)": "wind_speed_mph",
    "precipitation(in)": "precipitation_in",
}

_DAY_NIGHT_COLS = ["sunrise_sunset", "civil_twilight", "nautical_twilight", "astronomical_twilight"]


def _reduce_block(block, pca, n_components, prefix, index):
    """Project one one-hot block with PCA and return ``(labelled frame, pca)``.

    A new ``PCA(n_components=n_components)`` is fitted when ``pca`` is None;
    otherwise the supplied object is only used to transform ``block``.
    """
    if pca is None:
        pca = PCA(n_components=n_components)
        reduced = pca.fit_transform(block)
    else:
        reduced = pca.transform(block)
    reduced = np.asarray(reduced)
    names = [f"{prefix}_{i}" for i in range(reduced.shape[1])]
    # Reuse the caller's row labels so the later index join lines up row by row.
    return pd.DataFrame(reduced, index=index, columns=names), pca


def preprocess(df, weather_cols, wind_cols, pca_weather=None, pca_wind=None,
               n_weather_components=20, n_wind_components=8):
    """Turn a raw accidents frame into a numeric feature frame.

    The input frame is left untouched; all work happens on a copy.

    Steps: lower-case the column names, derive day/hour/duration features from
    ``start_time``/``end_time``, drop the columns listed in ``_DROPPED_COLS``,
    rename unit-suffixed columns, cast the flag columns to int, replace the
    one-hot weather/wind blocks by PCA components named ``weather_<i>`` /
    ``wind_<i>``, and encode the day/night columns through ``SS_TT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame that already contains the one-hot columns named in
        ``weather_cols`` and ``wind_cols`` (original casing).
    weather_cols, wind_cols : sequence of str
        Names of the one-hot columns to compress.
    pca_weather, pca_wind : object with ``transform``, optional
        Already fitted reducers. Each one that is None is fitted on this frame,
        independently of the other.
    n_weather_components, n_wind_components : int
        Component counts used only when the matching reducer has to be fitted.

    Returns
    -------
    (pandas.DataFrame, pca_weather, pca_wind)
        The processed frame (same index as ``df``) and the reducers used.

    Raises
    ------
    KeyError
        If an expected column is missing, or a day/night column holds a value
        that is not a key of ``SS_TT``.
    """
    # Work on a copy: the caller keeps its column names, dtypes and columns.
    df = df.copy()

    # Pull the one-hot blocks out before the column names are lower-cased.
    weather = df[list(weather_cols)]
    wind = df[list(wind_cols)]

    df.columns = [c.lower() for c in df.columns]

    # Fix datetimes
    df["start_time"] = pd.to_datetime(df["start_time"])
    df["end_time"] = pd.to_datetime(df["end_time"])

    df["start_day"] = df["start_time"].dt.day
    df["start_hour"] = df["start_time"].dt.hour
    df["end_day"] = df["end_time"].dt.day
    df["end_hour"] = df["end_time"].dt.hour
    # total_seconds() keeps whole days; Timedelta.seconds would wrap at 24 h.
    df["time_diff"] = (df["end_time"] - df["start_time"]).dt.total_seconds()

    df = df.drop(columns=_DROPPED_COLS)
    df = df.rename(columns=_RENAMED_COLS)

    # Convert to int bool columns
    for c in _FLAG_COLS:
        df[c] = df[c].astype(int)

    # Compress the one-hot blocks; each reducer is fitted only if it was not supplied.
    weather_new, pca_weather = _reduce_block(
        weather, pca_weather, n_weather_components, "weather", df.index
    )
    wind_new, pca_wind = _reduce_block(
        wind, pca_wind, n_wind_components, "wind", df.index
    )

    df = df.join(weather_new)
    df = df.join(wind_new)

    # The categorical sources and their one-hot expansions are now redundant.
    df = df.drop(columns=["weather_condition", "wind_direction"])
    df = df.drop(columns=[c.lower() for c in [*weather_cols, *wind_cols]])

    # Other columns
    for c in _DAY_NIGHT_COLS:
        df[c] = df[c].apply(lambda x: SS_TT[x])

    return df, pca_weather, pca_wind

In [None]:
# Read the raw training data and discard the two "Unnamed" columns
df = (
    pd.read_csv("../train.csv")
    .drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
)

In [None]:
# One-hot encode the two categorical weather fields ...
weather = pd.get_dummies(df["Weather_Condition"], prefix="weather")
wind = pd.get_dummies(df["Wind_Direction"], prefix="wind")

# ... and append the indicator columns to the main frame
df = df.join(weather).join(wind)

In [15]:
# Preprocess the whole training frame; no PCA is passed in, so both are fitted here
df_train, pca_w, pca_v = preprocess(
    df,
    weather_cols=list(weather.columns),
    wind_cols=list(wind.columns),
)
# df_val, _, _ = preprocess(df_val, list(weather.columns), list(wind.columns), pca_weather=pca_w, pca_wind=pca_v)

In [51]:
# Number of columns left after preprocessing
len(df_train.columns)

63

In [16]:
# Persist the preprocessed training frame.
# index=False is not passed, so the row index is written as an extra leading column.
df_train.to_csv("train_.csv")

---

In [None]:
# Read the raw test data and discard the two "Unnamed" columns
df_t = (
    pd.read_csv("../test4alumnxs.csv")
    .drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
)

In [None]:
# One-hot encode the same two categorical fields on the test frame
weather_t = pd.get_dummies(df_t["Weather_Condition"], prefix="weather")
wind_t = pd.get_dummies(df_t["Wind_Direction"], prefix="wind")

In [76]:
# Append the test-set indicator columns to the test frame
df_t = df_t.join(weather_t).join(wind_t)

In [77]:
# Columns present in exactly one of the two frames, ignoring the target
(set(df_t.columns) ^ set(df.columns)) - {"Severity"}

{'weather_Blowing Dust',
 'weather_Blowing Dust / Windy',
 'weather_Blowing Snow',
 'weather_Heavy Drizzle',
 'weather_Heavy Snow / Windy',
 'weather_Light Blowing Snow',
 'weather_Light Freezing Fog',
 'weather_Light Freezing Rain / Windy',
 'weather_Light Rain Shower',
 'weather_Light Snow Shower',
 'weather_Sand / Dust Whirlwinds',
 'weather_Snow / Windy',
 'weather_Squalls / Windy',
 'weather_T-Storm / Windy'}

In [78]:
# Set every column that is not shared by df_t and df (target excluded) to 0 in df_t.
# This creates the columns df_t lacks, and also overwrites columns that exist only in df_t.
for col in (set(df_t.columns) ^ set(df.columns)) - {"Severity"}:
    df_t[col] = 0

In [79]:
# Preprocess the test frame with the training column lists and the already fitted PCAs
df_t_p, _, _ = preprocess(
    df_t,
    weather_cols=list(weather.columns),
    wind_cols=list(wind.columns),
    pca_weather=pca_w,
    pca_wind=pca_v,
)

In [86]:
# Remove three weather indicator columns from the processed test frame
df_t_p_p = df_t_p.drop(
    columns=[
        "weather_blowing dust",
        "weather_light snow shower",
        "weather_t-storm / windy",
    ]
)

In [87]:
# Write the cleaned test set without the index column
df_t_p_p.to_csv("test4alumnos_clean.csv", index=False)

# Write the train / validation frames without the index column.
# NOTE(review): df_val is not assigned by any active statement in the visible cells,
# so the second call raises NameError on a fresh kernel -- confirm where df_val comes from.
df_train.to_csv("train_rev.csv", index=False)
df_val.to_csv("val_rev.csv", index=False)

---

In [3]:
# Reload the raw training data from disk and discard the two "Unnamed" columns
df = (
    pd.read_csv("../train.csv")
    .drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
)

In [4]:
# One-hot encode the categorical weather fields and append them to the frame
weather = pd.get_dummies(df.Weather_Condition, prefix='weather')
wind = pd.get_dummies(df.Wind_Direction, prefix='wind')

df = df.join(weather)
df = df.join(wind)

# Separate features from the target
X = df.drop(columns="Severity")
y = df.Severity

# Stratified hold-out split. The seed is fixed so that the split, and every file
# written from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [5]:
# Re-attach the target to each split before preprocessing
df_train = X_train.join(y_train)
df_test = X_test.join(y_test)

# Training split: no PCA passed in, so both are fitted here
df_train_t, pca_w_t, pca_v_t = preprocess(
    df_train,
    weather_cols=list(weather.columns),
    wind_cols=list(wind.columns),
)
# Held-out split: transformed with the PCAs fitted on the training split
df_val_t, _, _ = preprocess(
    df_test,
    weather_cols=list(weather.columns),
    wind_cols=list(wind.columns),
    pca_weather=pca_w_t,
    pca_wind=pca_v_t,
)

In [6]:
# Persist both processed splits.
# index=False is not passed, so the row index is written as an extra leading column.
df_train_t.to_csv("new_data/train_t.csv")
df_val_t.to_csv("new_data/train_val.csv")

In [7]:
# Leakage check: the two splits must not share any row label (expect an empty Index).
# Compare index with index -- passing the DataFrame itself iterates over its column
# names, which can never match row labels and makes the check vacuous.
X_train.index.intersection(X_test.index)

Index([], dtype='object')

# NOTE(review): tokenize_sentence, TaggedDocument and Doc2Vec are not defined or imported
# in the visible cells, and this cell reads lower-cased columns (description, severity)
# -- confirm the imports and the state of `df` expected here.
df['description_tokenized'] = df.description.apply(tokenize_sentence)

# Separate features from the target
X = df.drop(columns='severity')
y = df.severity

# Stratified hold-out split with a fixed seed, so the rows the embedding is trained on
# are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

df_train = X_train.join(y_train)
df_test = X_test.join(y_test)

# The embedding model is trained on the training descriptions only
tokenized_sent = df_train.description_tokenized

# Each document is tagged with its position within the training split
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]

model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

def try_vectorize(model, w):
    """Vectorize the token list ``w``, inferring a vector when the lookup fails.

    First delegates to ``vectorize(model, w)`` (defined elsewhere in the notebook).
    If that raises, falls back to ``model.infer_vector`` on the same tokens.

    Parameters
    ----------
    model : object exposing ``infer_vector(doc_words=..., alpha=...)``
    w : tokens of one document

    Returns
    -------
    Whatever ``vectorize`` or ``model.infer_vector`` returns.
    """
    try:
        return vectorize(model, w)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt / SystemExit
        # still propagate. Infer from the tokens that were passed in.
        return model.infer_vector(doc_words=w, alpha=0.025)

# model.infer_vector(doc_words=tokens_list, steps=20, alpha=0.025)
# Training rows go through vectorize(); test rows go through try_vectorize()
df_train["description_vectorized"] = df_train["description_tokenized"].apply(
    lambda tokens: vectorize(model, tokens)
)
df_test["description_vectorized"] = df_test["description_tokenized"].apply(
    lambda tokens: try_vectorize(model, tokens)
)

# Expand each vector into one column per component, labelled desc_0 .. desc_19
_tt_df = (
    df_train["description_vectorized"]
    .apply(pd.Series)
    .rename(columns={i: f"desc_{i}" for i in range(20)})
)
_ts_df = (
    df_test["description_vectorized"]
    .apply(pd.Series)
    .rename(columns={i: f"desc_{i}" for i in range(20)})
)

# The text and its intermediate representations are dropped from both splits
df_train = df_train.drop(
    columns=["description_tokenized", "description_vectorized", "description"]
)
df_test = df_test.drop(
    columns=["description_tokenized", "description_vectorized", "description"]
)
