<a href="https://colab.research.google.com/github/noahruiz416/Tempe_Traffic_Classification/blob/main/Tempe_Traffic_Prototype_Feature_Engineering_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook contains the initial feature engineering approaches that will be used. Decisions to transform or change certain variables will be explained.

In [309]:
import pandas as pd
import numpy as np 

In [310]:
# Load the raw traffic dataset from a CSV file located relative to the working directory.
df = pd.read_csv("traffic_data.2.csv")

### Based on the exploratory analysis, I plan on combining certain features that are very similar. Additionally, I may exclude certain variables. More in-depth reasoning will be included in the final report.

In [311]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [341]:
#custom functions to transform data

#computes the midpoint of the two drivers' ages (for two values this equals the median)
def median_age(data):
  """Add a 'median_age' column holding the average of 'Age_Drv1' and 'Age_Drv2'.

  The frame is modified in place and the same object is returned.
  """
  first_driver_age = data['Age_Drv1']
  second_driver_age = data['Age_Drv2']
  data['median_age'] = (first_driver_age + second_driver_age) / 2
  return data

#row-level helper: flags whether either driver's drug-use field reads "Drugs"
def label_drugs(row):
  """Return 1 if 'DrugUse_Drv1' or 'DrugUse_Drv2' equals "Drugs", otherwise 0."""
  first_driver = row['DrugUse_Drv1']
  second_driver = row['DrugUse_Drv2']
  # If neither field matches, the accident is labelled as not drug related.
  return int(first_driver == "Drugs" or second_driver == "Drugs")

#runs label_drugs over every row and stores the result as a new column
def apply_drug_label(data):
  """Add a 'drugs_involved' column built by applying label_drugs to each row.

  The frame is modified in place and the same object is returned.
  """
  drug_flags = data.apply(label_drugs, axis=1)
  data['drugs_involved'] = drug_flags
  return data

#row-level helper: flags whether either driver's alcohol-use field reads "Alcohol"
def label_alchol(row):
  """Return 1 if alcohol was involved for either driver, otherwise 0.

  The previous version compared the alcohol columns against "Drugs" (copied
  from label_drugs), so alcohol involvement was never flagged.

  NOTE(review): assumes 'AlcoholUse_Drv1' / 'AlcoholUse_Drv2' mark impairment
  with the literal "Alcohol" -- confirm against the dataset's value counts.
  """
  first_driver = row['AlcoholUse_Drv1']
  second_driver = row['AlcoholUse_Drv2']
  if first_driver == "Alcohol" or second_driver == "Alcohol":
    return 1 #alcohol is involved
  return 0 #alcohol is not involved

#runs label_alchol over every row and stores the result as a new column
def apply_alchol_label(data):
  """Add an 'alcohol_involved' column built by applying label_alchol to each row.

  The frame is modified in place and the same object is returned.
  """
  alcohol_flags = data.apply(label_alchol, axis=1)
  data['alcohol_involved'] = alcohol_flags
  return data

#binary-encodes each accident as fatal (1) or nonfatal (0)
def encode_fatal_accidents(data):
  """Add a 'Fatal_Non_Fatal' column derived from 'Injuryseverity'.

  A row gets 1 when its severity equals "Fatal" and 0 for any other value.
  The frame is modified in place and the same object is returned.
  """
  data['Fatal_Non_Fatal'] = [
    1 if severity == "Fatal" else 0
    for severity in data['Injuryseverity']
  ]
  return data

#drops rows whose Collisionmanner value contains the unusable code "10"
def fix_collision_manner(data):
  """Return only the rows whose 'Collisionmanner' does not contain "10".

  The input frame is not modified; a filtered selection is returned.
  """
  has_bad_code = data['Collisionmanner'].str.contains("10")
  # Explicit comparison with False (rather than negation) keeps the original
  # treatment of entries for which the match result is missing.
  keep = has_bad_code == False
  return data[keep]

#drops rows whose Lightcondition value contains an unusable code
def fix_light_condition(data):
  """Return only the rows whose 'Lightcondition' contains neither "51" nor "Unknown 51".

  The filters are applied one after another; the input frame is not modified.
  """
  for bad_code in ("51", "Unknown 51"):
    has_bad_code = data['Lightcondition'].str.contains(bad_code)
    data = data[has_bad_code == False]
  return data

#drops rows where either driver's recorded age is 100 or more
def filter_age_outliers(data):
  """Remove rows whose 'Age_Drv1' or 'Age_Drv2' is >= 100.

  Rows are dropped in place by index label, and the same frame object is returned.
  """
  for age_column in ('Age_Drv1', 'Age_Drv2'):
    outlier_labels = data.index[data[age_column] >= 100]
    data.drop(outlier_labels, inplace=True)
  return data

#drops rows whose Violation1_Drv1 value contains an unusable code
def fix_violations(data):
  """Return only the rows whose 'Violation1_Drv1' contains none of "108", "109", "49".

  The filters are applied one after another; the input frame is not modified.
  """
  for bad_code in ("108", "109", "49"):
    has_bad_code = data['Violation1_Drv1'].str.contains(bad_code)
    data = data[has_bad_code == False]
  return data

#one hot encoding categorical features
def nominal_encoding(data):
  """Return a one-hot encoded frame for the nominal feature columns.

  Only 'StreetName', 'Collisionmanner', 'Lightcondition', 'Weather' and
  'SurfaceCondition' are selected and encoded; the input frame is not modified.

  The previous version rebound ``data`` to a nested list of column names
  instead of selecting those columns, so the frame itself was never encoded.
  """
  nominal_columns = ['StreetName', 'Collisionmanner', 'Lightcondition', 'Weather', 'SurfaceCondition']
  # Select the columns from the frame, then expand each category into indicator columns.
  data_dummies = pd.get_dummies(data[nominal_columns])
  return data_dummies

#final piece to tie pipeline together
def initial_input_vector(data):
  """Placeholder for the final pipeline step; not implemented yet.

  Currently ignores ``data`` and returns None.
  """
  return None