In [1]:
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import csv
import pandas as pd
import numpy as np

In [2]:
def to_week_num(s):
    """Translate a three-letter weekday abbreviation into a number.

    Surrounding whitespace and letter case are ignored. "sun" maps to 1,
    "mon" to 2, and so on up to "sat" mapping to 7.

    Args:
        s: Weekday text such as "Mon" or " sat ".

    Returns:
        The weekday number (1-7), or 0 when the text is not one of the
        seven recognised abbreviations.
    """
    weekday_numbers = {
        "sun": 1,
        "mon": 2,
        "tue": 3,
        "wed": 4,
        "thu": 5,
        "fri": 6,
        "sat": 7,
    }
    normalised = s.strip().lower()
    return weekday_numbers.get(normalised, 0)

In [3]:
def create_policing_area_dic(train_data_file):
    """Build a lookup from policing-area name to an integer code.

    Only the ``Policing_Area`` column of the CSV is read. Each name is
    stripped of surrounding whitespace and lower-cased, then given the next
    unused code (1, 2, 3, ...) in order of first appearance. Cells that are
    missing in the CSV are skipped.

    Args:
        train_data_file: Path (or file-like object) of the training CSV; it
            must contain a ``Policing_Area`` column holding text values.

    Returns:
        dict mapping the normalised area name to its integer code.
    """
    # usecols avoids parsing (and converting) every other column just to
    # look at one of them.
    areas = pd.read_csv(train_data_file, usecols=["Policing_Area"])["Policing_Area"]

    policing_area_dic = {}
    for raw_name in areas.dropna():
        name = raw_name.strip().lower()
        # setdefault leaves an existing code untouched; a new name gets the
        # next code because the size is taken before the insertion.
        policing_area_dic.setdefault(name, len(policing_area_dic) + 1)

    return policing_area_dic
        

# Build the policing-area lookup from the training file. It is kept as a
# module-level dict because to_policing_area_num reads `pa_dic` by name
# when it converts CSV cells.
pa_dic = create_policing_area_dic('Accident_train.csv')
print(pa_dic)

{'crea': 1, 'belc': 2, 'lisb': 3, 'midu': 4, 'foyl': 5, 'antn': 6, 'arbc': 7, 'nabb': 8, 'ccgl': 9, 'antr': 10, 'lisc': 11, 'arnd': 12, 'ndow': 13, 'fero': 14, 'dcst': 15, 'dast': 16, 'bmen': 17, 'ferm': 18, 'beln': 19, 'arma': 20, 'cole': 21, 'moyl': 22, 'bele': 23, 'belw': 24, 'crai': 25, 'cook': 26, 'mean': 27, 'bels': 28, 'strb': 29, 'cfer': 30, 'omag': 31, 'nemd': 32, 'ards': 33, 'lima': 34, 'down': 35, 'banb': 36, 'newm': 37, 'bmon': 38, 'mfel': 39, 'larn': 40}


In [4]:
def to_policing_area_num(s):
    """Convert a policing-area name into its integer code from ``pa_dic``.

    The name is stripped of surrounding whitespace and lower-cased before
    the lookup in the module-level ``pa_dic`` dictionary.

    Args:
        s: Policing-area text as read from the CSV.

    Returns:
        The code stored in ``pa_dic`` for a known area. For an unknown area,
        one more than the largest code in ``pa_dic`` (41 for the 40-entry
        dictionary built from the training file).
    """
    area = s.strip().lower()
    # Derive the "unknown" code from the dictionary instead of hard-coding
    # 41, so it can never collide with a real area code if the training
    # data ever contains more than 40 areas.
    unknown_code = max(pa_dic.values(), default=0) + 1
    return pa_dic.get(area, unknown_code)
    

In [5]:
def load_train_data(train_data_file):
    """Read the training CSV and split it into features and target.

    While reading, the ``Weekday_of_Collision`` column is converted with
    ``to_week_num`` and the ``Policing_Area`` column with
    ``to_policing_area_num``. Missing values in the resulting frame are
    replaced with 0.

    Args:
        train_data_file: Path (or file-like object) of the training CSV.

    Returns:
        A ``(X_train, Y_train)`` pair: the sixteen feature columns as a
        DataFrame, and the ``Collision_Severity`` column as a Series.
    """
    column_converters = {
        "Weekday_of_Collision": to_week_num,
        "Policing_Area": to_policing_area_num,
    }
    feature_columns = [
        'Collision_Ref_No',
        'Policing_Area',
        'Weekday_of_Collision',
        'Day_of_Collision',
        'Month_of_Collision',
        'Hour_of_Collision',
        'Carriageway_Type',
        'Speed_Limit',
        'Junction_Detail',
        'Junction_Control',
        'Ped_Crossing_HC',
        'Ped_Crossing_PC',
        'Light_Conditions',
        'Weather_Conditions',
        'Road_Surface_Conditions',
        'Special_Conditions_at_Site',
    ]

    frame = pd.read_csv(train_data_file, converters=column_converters).fillna(0)

    targets = frame['Collision_Severity']
    features = frame[feature_columns]
    return features, targets

In [6]:
def train_the_model(X_train, Y_train, algo = 'DT', random_state = 0):
    """Fit one of the supported classifiers on the training data.

    Args:
        X_train: Feature matrix passed to the estimator's ``fit``.
        Y_train: Target values passed to the estimator's ``fit``.
        algo: Which classifier to build:
            'DT'  - ``tree.DecisionTreeClassifier`` with ``max_depth=3``
            'GBM' - ``GradientBoostingClassifier``
            'ADA' - ``AdaBoostClassifier`` with ``n_estimators=100``
            'RF'  - ``RandomForestClassifier``
        random_state: Seed handed to the chosen estimator so repeated runs
            give the same model. Defaults to 0.

    Returns:
        The fitted estimator (the value returned by ``fit``).

    Raises:
        ValueError: If ``algo`` is not one of the supported keys.
    """
    if algo == 'DT':
        clf = tree.DecisionTreeClassifier(max_depth=3, random_state=random_state)
    elif algo == 'GBM':
        clf = GradientBoostingClassifier(random_state=random_state)
    elif algo == 'ADA':
        clf = AdaBoostClassifier(n_estimators=100, random_state=random_state)
    elif algo == 'RF':
        clf = RandomForestClassifier(random_state=random_state)
    else:
        # Without this branch an unknown key left `clf` unbound and the
        # call failed later with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown algo %r; expected one of 'DT', 'GBM', 'ADA', 'RF'" % (algo,)
        )

    return clf.fit(X_train, Y_train)

In [7]:
def load_test_data(test_data_file):
    """Read the test CSV and return its feature columns.

    While reading, the ``Weekday_of_Collision`` column is converted with
    ``to_week_num`` and the ``Policing_Area`` column with
    ``to_policing_area_num``. Missing values in the resulting frame are
    replaced with 0.

    Args:
        test_data_file: Path (or file-like object) of the test CSV.

    Returns:
        DataFrame holding the sixteen feature columns, in the order listed
        below.
    """
    column_converters = {
        "Weekday_of_Collision": to_week_num,
        "Policing_Area": to_policing_area_num,
    }
    feature_columns = [
        'Collision_Ref_No',
        'Policing_Area',
        'Weekday_of_Collision',
        'Day_of_Collision',
        'Month_of_Collision',
        'Hour_of_Collision',
        'Carriageway_Type',
        'Speed_Limit',
        'Junction_Detail',
        'Junction_Control',
        'Ped_Crossing_HC',
        'Ped_Crossing_PC',
        'Light_Conditions',
        'Weather_Conditions',
        'Road_Surface_Conditions',
        'Special_Conditions_at_Site',
    ]

    frame = pd.read_csv(test_data_file, converters=column_converters).fillna(0)
    return frame[feature_columns]


In [8]:
def predict_model(X_test, output_file, model=None):
    """Predict on ``X_test`` and write the results to a CSV file.

    The file gets a header row ``S.No.,Collision_Severity`` followed by one
    row per prediction, numbered from 1.

    Args:
        X_test: Feature matrix passed to the model's ``predict``.
        output_file: Path of the CSV file to create (overwritten if present).
        model: Fitted estimator exposing ``predict``. When omitted, the
            module-level ``clf`` is used, which keeps existing two-argument
            calls working.
    """
    if model is None:
        # Backward-compatible fallback to the notebook's global classifier.
        model = clf

    predicted_val = model.predict(X_test)

    # The with-block closes the file, so no explicit close() is needed.
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['S.No.', 'Collision_Severity'])
        writer.writerows(enumerate(predicted_val, start=1))

In [9]:
# Load the training features/target and the test features from the CSV
# files expected in the working directory.
X_train, Y_train = load_train_data('Accident_train.csv')
X_test = load_test_data('Accident_test.csv')

In [10]:
# Fit each classifier in turn and write its test-set predictions to
# using_<algo>_prediction.csv. predict_model reads the module-level `clf`,
# so the fitted model must be bound to that exact name before the call.
for algo in ['DT', 'GBM', 'ADA', 'RF']:
    clf = train_the_model(X_train, Y_train, algo)
    out_file = 'using_' + algo + '_prediction.csv' 
    predict_model(X_test,out_file)