In [None]:
import pandas as pd
import numpy as np

# Local Authority (Highway) codes of the boroughs kept for the analysis
lahc = ['E09000013', 'E09000014', 'E09000015']

# Load the dataset, then parse the Date and Time text columns into datetimes
df = pd.read_csv('AccidentLondonBoroughs.csv')

for column, pattern in (('Date', '%d/%m/%Y'), ('Time', '%H:%M')):
    df[column] = pd.to_datetime(df[column], format=pattern)

In [None]:
# Span of the Date and Time columns; both are later turned into integer offsets.
# Each pair is stored as [latest, earliest].
dateMaxMin = [df['Date'].max(), df['Date'].min()]
timeMaxMin = [df['Time'].max(), df['Time'].min()]

dateDelta = dateMaxMin[0] - dateMaxMin[1]
timeDelta = timeMaxMin[0] - timeMaxMin[1]

numOfDays = dateDelta.days
numOfMins = timeDelta.total_seconds() / 60

print(
    'dateStart: ' + str(dateMaxMin[1].date()),
    'totalDays: ' + str(numOfDays),
    'timeStart: ' + str(timeMaxMin[1].time()),
    'totalMins: ' + str(numOfMins),
    sep='\n',
)

In [None]:
# Integer encodings of Date and Time: days elapsed since the earliest date and
# minutes (truncated to an int) elapsed since the earliest time in df.
dateVals = [(stamp - dateMaxMin[1]).days for stamp in df['Date']]
timeVals = [int((stamp - timeMaxMin[1]).total_seconds() / 60) for stamp in df['Time']]

df['DateVals'] = dateVals
df['TimeVals'] = timeVals

df.shape

In [None]:
# Keep only the rows whose Local_Authority_(Highway) code is listed in lahc.
# The per-code slices are concatenated once, in lahc order, so the resulting row
# order is the same as appending code by code. The column dtypes come straight
# from df, so no pre-typed empty frame has to be fed into pd.concat.
lacFrames = [df.loc[df['Local_Authority_(Highway)'] == lac] for lac in lahc]

# pd.concat rejects an empty list, so fall back to an empty slice of df.
data = pd.concat(lacFrames) if lacFrames else df.iloc[0:0].copy()


In [None]:
# Shape of the borough subset before any cleaning or outlier removal
data.shape

In [None]:
class ColumnAttri:
    """Profile of a single DataFrame column.

    Collects the number of missing and non-missing cells, the min/max of the
    column (numeric columns and the 'Date'/'Time' columns only) and the distinct
    non-missing values. A cell counts as missing when it equals -1 or when its
    string form is '-1'.
    """

    def __init__(self, name, ctype):
        """Store the column name and its dtype (kept as a string)."""
        self.name = name
        self.ctype = str(ctype)
        self.range = []     # [min, max] once at least one value was tracked
        self.values = []    # distinct non-missing values, in first-seen order
        self.empty = 0      # number of missing cells
        self.notEmpty = 0   # number of non-missing cells

    def processCol(self, data):
        """Scan data[self.name] and accumulate counts, range and distinct values.

        Parameters
        ----------
        data : DataFrame-like object indexable by column name.
        """
        # The range is tracked for numeric dtypes and, by name, for Date/Time.
        tracksRange = (
            self.ctype in ('int64', 'float64')
            or self.name in ('Date', 'Time')
        )

        for row in data[self.name]:
            if row == -1 or str(row) == '-1':
                self.empty += 1
                continue
            self.notEmpty += 1

            if tracksRange:
                if not self.range:
                    self.range = [row, row]
                else:
                    if row < self.range[0]:
                        self.range[0] = row
                    if row > self.range[1]:
                        self.range[1] = row

            if row not in self.values:
                self.values.append(row)

    def presentColAttri(self):
        """Return the profile as a dict.

        Keys: Name, Type, Empty, Full, Ratio, Min, Max, Unique. 'Ratio' is the
        percentage of missing cells among the cells seen by processCol (0.0 when
        nothing was scanned). 'Min'/'Max' are the string 'Nan' for non-numeric
        columns and for columns in which no value was tracked.
        """
        numerics = ['float64', 'int64', 'datetime64[ns]']

        if self.ctype not in numerics:
            self.range = ['Nan', 'Nan']

        # A numeric column made only of missing cells has no tracked range.
        if len(self.range) == 2:
            low, high = self.range
        else:
            low, high = 'Nan', 'Nan'

        # Use the number of scanned cells rather than a fixed row count.
        total = self.empty + self.notEmpty
        ratio = (self.empty / total) * 100 if total else 0.0

        attri = {
            'Name': self.name,
            'Type': self.ctype,
            'Empty': self.empty,
            'Full': self.notEmpty,
            'Ratio': ratio,
            'Min': low,
            'Max': high,
            'Unique': len(self.values),
        }
        return attri

    def getValues(self):
        """Return the list of distinct non-missing values collected so far."""
        return self.values

In [None]:
# Build one ColumnAttri profile per column of data
dataDict = dict(data.dtypes)
ColAttributes = []

for colName in dataDict:
    profile = ColumnAttri(colName, dataDict[colName])
    profile.processCol(data)
    ColAttributes.append(profile)
    

In [None]:
# Summary table with one row per profiled column.
# The rows are collected first and the frame is built once, because
# DataFrame.append is not available in pandas 2.x.
colDF = pd.DataFrame(
    [col.presentColAttri() for col in ColAttributes],
    columns=['Name', 'Type', 'Empty', 'Full', 'Ratio', 'Min', 'Max', 'Unique'],
)

print(colDF.shape)
colDF

In [None]:
# Drop the Police_Force attribute because all entries have the same value.
# Drop Junction_Control and 2nd_Road_Class because over 20% of their entries are missing.
sparseOrConstant = ['Police_Force', 'Junction_Control', '2nd_Road_Class']
data = data.drop(columns=sparseOrConstant)

In [None]:
# Shape after dropping Police_Force, Junction_Control and 2nd_Road_Class;
# outlier rows are still included at this point
data.shape

In [None]:
# Column roles used for outlier detection.
# excludeCol: columns left out of the outlier search.
excludeCol = {
    'Accident_Index',
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    'Longitude',
    'Latitude',
    'Local_Authority_(Highway)',
    'Local_Authority_(District)',
    'LSOA_of_Accident_Location',
    '1st_Road_Number',
    '2nd_Road_Number',
    'Time',
    'Date',
}
# numericCol: columns treated as numeric.
numericCol = {
    'Number_of_Casualties',
    'Number_of_Vehicles',
    'DateVals',
    'TimeVals',
}
# categorCol: every remaining column of data.
categorCol = set(data.columns) - (numericCol | excludeCol)

In [None]:
# Outlier detection for the numeric columns: a value is an outlier when it lies
# more than 3 standard deviations away from the column mean.
# Result columns: name, std, mean, lower, upper, numOfOutliers.
# Columns without any outlier are not listed.
outNumIndex = []  # unused by the cells visible here; kept so the name stays defined
outNumRecords = []

# numericCol is a set; iterate in sorted order so the table is reproducible.
for i in sorted(numericCol):
    std = data[i].std()
    mean = data[i].mean()
    lower = mean - 3 * std
    upper = mean + 3 * std

    # Vectorised count of the values outside [lower, upper]
    count = int(((data[i] < lower) | (data[i] > upper)).sum())
    if count == 0:
        continue

    outNumRecords.append(
        {'name': i, 'std': std, 'mean': mean, 'lower': lower, 'upper': upper, 'numOfOutliers': count}
    )

# Built once from the collected rows: DataFrame.append is not available in pandas 2.x.
outNumData = pd.DataFrame(
    outNumRecords,
    columns=['name', 'std', 'mean', 'lower', 'upper', 'numOfOutliers'],
)

outNumData

In [None]:
# Share of rows flagged as numeric outliers: averaged over the flagged columns
# (avgTPerc) and for the column with the fewest outliers (lwrTPerc).
numOfRows = data.shape[0]

outlierCounts = outNumData['numOfOutliers']
thrshldCalc = [outlierCounts.sum(), len(outlierCounts)]  # [total outliers, flagged columns]

if thrshldCalc[1] and numOfRows:
    thrshld = thrshldCalc[0] / thrshldCalc[1]
    avgTPerc = thrshld / numOfRows
    lwrTPerc = outlierCounts.min() / numOfRows
else:
    # No numeric outliers (or no rows): avoid dividing by zero. A threshold of
    # 0.0 means no categorical value can fall below it later on.
    thrshld = avgTPerc = lwrTPerc = 0.0

print(numOfRows, avgTPerc, lwrTPerc)

In [None]:
# Outlier detection for the categorical columns: a value is flagged when the
# share of rows holding it is below `percent`.
# Result columns: name, value, number (row count), rate (share in percent).
percent = lwrTPerc

outCatRecords = []

# categorCol is a set; iterate in sorted order so the table is reproducible.
for i in sorted(categorCol):
    for v, number in data[i].value_counts().items():
        # -1 marks a missing entry (same rule as ColumnAttri.processCol),
        # so it is never reported as a rare value.
        if v == -1 or str(v) == '-1':
            continue

        share = number / numOfRows
        if share < percent:
            outCatRecords.append({'name': i, 'value': v, 'number': number, 'rate': share * 100})

# Built once from the collected rows: DataFrame.append is not available in pandas 2.x.
outCatData = pd.DataFrame(
    outCatRecords,
    columns=['name', 'value', 'number', 'rate'],
).sort_values(by=['rate'])

outCatData

In [None]:
# Per-column description of the flagged categorical values.
# Result columns: name, flaggedVals (number of flagged values), totalVals
# (distinct values according to colDF), flaggedPerc (summed rate of the flagged
# values) and dominant (True when exactly one value was left unflagged).
outDescRecords = []
percDict = {}

for i in outCatData['name'].unique().tolist():
    rates = outCatData.loc[outCatData['name'] == i, 'rate']
    num = len(rates)
    percDict[i] = rates.sum()

    g = colDF.loc[colDF['Name'] == i, 'Unique']
    totalVals = g.iloc[0]

    domi = bool(totalVals - num == 1)

    outDescRecords.append(
        {'name': i, 'flaggedVals': num, 'totalVals': totalVals, 'flaggedPerc': percDict[i], 'dominant': domi}
    )

# Built once from the collected rows: DataFrame.append is not available in pandas 2.x.
outDescData = pd.DataFrame(
    outDescRecords,
    columns=['name', 'flaggedVals', 'totalVals', 'flaggedPerc', 'dominant'],
)

outDescData

In [None]:
# Categorical columns in which no value was flagged
unFlagCatCol = categorCol.difference(outDescData['name'])
unFlagCatCol

In [None]:
# Move the numeric outliers out of `data` and collect them in `outRecords`.
# The removed slices are gathered in a list and concatenated once, so the
# dtypes of `data` carry over without a hand-typed empty frame.
outFrames = []

for i in outNumData['name'].unique():
    bounds = outNumData.loc[outNumData['name'] == i].iloc[0]
    isOutlier = (data[i] > bounds['upper']) | (data[i] < bounds['lower'])

    # Earlier iterations may already have removed every outlier of this column.
    if isOutlier.any():
        outFrames.append(data.loc[isOutlier])
        # .copy() keeps `data` an independent frame for the in-place edits
        # performed on it further down the notebook.
        data = data.loc[~isOutlier].copy()

outRecords = pd.concat(outFrames) if outFrames else data.iloc[0:0].copy()

print(data.shape)
print(outRecords.shape)
        

In [None]:
# Move the rows holding a flagged categorical value out of `data` and add them
# to `outRecords`. Slices are collected per value (same order as before) and
# concatenated once instead of growing outRecords on every iteration.
flagged = [outRecords] if len(outRecords) else []

for i in outCatData['name'].unique():
    for v in outCatData.loc[outCatData['name'] == i, 'value'].tolist():
        matches = data[i] == v
        if matches.any():
            flagged.append(data.loc[matches])
            # .copy() keeps `data` an independent frame for later in-place edits.
            data = data.loc[~matches].copy()

if flagged:
    outRecords = pd.concat(flagged)

print(data.shape)
print(outRecords.shape)

In [None]:
# Profile the columns again now that the outlier rows have been removed
ColAttributes2 = []
dataDict = dict(data.dtypes)

for key, val in dataDict.items():
    currCol = ColumnAttri(key, val)
    currCol.processCol(data)
    ColAttributes2.append(currCol)

# Built once from the collected rows: DataFrame.append is not available in pandas 2.x.
colDF2 = pd.DataFrame(
    [col.presentColAttri() for col in ColAttributes2],
    columns=['Name', 'Type', 'Empty', 'Full', 'Ratio', 'Min', 'Max', 'Unique'],
)

print(colDF2.shape)
colDF2

In [None]:
# Dropping all columns that only have 1 unique value.
# The value in all of these columns was 0, signifying none; view the document for extra info.
for name in ('Carriageway_Hazards', 'Special_Conditions_at_Site', 'Pedestrian_Crossing-Human_Control'):
    print(data[name].unique())

# These columns no longer count as categorical columns
for name in ('Carriageway_Hazards', 'Pedestrian_Crossing-Human_Control', 'Special_Conditions_at_Site'):
    categorCol.remove(name)

In [None]:
# Remove the single-valued columns from the data itself
singleValued = ['Carriageway_Hazards', 'Special_Conditions_at_Site', 'Pedestrian_Crossing-Human_Control']
data = data.drop(columns=singleValued)
print(data.shape)

In [None]:
# Export the cleaned, outlier-free data, then list the column groups.
# NOTE(review): the target is an absolute, machine-specific path; the export
# only works where this directory exists.
exportPath = '/Users/othmanetazi/Desktop/DMA_ousework/data1.csv'
data.to_csv(exportPath, index=False)

for columnGroup in (numericCol, categorCol, excludeCol):
    print(columnGroup)

In [None]:
# Correlation between the categorical attributes: every column is turned into
# integer codes with pd.factorize, then Pearson correlation is computed.
# The frame reuses data's index so that columns taken from `data` later on line
# up row by row, and the columns are added in sorted order (categorCol is a set)
# so the result is reproducible.
factorDf = pd.DataFrame(
    {i: pd.factorize(data[i])[0] for i in sorted(categorCol)},
    index=data.index,
)

corrfCat = factorDf.corr(method='pearson')

In [None]:
# Correlation between all attributes: the numeric columns are added next to the
# factorized categorical ones, then Pearson correlation is computed.
# The values are assigned positionally (to_numpy): assigning the Series itself
# would align on the index, and factorDf's index need not match data's.
# factorDf was built from `data`, so both have the same number of rows.
for i in sorted(numericCol):
    factorDf[i] = data[i].to_numpy()

corrfAll = factorDf.corr(method='pearson')

In [None]:
# Correlation analysis on the categorical columns (raw values, not factorized).
# A set is not accepted as a .loc indexer by pandas 2.x, so the names are passed
# as a sorted list; only numeric/bool columns can take part in a Pearson correlation.
corrCat = (
    data.loc[:, sorted(categorCol)]
    .select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')
)

In [None]:
# Correlation analysis on the numeric columns.
# A set is not accepted as a .loc indexer by pandas 2.x, so the names are passed
# as a sorted list; only numeric/bool columns can take part in a Pearson correlation.
corrNum = (
    data.loc[:, sorted(numericCol)]
    .select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')
)

In [None]:
# Correlation analysis on all numeric and categorical columns together.
# A set is not accepted as a .loc indexer by pandas 2.x, so the names are passed
# as a sorted list; only numeric/bool columns can take part in a Pearson correlation.
numAndCatCol = set.union(numericCol, categorCol)
corrAll = (
    data.loc[:, sorted(numAndCatCol)]
    .select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')
)

In [None]:
# Correlation matrices to summarise, and the list that receives one summary
# frame per matrix (filled in the same order)
corrDfList = [corrNum, corrCat, corrfCat, corrAll, corrfAll]
corrDescList = []

def corrDesc(corrDf):
    """Summarise a correlation matrix row by row.

    For every row label the most and least correlated *other* column is
    reported. The column carrying the row's own label is skipped and NaN
    correlations are ignored. Ties go to the first column in matrix order.

    Parameters
    ----------
    corrDf : pandas.DataFrame
        Correlation matrix, e.g. the result of DataFrame.corr().

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'highName', 'highVal', 'lowName', 'lowVal', one row per
        row of corrDf. A row without any usable partner (single-column matrix,
        or only NaN correlations) gets None names and NaN values.
    """
    summaryColumns = ['name', 'highName', 'highVal', 'lowName', 'lowVal']
    records = []

    for i in corrDf.index.tolist():
        # Correlations of this row with every other column, NaN removed
        others = corrDf.loc[i, :].drop(labels=i, errors='ignore').dropna()

        if others.empty:
            records.append(
                {'name': i, 'highName': None, 'highVal': float('nan'),
                 'lowName': None, 'lowVal': float('nan')}
            )
            continue

        records.append(
            {'name': i,
             'highName': others.idxmax(), 'highVal': others.max(),
             'lowName': others.idxmin(), 'lowVal': others.min()}
        )

    # Built once from the collected rows: DataFrame.append is not available in pandas 2.x.
    return pd.DataFrame(records, columns=summaryColumns)


# Summarise every correlation matrix, in the order of corrDfList
corrDescList.extend(corrDesc(matrix) for matrix in corrDfList)

# Order each summary by its highest correlation and report its size
for summary in corrDescList:
    summary.sort_values(by=['highVal'], inplace=True)
    print(summary.shape)


In [None]:
# Summary built from corrNum (first entry of corrDfList)
print('Numeric')
corrDescList[0]

In [None]:
# Summary built from corrCat (second entry of corrDfList)
print('Categorical non-Factorized')
corrDescList[1]

In [None]:
# Summary built from corrfCat (third entry of corrDfList)
print('Categorical Factorized')
corrDescList[2]

In [None]:
# Summary built from corrAll (fourth entry of corrDfList)
print('All non-Factorized')
corrDescList[3]

In [None]:
# Summary built from corrfAll (fifth entry of corrDfList)
print('All Factorized')
corrDescList[4]

In [None]:
#Modeling Ideas:
    #descriptive:
    #predictive: time to severity, weather condition to surface conditions
    
    #see if we can observe daylight saving time based on light condition / DateVals

In [None]:
# Colour used for each Accident_Severity code in the scatter plots
valColor = {1:'blue', 2:'green', 3:'red'}

# One Longitude/Latitude scatter plot per frame in eList, coloured by severity.
# NOTE(review): eList is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. Presumably it is a list of
# DataFrames (one per borough?) - confirm and define it before this cell.
for e in eList:
    e.plot(kind='scatter', x='Longitude', y='Latitude', c=e['Accident_Severity'].map(valColor), figsize=(7, 7), alpha = 0.1)

In [None]:
# Histogram of the DateVals column of the cleaned data
data['DateVals'].hist()

In [None]:
# Histogram of the TimeVals column of the cleaned data
data['TimeVals'].hist()

In [None]:
# Semi-transparent TimeVals histogram for every frame in eList.
# NOTE(review): eList is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel - confirm where it comes from.
for e in eList:
    e['TimeVals'].hist(alpha= 0.5)

In [None]:
# Scratch cell: rebinds percDict to a dict with a zero for every flagged column
# name, then looks at the first flagged column name.
percDict = dict.fromkeys(outCatData['name'].unique(), 0)
# Bare expression in the middle of the cell: not the cell's last expression
percDict

test = outCatData['name'].unique()
test[0]

In [None]:
# Values of Junction_Detail that were flagged as rare
isJunctionDetail = outCatData['name'] == 'Junction_Detail'
jhd = outCatData.loc[isJunctionDetail, 'value'].tolist()

In [None]:
# For every flagged Junction_Detail value, print the shape of the rows of data
# that still hold it
for i in jhd:
    test = data.loc[data['Junction_Detail'] == i]
    print(test.shape)
    


In [None]:
# First entry of the Time value counts
timeCounts = data['Time'].value_counts()
timeCounts.index[0]