In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Raise pandas' display limits so long/wide frames and long cell text are shown in full.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 40),
    ('max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
Traffic = pd.read_csv('accidents_2012_to_2014.csv')
Traffic.tail()

In [None]:
Traffic.info

In [None]:
Traffic.describe()

In [None]:
Traffic.isnull().sum()

In [None]:
pd.DataFrame(Traffic.keys())

In [None]:
Traffic['Time'].unique() # 13 nan values

In [None]:
# Fill the missing times with the most frequent time.
# `Series.mode()` returns a Series of modal values labelled 0, 1, ...; the value is
# at `.iloc[0]`, whereas `.index[0]` is just the integer label 0.
# The result is assigned back instead of using `inplace=True` on a column selection,
# which can end up modifying a temporary copy.
Traffic['Time'] = Traffic['Time'].fillna(Traffic['Time'].mode().iloc[0])
Traffic['Time'].isnull().sum()

In [None]:
Traffic['Junction_Detail'].unique() # 464697

In [None]:
Traffic.drop('Junction_Detail', axis = 1, inplace = True)
pd.DataFrame(Traffic.keys())

In [None]:
Traffic['Junction_Control'].unique() # 178610

In [None]:
sns.catplot(x = 'Junction_Control', y = 'Number_of_Casualties', data = Traffic, color = True, palette = 'rainbow', height = 5, aspect = 2);

In [None]:
Traffic['Number_of_Casualties'].unique()

In [None]:
Traffic[Traffic['Number_of_Casualties'] == 54]['Junction_Control']

In [None]:
def impute_control(cols):
    """Return the Junction_Control value for one row, imputing it when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        One row of the frame. When it is a Series labelled with
        'Junction_Control' and 'Number_of_Casualties' the values are read by
        label, so the order in which the caller selected the columns does not
        matter. Anything else is read positionally as
        (Number_of_Casualties, Junction_Control).

    Returns
    -------
    The existing Junction_Control value when it is not null; otherwise a label
    chosen from the casualty count: below 20 -> 'Giveway or uncontrolled',
    20 to 59 -> 'Automatic traffic signal', 60 and above -> 'Stop Sign'.
    """
    labelled = isinstance(cols, pd.Series) and {
        'Junction_Control', 'Number_of_Casualties'
    } <= set(cols.index)

    if labelled:
        # Label lookup: reading row[0] / row[1] silently swaps the two values
        # whenever the caller lists 'Junction_Control' first.
        Junction_Control = cols['Junction_Control']
        Number_of_Casualties = cols['Number_of_Casualties']
    else:
        Number_of_Casualties = cols[0]
        Junction_Control = cols[1]

    if not pd.isnull(Junction_Control):
        return Junction_Control

    if Number_of_Casualties < 20:
        return 'Giveway or uncontrolled'
    if Number_of_Casualties < 60:
        return 'Automatic traffic signal'
    return 'Stop Sign'

In [None]:
# Apply impute_control row by row (axis=1). Each row reaches the function as a Series
# whose entries are ordered (Junction_Control, Number_of_Casualties), as selected here.
Traffic['Junction_Control'] = Traffic[['Junction_Control','Number_of_Casualties']].apply(impute_control,axis=1)

In [None]:
# Re-check the per-column missing-value counts after the imputation.
Traffic.isnull().sum()

In [None]:
Traffic['Road_Surface_Conditions'].unique() # 755

In [None]:
# Remove every row recorded on a 'Frost/Ice' surface (in place), then list what remains.
a = Traffic[Traffic['Road_Surface_Conditions']=='Frost/Ice']
Traffic.drop(a.index, inplace = True)
Traffic['Road_Surface_Conditions'].unique()

In [None]:
# Same for rows recorded on a 'Snow' surface.
b = Traffic[Traffic['Road_Surface_Conditions']=='Snow']
Traffic.drop(b.index, inplace = True)
Traffic['Road_Surface_Conditions'].unique()

In [None]:
# Number of rows whose road surface condition is still missing.
Traffic['Road_Surface_Conditions'].isnull().sum()

In [None]:
# Number of rows recorded on a flooded surface (non-null casualty counts).
Traffic[Traffic['Road_Surface_Conditions'] == 'Flood (Over 3cm of water)']['Number_of_Casualties'].count()

In [None]:
def impute_conditions(cols):
    """Return the Road_Surface_Conditions value for one row, imputing it when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        One row of the frame. When it is a Series labelled with
        'Road_Surface_Conditions' and 'Number_of_Casualties' the values are
        read by label, so the order in which the caller selected the columns
        does not matter. Anything else is read positionally as
        (Number_of_Casualties, Road_Surface_Conditions).

    Returns
    -------
    The existing Road_Surface_Conditions value when it is not null; otherwise
    a label chosen from the casualty count: below 20 -> 'Dry',
    20 to 59 -> 'Wet/Damp', 60 and above -> 'Flood (Over 3cm of water)'.
    """
    labelled = isinstance(cols, pd.Series) and {
        'Road_Surface_Conditions', 'Number_of_Casualties'
    } <= set(cols.index)

    if labelled:
        # Label lookup: reading row[0] / row[1] silently swaps the two values
        # whenever the caller lists 'Road_Surface_Conditions' first.
        Road_Surface_Conditions = cols['Road_Surface_Conditions']
        Number_of_Casualties = cols['Number_of_Casualties']
    else:
        Number_of_Casualties = cols[0]
        Road_Surface_Conditions = cols[1]

    if not pd.isnull(Road_Surface_Conditions):
        return Road_Surface_Conditions

    if Number_of_Casualties < 20:
        return 'Dry'
    if Number_of_Casualties < 60:
        return 'Wet/Damp'
    return 'Flood (Over 3cm of water)'

In [None]:
# Apply impute_conditions row by row (axis=1). Each row reaches the function as a Series
# whose entries are ordered (Road_Surface_Conditions, Number_of_Casualties), as selected here.
Traffic['Road_Surface_Conditions'] = Traffic[['Road_Surface_Conditions','Number_of_Casualties']].apply(impute_conditions,axis=1)

In [None]:
# Re-check the per-column missing-value counts after the imputation.
Traffic.isnull().sum()

In [None]:
# Distinct values of the remaining location / site columns, inspected one at a time.
Traffic['Local_Authority_(District)'].unique()

In [None]:
Traffic['Local_Authority_(Highway)'].unique()

In [None]:
Traffic['Special_Conditions_at_Site'].unique() #2

In [None]:
Traffic['Junction_Control'].unique()

In [None]:
Traffic['Special_Conditions_at_Site'].fillna(value=Traffic['Special_Conditions_at_Site'].mode().index[0], inplace = True)
Traffic['Special_Conditions_at_Site'].isnull().sum()

In [None]:
Traffic['Carriageway_Hazards'].unique() #3

In [None]:
Traffic['Carriageway_Hazards'].fillna(value=Traffic['Carriageway_Hazards'].mode().index[0], inplace = True)
Traffic['Carriageway_Hazards'].isnull().sum()

In [None]:
Traffic['Did_Police_Officer_Attend_Scene_of_Accident'].unique() #2

In [None]:
Traffic['Did_Police_Officer_Attend_Scene_of_Accident'].fillna(value=Traffic['Did_Police_Officer_Attend_Scene_of_Accident'].mode().index[0], inplace = True)
Traffic['Did_Police_Officer_Attend_Scene_of_Accident'].isnull().sum()

In [None]:
Traffic['LSOA_of_Accident_Location'].unique() # 28718

In [None]:
# Drop, in place, every row that still has a missing value in any column.
Traffic.dropna(inplace = True)

In [None]:
# Confirm that no missing values remain.
Traffic.isnull().sum()

In [None]:
Traffic.head()

In [None]:
# Rows x columns left after cleaning.
Traffic.shape

In [None]:
Traffic.tail(40)

In [None]:
Traffic['Police_Force'].unique()

In [None]:
fig1=sns.catplot(x='Number_of_Casualties',y = 'Number_of_Vehicles', hue = 'Accident_Severity', kind='point',aspect = 3, data=Traffic)
fig1

In [None]:
plt.plot(x= Traffic['Number_of_Casualties'],y= Traffic['Accident_Severity'],color='green', marker='d', linestyle='dotted',
...      linewidth=3, markersize=8) 
plt.xlabel('Number of Casualties')
plt.ylabel('Accident Severity')
plt.title('Number of Casualties Vs Accident Severity')
plt.show();

In [None]:
fig=sns.catplot(x='Number_of_Casualties', kind='count',aspect = 3, data=Traffic)
plt.ylim(1,)
fig

In [None]:
fig.savefig("Number of Casualties Count.png")

In [None]:
Pedestrians = sns.catplot(x='Pedestrian_Crossing-Physical_Facilities',hue='Accident_Severity',kind='count',data=Traffic, height=7, aspect=2.7)
Pedestrians

In [None]:
Pedestrians.savefig('Pedestrian Crossing Vs Accident Severity.png', dpi =85)

In [None]:
Traffic['Special_Conditions_at_Site'].unique()

In [None]:
Special_Conditions = sns.catplot(x='Special_Conditions_at_Site',hue='Accident_Severity',kind='count',data=Traffic, palette = 'rainbow',height=7, aspect=3.3)
Special_Conditions

In [None]:
Special_Conditions.savefig('Special_Conditions Vs Accident Severity.png', dpi =85)

In [None]:
# Hour of day = first two characters of the 'Time' string.
# errors='coerce' turns anything that is not a number into NaN so that the
# following dropna can remove it; without it to_numeric raises on such values.
Traffic['Hour'] = pd.to_numeric(Traffic['Time'].str[0:2], errors='coerce')
# .copy() makes the filtered frame independent, so the column assignment below
# writes to a real frame and not to a view of the unfiltered one.
Traffic = Traffic.dropna(subset=['Hour']).copy()
Traffic['Hour'] = Traffic['Hour'].astype('int')

In [None]:
def time_of_day(hour):
    """Map an hour of the day to its named time-of-day bucket.

    Buckets are half-open [start, end): 5-10 morning rush, 10-15 office hours,
    15-19 afternoon rush, 19-23 evening. Every other value falls into the
    night bucket.
    """
    bands = (
        (5, 10, 'morning rush (5am - 10am)'),
        (10, 15, 'office hours (10am - 3pm)'),
        (15, 19, 'afternoon rush (3pm - 7pm)'),
        (19, 23, 'evening (7pm - 11pm)'),
    )
    for start, end, label in bands:
        if hour >= start and hour < end:
            return label
    return 'night (11pm - 5am)'
# Label every accident with its time-of-day bucket and spot-check the mapping.
Traffic['Daytime'] = Traffic['Hour'].apply(time_of_day)
Traffic[['Time','Hour','Daytime']].head(10)

In [None]:
Traffic.head()

In [None]:
# 'Time' is dropped now that 'Hour' and 'Daytime' have been derived from it.
# NOTE(review): the bare `Traffic` below echoes the whole frame; `.head()` would be lighter.
Traffic.drop('Time', axis = 1, inplace = True)
Traffic

In [None]:
# On a GroupBy, head() returns the first rows of every group.
Time_of_day = Traffic.groupby('Daytime')
Time_of_day.head()

In [None]:
# Number of accidents in each time-of-day bucket.
Traffic.groupby('Daytime').size()

In [None]:
Traffic.groupby('Daytime').size().plot(kind = 'bar', color = 'purple', figsize = (12,8), grid = False)
plt.xticks(np.arange(5),('afternoon rush (3pm - 7pm)', 'evening (7pm - 11pm)', 'morning rush (5am - 10am)','night (11pm - 5am)', 'office hours (10am - 3pm)'),rotation = 'horizontal')
plt.xlabel('Time of Day', fontsize = 12, fontweight = 'bold')
plt.ylabel('Count', fontsize = 12, fontweight = 'bold')
plt.title('Total Number of Accidents by Time of Day', fontsize = 14, fontweight = 'bold')
plt.show();

In [None]:
Traffic.groupby('Daytime')['Number_of_Casualties'].mean()

In [None]:
Traffic.groupby('Daytime')['Number_of_Casualties'].mean().plot(kind = 'bar', color = 'lightblue', figsize = (12,8), grid = False)
plt.xticks(np.arange(5),('afternoon rush (3pm - 7pm)', 'evening (7pm - 11pm)', 'morning rush (5am - 10am)','night (11pm - 5am)', 'office hours (10am - 3pm)'), rotation = 'horizontal')
plt.ylim((1, 1.5))
plt.xlabel('')
plt.ylabel('Average Number of Casualties', fontsize = 12, fontweight = 'bold')
plt.title('Average Number of Casualties by Time of Day', fontsize = 14, fontweight = 'bold')
plt.show()

In [None]:
Traffic['Date'] = pd.to_datetime(Traffic['Date'], format = '%d/%m/%Y')

In [None]:
# Has the number of accidents increased or decreased over the last few years?

yearly_count = Traffic['Date'].dt.year.value_counts().sort_index(ascending = False)

sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10,10))

ax.bar(yearly_count.index, yearly_count.values, color = 'darkred')
ax.set_title('Accidents per Year', fontsize = 14, fontweight = 'bold')
ax.set_ylabel('Total Counts', fontsize = 14, fontweight = 'bold')
plt.xlim(2011,2015)


sns.despine(ax = ax, top = True, right = True, left = True, bottom = True)
plt.show(),

In [None]:
Traffic['Road_Type'].unique()

In [None]:
# Mean casualties per accident for each road type, one bar colour per type.
# NOTE(review): the six tick labels are hard-coded and assume the groupby yields
# exactly these six road types in this (sorted) order - confirm against the cell above.
Traffic.groupby('Road_Type')['Number_of_Casualties'].mean().plot(kind = 'bar', color =[ 'green','blue','k','y','orange','pink'], figsize = (12,8), grid = False)
plt.xticks(np.arange(6),('Dual carriageway','One way street','Roundabout','Single carriageway','Slip road','Unknown'), rotation = 'horizontal')
plt.ylim((1.1,1.6))
plt.xlabel('')
plt.ylabel('Average Number of Casualties', fontsize = 12, fontweight = 'bold')
plt.title('Average Number of Casualties by Road Type', fontsize= 14, fontweight = 'bold')
plt.show();

In [None]:
# The numbers behind the road-type chart.
Traffic.groupby('Road_Type')['Number_of_Casualties'].mean()

In [None]:
# Mean casualties per accident for each speed limit.
Traffic.groupby('Speed_limit')['Number_of_Casualties'].mean()

In [None]:
# NOTE(review): the tick labels assume exactly six speed limits (20-70 mph) are
# present in the data - confirm against the output of the previous cell.
Traffic.groupby('Speed_limit')['Number_of_Casualties'].mean().plot(kind = 'bar', color = 'brown', figsize = (12,8), grid = False)
plt.xticks(np.arange(6),('20mph', '30mph','40mph','50mph','60mph','70mph'), rotation = 'horizontal')
plt.ylim((1.0,1.8))
plt.xlabel('')
plt.ylabel('Average Number of Casualties', fontsize = 12, fontweight = 'bold')
plt.title('Average Number of Casualties by Speed Limit', fontsize = 14, fontweight = 'bold')
plt.show();

In [None]:
Traffic.groupby('Year')['Number_of_Casualties'].mean()

In [None]:
Traffic.groupby('Year')['Number_of_Casualties'].mean().plot(kind = 'bar', color = 'magenta', figsize = (12,8), grid = False)
plt.xticks(np.arange(3),('2012', '2013','2014'), rotation = 'horizontal')
plt.ylim((1.2,1.4))
plt.xlabel('')
plt.ylabel('Average Number of Casualties', fontsize = 12, fontweight = 'bold')
plt.title('Average Number of Casualties by Year', fontsize = 14, fontweight = 'bold')
plt.show();

In [None]:
Traffic.groupby('Day_of_Week')['Number_of_Casualties'].mean()

In [None]:
Traffic.groupby('Day_of_Week')['Number_of_Casualties'].mean().plot(kind = 'bar', color = 'darkcyan', figsize = (12,8), grid = False)
plt.xticks(np.arange(7),('Monday', 'Tuesday','Wednesday','Thurday','Friday','Saturday','Sunday'), rotation = 'horizontal')
plt.ylim((1.2,1.5))
plt.xlabel('')
plt.ylabel('Average Number of Casualties', fontsize = 12, fontweight = 'bold')
plt.title('Average Number of Casualties by Day of Week', fontsize = 14, fontweight = 'bold')
plt.show();

In [None]:
Traffic.groupby('Day_of_Week')['Number_of_Casualties'].size()

In [None]:
# Share of accidents falling on each day of the week.
# NOTE(review): labels are matched to the Day_of_Week codes in ascending order and
# assume the lowest code is Monday - confirm against the dataset's data guide.
labels = 'Monday','Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
# Slice sizes are the per-day row counts, computed from the current frame instead of
# being pasted in by hand, so they cannot go stale when earlier cleaning steps change.
sizes = Traffic.groupby('Day_of_Week').size().tolist()
explode = (0, 0, 0, 0, 0, 0.1, 0)  # pull the sixth slice out of the pie
color = ['white','pink','brown','purple', 'green','magenta','darkred']

fig1, ax1= plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, colors = color,autopct = '%1.1f%%', shadow = True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# The slices count accidents (rows), not casualties, so the title says so.
plt.title('Number of Accidents by Day of Week')
# The 'seaborn' style sheet is named 'seaborn-v0_8' in newer matplotlib releases;
# pick whichever one this installation provides. It is applied after this figure
# has been drawn, so it only affects figures created later.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.show();

In [None]:
Traffic.keys()

In [None]:
Traffic.drop(columns=['Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR','Police_Force', 'Local_Authority_(District)','Local_Authority_(Highway)'], axis = 1, inplace = True)
Traffic.head()

In [None]:
Traffic.info()

In [None]:
Traffic.drop('LSOA_of_Accident_Location', axis = 1, inplace = True)

In [None]:
Traffic['Date'] = pd.to_datetime(Traffic['Date'], format= '%d/%m/%Y')
Traffic['Month'] = Traffic['Date'].dt.month
Traffic['Day'] = Traffic['Date'].dt.day
Traffic.head()

In [None]:
Traffic.drop('Date', axis = 1, inplace = True)
Traffic.head()

In [None]:
Traffic.drop('Daytime', axis = 1, inplace = True)
Traffic.head()

In [None]:
pd.DataFrame(Traffic.keys())

In [None]:
Traffic['1st_Road_Class'].unique() 

In [None]:
Traffic['Junction_Control'].unique()

In [None]:
Traffic_new = pd.get_dummies(Traffic,columns=['Road_Type','Pedestrian_Crossing-Human_Control','Pedestrian_Crossing-Physical_Facilities','Light_Conditions','Weather_Conditions','Special_Conditions_at_Site','Carriageway_Hazards', 'Did_Police_Officer_Attend_Scene_of_Accident'], drop_first = True)
Traffic_new.head()

In [None]:
pd.DataFrame(Traffic_new.keys())

In [None]:
Traffic_new.drop(columns = ['Longitude', 'Latitude', '1st_Road_Class','1st_Road_Number','2nd_Road_Class','2nd_Road_Number', 'Weather_Conditions_Snowing with high winds','Weather_Conditions_Snowing without high winds'], axis = 1, inplace = True)
Traffic_new.keys()

In [None]:
My_data = Traffic_new.to_csv('Transportation Dataset.csv', index = False)

In [None]:
condition = (Traffic_new['Number_of_Vehicles'] < 6) & (Traffic_new['Number_of_Casualties'] < 9)
Traffic_new = Traffic_new[condition]

In [None]:
Traffic_new['Number_of_Vehicles'].value_counts()

In [None]:
Traffic_new['Number_of_Casualties'].value_counts()

In [None]:
X=Traffic_new.drop('Number_of_Casualties', axis = 1)
X.shape

In [None]:
y=Traffic_new['Number_of_Casualties']
y.shape

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Score every feature against the target with chi2 (k='all' keeps them all; only the
# scores are used here).
# NOTE(review): chi2 presumably requires numeric, non-negative features - confirm X meets that.
bestfeatures = SelectKBest(score_func=chi2, k = 'all')
fit = bestfeatures.fit(X,y)
# Pair each column name with its score and show the 60 highest-scoring features.
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores], axis = 1)
featureScores.columns = ['Specs','Score']
important = pd.DataFrame(featureScores.nlargest(60,'Score'))
important

In [None]:
# Remove the listed features from X in place; the names must match the dummy column
# names exactly.
X.drop(['Carriageway_Hazards_Pedestrian in carriageway (not injured)', 'Pedestrian_Crossing-Human_Control_Control by school crossing patrol','Weather_Conditions_Other','Weather_Conditions_Fine without high winds','Special_Conditions_at_Site_Ol or diesel','Pedestrian_Crossing-Physical_Facilities_Footbridge or subway','Carriageway_Hazards_Other object in carriageway','Special_Conditions_at_Site_Mud','Carriageway_Hazards_Dislodged vehicle load in carriageway','Day_of_Week','Special_Conditions_at_Site_None','Carriageway_Hazards_None','Pedestrian_Crossing-Human_Control_None within 50 metres','Year'], axis = 1, inplace = True)
X.keys()

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state = 101)

In [None]:
from sklearn.preprocessing import StandardScaler
SC_X=StandardScaler()
SC_Y=StandardScaler()

In [None]:
X_train=SC_X.fit_transform(X_train)

In [None]:
X_test=SC_X.fit_transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regression of casualties on the scaled training features.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# Predictions on the training set (despite the name, these are predictions, not features).
X_predict = lm.predict(X_train)

In [None]:
print(lm.intercept_)

In [None]:
# One coefficient per feature, indexed by the column names of X.
coeff_df = pd.DataFrame(lm.coef_,X.columns, columns = ['Coefficient'])
coeff_df

In [None]:
# Predictions on the held-out test set.
predictions = lm.predict(X_test)
pred = pd.DataFrame(predictions)

In [None]:
pred.rename(columns={0:'Predictions'}, inplace = True)
pred

In [None]:
# Round every prediction up to the next whole number.
Predictions = pred['Predictions'].apply(np.ceil)
pd.DataFrame(Predictions)

In [None]:
y_test.head()

In [None]:
# Actual values against the rounded-up predictions.
plt.scatter(y_test, Predictions);

In [None]:
# Histogram of the rounded-up predictions.
sns.distplot(Predictions, kde = False);

In [None]:
# Histogram of the residuals; this uses the raw `predictions`, not the rounded `Predictions`.
sns.distplot(y_test-predictions, bins = 20, kde = False);

In [None]:
from sklearn import metrics

In [None]:
# Test-set error of the raw (unrounded) predictions: mean absolute error, mean
# squared error and its square root.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))