In [67]:
import pandas as pd
import numpy as np


# Column labels for the bridges data set; they are passed to read_csv via
# `names=` so the first line of the file is read as data, not as a header.
header_list = [
    "Identif", "River", "Location", "Erected", "Purpose", "Length", "Lanes",
    "Clear-G", "T-Or-D", "Material", "Span", "Rel-L", "Type",
]

# Note: missing values in this file show up as the literal string '?'.
bridge = pd.read_csv(
    'https://raw.githubusercontent.com/manasiakre/DAV__5400/main/bridges.data.version1',
    names=header_list,
)
bridge

Unnamed: 0,Identif,River,Location,Erected,Purpose,Length,Lanes,Clear-G,T-Or-D,Material,Span,Rel-L,Type
0,E1,M,3,1818,HIGHWAY,?,2,N,THROUGH,WOOD,SHORT,S,WOOD
1,E2,A,25,1819,HIGHWAY,1037,2,N,THROUGH,WOOD,SHORT,S,WOOD
2,E3,A,39,1829,AQUEDUCT,?,1,N,THROUGH,WOOD,?,S,WOOD
3,E5,A,29,1837,HIGHWAY,1000,2,N,THROUGH,WOOD,SHORT,S,WOOD
4,E6,M,23,1838,HIGHWAY,?,2,N,THROUGH,WOOD,?,S,WOOD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,E84,A,24,1969,HIGHWAY,870,6,G,THROUGH,STEEL,MEDIUM,F,ARCH
104,E91,O,44,1975,HIGHWAY,3756,6,G,THROUGH,STEEL,LONG,F,ARCH
105,E90,M,7,1978,HIGHWAY,950,6,G,THROUGH,STEEL,LONG,F,ARCH
106,E100,O,43,1982,HIGHWAY,?,?,G,?,?,?,F,?


In [48]:
# TASK 1

# Source: https://stackoverflow.com/questions/29836477/pandas-create-new-column-with-count-from-groupby
# Source: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html

# Count the bridges for every (River, Purpose, Material) combination.
# 'Identif' is the column being counted; the result is stored as a flat frame
# whose count column is labelled "How many?".
bridge_g1 = (
    bridge
    .groupby(["River", "Purpose", "Material"])["Identif"]
    .count()
    .reset_index(name="How many?")
)

# Display only: show the counts indexed by the three grouping columns
# (bridge_g1 itself keeps its flat layout).
bridge_g1.set_index(["River", "Purpose", "Material"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,How many?
River,Purpose,Material,Unnamed: 3_level_1
A,AQUEDUCT,IRON,1
A,AQUEDUCT,WOOD,3
A,HIGHWAY,IRON,2
A,HIGHWAY,STEEL,21
A,HIGHWAY,WOOD,8
A,RR,IRON,1
A,RR,STEEL,9
A,RR,WOOD,2
A,WALK,STEEL,1
M,HIGHWAY,IRON,4


In [72]:
# TASK 2

# Source: https://www.statology.org/pandas-replace-values/

# 'Length' is loaded as text because missing values are written as '?'.
# Swap the '?' marker for a real missing value (np.nan) and cast the column to
# float. A real NaN is used rather than the string 'NaN' so the conversion does
# not depend on float() happening to parse that particular spelling.
bridge["Length"] = bridge["Length"].replace("?", np.nan).astype("float")
bridge

Unnamed: 0,Identif,River,Location,Erected,Purpose,Length,Lanes,Clear-G,T-Or-D,Material,Span,Rel-L,Type
0,E1,M,3,1818,HIGHWAY,,2,N,THROUGH,WOOD,SHORT,S,WOOD
1,E2,A,25,1819,HIGHWAY,1037.0,2,N,THROUGH,WOOD,SHORT,S,WOOD
2,E3,A,39,1829,AQUEDUCT,,1,N,THROUGH,WOOD,?,S,WOOD
3,E5,A,29,1837,HIGHWAY,1000.0,2,N,THROUGH,WOOD,SHORT,S,WOOD
4,E6,M,23,1838,HIGHWAY,,2,N,THROUGH,WOOD,?,S,WOOD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,E84,A,24,1969,HIGHWAY,870.0,6,G,THROUGH,STEEL,MEDIUM,F,ARCH
104,E91,O,44,1975,HIGHWAY,3756.0,6,G,THROUGH,STEEL,LONG,F,ARCH
105,E90,M,7,1978,HIGHWAY,950.0,6,G,THROUGH,STEEL,LONG,F,ARCH
106,E100,O,43,1982,HIGHWAY,,?,G,?,?,?,F,?


In [74]:
# Source: https://stackoverflow.com/questions/41040132/pandas-groupby-count-and-mean-combined

# Mean of the 'Length' values for every (Purpose, Material) combination,
# presented as a one-column frame labelled 'Average Length'.
# Relies on 'Length' already being numeric at this point.
bridge_g2 = (
    bridge
    .groupby(["Purpose", "Material"])["Length"]
    .mean()
    .to_frame("Average Length")
)
bridge_g2

Unnamed: 0_level_0,Unnamed: 1_level_0,Average Length
Purpose,Material,Unnamed: 2_level_1
AQUEDUCT,IRON,1000.0
AQUEDUCT,WOOD,1092.0
HIGHWAY,?,
HIGHWAY,IRON,1216.666667
HIGHWAY,STEEL,1557.804348
HIGHWAY,WOOD,1053.375
RR,IRON,1100.0
RR,STEEL,1946.85
RR,WOOD,
WALK,STEEL,


In [76]:
# TASK 3

# Source: Mod 11 Lecture Notes

# Cut the 'Erected' values into 4 equal-width bins (one per time period);
# precision=0 controls how the bin edges are rounded in their labels.
Erected = pd.cut(bridge["Erected"], bins=4, precision=0)

# Summary statistics computed for the 'Length' values of each 'Erected' bin.
def b3(group):
    """Return a dict of summary statistics for one group of values.

    The dict maps a display label to the result of calling the matching
    method on ``group``: ``mean()``, ``count()``, ``max()`` and ``min()``.
    """
    stats = {}
    stats['Average Length'] = group.mean()
    stats['Count'] = group.count()
    stats['Max Length'] = group.max()
    stats['Min Length'] = group.min()
    return stats

# Group the 'Length' values by their 'Erected' bin.
# `observed=False` is spelled out so that every bin produced by pd.cut is kept
# as a group. Grouping by a categorical without this argument relies on a
# pandas default that newer releases deprecate and change, so pinning it keeps
# the result stable across pandas versions.
bridge_g3 = bridge["Length"].groupby(Erected, observed=False)

# b3 returns one dict per bin; unstack() pivots the dict keys into columns so
# each bin becomes a row of summary statistics.
bridge_g3.apply(b3).unstack()

Unnamed: 0_level_0,Average Length,Count,Max Length,Min Length
Erected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(1818.0, 1860.0]",1094.625,8.0,1500.0,990.0
"(1860.0, 1902.0]",1603.347826,23.0,4558.0,1000.0
"(1902.0, 1944.0]",1676.181818,33.0,3000.0,860.0
"(1944.0, 1986.0]",1530.411765,17.0,3756.0,804.0
