# The goal of this document is to:

### Merge bridge and socioeconomic data

In [41]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay 
from sklearn.metrics import accuracy_score
from random import randrange
import warnings
import math
from sklearn import tree
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import seaborn as sn
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import train_test_split #from imblearn import under_sampling, over_sampling

# Bridge Data with Demographic and socioeconomic variables 
#### 4. Retrieve relevant demographic and socioeconomic variables from the ACS 2020 5-year estimates. Map each census tract to the municipality it belongs to. Aggregate these variables at the municipality level and obtain summary statistics for each municipality. Merge the summarized socioeconomic and demographic statistics to each bridge data point based on municipality.
IMPORTANT NOTE: Census tracts must stay within a county and therefore a state. They do not necessarily coincide with any other geography. For example, although some census tracts follow place boundaries, there is no rule that says they must stay within a place. This explains why the same census tract can appear in different municipalities.
https://www.census.gov/newsroom/blogs/random-samplings/2014/07/understanding-geographic-relationships-counties-places-tracts-and-more.html<br>
Some municipalities will have nans, as there are not enough sample points in the census tracts.

In [42]:
# Workbook that lists each 2020 census tract together with its municipality.
# Tracts written in decimal format (XXXX.XX) and tracts written as integers are
# stored on separate sheets, so every sheet is loaded on its own.
_tract_workbook = "CensusTracts2020_Municipalities_AlleghenyCounty.xlsx"
CensusTractdecimals_2020_Municipality = pd.read_excel(_tract_workbook, sheet_name="Decimals")
CensusTractintegers_2020_Municipality = pd.read_excel(_tract_workbook, sheet_name="Integers")

# From here on the tract identifiers are handled as text.
for _tract_sheet in (CensusTractdecimals_2020_Municipality, CensusTractintegers_2020_Municipality):
    _tract_sheet["CensusTract"] = _tract_sheet["CensusTract"].astype(str)

# Lookup table: census tract (text) -> municipality (text).
# The integer sheet is walked first and the decimal sheet second, so a tract
# that appears on both sheets ends up with the value from the decimal sheet.
CensusTracts_to_Municipality_dict = {
    str(tract): str(municipality)
    for sheet in (CensusTractintegers_2020_Municipality, CensusTractdecimals_2020_Municipality)
    for tract, municipality in zip(sheet.CensusTract, sheet.Municipality)
}

#### Race
#### Information was retrieved from the ACS 2020 5-year estimates. This dataframe describes the population composition in terms of race for each census tract. Data will be aggregated at the municipality level. Notice that some census tracts may not contain data points, for one of two reasons: 1) the number of sample cases is too small, or 2) no sample observations are available. In this case, NaNs are ignored when aggregating values at the municipality level. Values were aggregated by adding or averaging the statistics of all census tracts in a municipality. The dataframe at municipality level contains absolute estimates and percentages (with respect to the total population) of race composition.

In [43]:
# Load the tract-level ACS race table.
race_df = pd.read_excel("Total_Population_Race_2020.xlsx")

# Tract code: everything after the first 14 characters of the Geography id,
# kept as text with the leading zeros stripped.
race_df["GEO_ID_CensusTract"] = race_df["Geography"].str[14:].astype(str).str.lstrip("0")

# Insert a decimal point before the last two digits, then remove ".00" so that
# whole-number tracts are written without a decimal part.
_race_tract_code = race_df["GEO_ID_CensusTract"]
race_df["CensusTract(decimal format)"] = (
    (_race_tract_code.str[:-2] + '.' + _race_tract_code.str[-2:])
    .astype(str)
    .str.replace(".00", "", regex=False)
)

# Tag every tract row with the municipality found in the tract lookup table.
race_df['Municipality'] = race_df["CensusTract(decimal format)"].map(CensusTracts_to_Municipality_dict)

# Head counts that are summed over the tracts of each municipality.
_race_count_columns = [
    'Total population',
    "Hispanic or Latino (of any race)",
    "White alone",
    "Black or African American alone",
    "American Indian and Alaska Native alone",
    "Asian alone",
    "Native Hawaiian and Other Pacific Islander alone",
    "Some other race alone",
    "Two or more races",
]
Municipality_race_df = pd.pivot_table(
    race_df,
    values=_race_count_columns,
    index=['Municipality'],
    aggfunc={column: np.sum for column in _race_count_columns},
)

# Share of each group in the municipality's total population, in percent.
# Keys are the new percentage columns, values the count columns they come from.
_race_share_sources = {
    "%Hispanic": "Hispanic or Latino (of any race)",
    "%White alone": "White alone",
    "%Black or African American alone": "Black or African American alone",
    "%American Indian and Alaska Native alone": "American Indian and Alaska Native alone",
    "%Asian alone": "Asian alone",
    "%Native Hawaiian and Other Pacific Islander alone": "Native Hawaiian and Other Pacific Islander alone",
    "%Some other race alone": "Some other race alone",
    "%Two or more races": "Two or more races",
}
for _share_column, _count_column in _race_share_sources.items():
    Municipality_race_df[_share_column] = (
        Municipality_race_df[_count_column] / Municipality_race_df["Total population"]
    ) * 100


#### Economic characteristics
#### Information was retrieved from the ACS 2020 5-year estimates. This dataframe provides information regarding the total civilian labor force, the unemployment rate among the civilian labor force, the mean household income, the total number of workers, and the number of workers by means of transportation to work. Data will be aggregated at the municipality level. Notice that some census tracts may not contain data points, for one of two reasons: 1) the number of sample cases is too small, or 2) no sample observations are available. In this case, NaNs are ignored when aggregating values at the municipality level. Values were aggregated by adding or averaging the statistics of all census tracts in a municipality. The dataframe at municipality level contains absolute estimates and percentages (with respect to the total number of workers).

In [44]:
# Load the tract-level ACS economic characteristics table.
econ_characs_df = pd.read_excel("Econ_Characs_2020_data.xlsx")

# Tract code: everything after the first 14 characters of the Geography id,
# kept as text with the leading zeros stripped.
econ_characs_df["GEO_ID_CensusTract"] = econ_characs_df.Geography.str[14:].astype(str).str.lstrip("0")

# Insert a decimal point before the last two digits ...
econ_characs_df["CensusTract(decimal format)"] = econ_characs_df["GEO_ID_CensusTract"].str[:-2]+'.'+econ_characs_df["GEO_ID_CensusTract"].str[-2:]

# ... then remove ".00" so that whole-number tracts carry no decimal part.
econ_characs_df["CensusTract(decimal format)"] = econ_characs_df["CensusTract(decimal format)"].astype(str)
econ_characs_df["CensusTract(decimal format)"] = econ_characs_df["CensusTract(decimal format)"].str.replace(".00", "", regex=False)

# Tag every tract row with the municipality found in the tract lookup table.
econ_characs_df['Municipality'] = econ_characs_df["CensusTract(decimal format)"].map(CensusTracts_to_Municipality_dict)

# Denominator of the commute percentages.
_workers_total_column = 'Total number workers 16 years and over'

# Worker counts by means of transportation to work.
_commute_count_columns = [
    'Total workers that commute with car, truck, or van  drove alone',
    'Total workers that commute with car, truck, or van  carpooled',
    'Total workers that commute with public transportation (excluding taxicab)',
    'Total workers that walked',
    'Total workers that commute by other means',
    'Total workers that worked from home',
]

# Counts are summed over the tracts of a municipality; the unemployment rate and
# the mean household income are averaged over them.
# NOTE(review): np.mean here is a plain (unweighted) mean of the tract values,
# so small and large tracts count equally - confirm that this is intended.
_econ_aggregations = {
    'Total civilian labor force': np.sum,
    'Unemployment Rate among civilian labor force': np.mean,
    'Mean household income (dollars)': np.mean,
    _workers_total_column: np.sum,
}
_econ_aggregations.update({column: np.sum for column in _commute_count_columns})

Municipality_economic_characs_df = pd.pivot_table(
    econ_characs_df,
    values=list(_econ_aggregations),
    index=['Municipality'],
    aggfunc=_econ_aggregations,
)

# Commute shares, in percent of the total number of workers.
# They were previously added to the tract-level frame only, which left the
# municipality-level frame without the percentages that the race, poverty and
# education sections all provide. They are now computed on the municipality
# frame; the tract-level columns are still written so that any code relying on
# them keeps working.
for _econ_frame in (Municipality_economic_characs_df, econ_characs_df):
    for _commute_column in _commute_count_columns:
        _econ_frame["%" + _commute_column] = (
            _econ_frame[_commute_column] / _econ_frame[_workers_total_column]
        ) * 100

#### Poverty information
#### Information was retrieved from the ACS 2020 5-year estimates. This dataframe provides information regarding the total population for whom poverty status is determined and the number of those inhabitants living below the poverty level in each census tract. Data will be aggregated at the municipality level. Notice that some census tracts may not contain data points, for one of two reasons: 1) the number of sample cases is too small, or 2) no sample observations are available. In this case, NaNs are ignored when aggregating values at the municipality level. Values were aggregated by adding or averaging the statistics of all census tracts in a municipality. The dataframe at municipality level contains absolute estimates and percentages (with respect to the total population for whom poverty status is determined).

In [45]:
# Load the tract-level ACS poverty status table.
poverty_status_df = pd.read_excel("PovertyStatus_data_2020.xlsx")

# Tract code: everything after the first 14 characters of the Geography id,
# kept as text with the leading zeros stripped.
poverty_status_df["GEO_ID_CensusTract"] = poverty_status_df["Geography"].str[14:].astype(str).str.lstrip("0")

# Insert a decimal point before the last two digits, then remove ".00" so that
# whole-number tracts are written without a decimal part.
_poverty_tract_code = poverty_status_df["GEO_ID_CensusTract"]
poverty_status_df["CensusTract(decimal format)"] = (
    (_poverty_tract_code.str[:-2] + '.' + _poverty_tract_code.str[-2:])
    .astype(str)
    .str.replace(".00", "", regex=False)
)

# Tag every tract row with the municipality found in the tract lookup table.
poverty_status_df['Municipality'] = poverty_status_df["CensusTract(decimal format)"].map(CensusTracts_to_Municipality_dict)

# Per municipality: population for whom poverty status is determined, and the
# part of that population below the poverty level (both summed over tracts).
_poverty_universe_column = 'Total population for whom poverty status is determined'
_poverty_below_column = 'Total population for whom poverty status is determined (Below poverty level)'
Municipality_poverty_df = pd.pivot_table(
    poverty_status_df,
    values=[_poverty_universe_column, _poverty_below_column],
    index=['Municipality'],
    aggfunc={_poverty_universe_column: np.sum, _poverty_below_column: np.sum},
)

# Percentage of the determined population that is below the poverty level.
_poverty_ratio = Municipality_poverty_df[_poverty_below_column] / Municipality_poverty_df[_poverty_universe_column]
Municipality_poverty_df["%Total population for whom poverty status is determined (Below poverty level)"] = _poverty_ratio * 100

#### Educational attainment information
#### Information was retrieved from the ACS 2020 5-year estimates. This dataframe provides information regarding the total number of households, the total population 25 years and over, and the population 25 years and over with less than a 9th grade education. Data will be aggregated at the municipality level. Notice that some census tracts may not contain data points, for one of two reasons: 1) the number of sample cases is too small, or 2) no sample observations are available. In this case, NaNs are ignored when aggregating values at the municipality level. Values were aggregated by adding or averaging the statistics of all census tracts in a municipality. The dataframe at municipality level contains absolute estimates and percentages (with respect to the total population 25 years and over).

In [46]:
# Load the tract-level ACS educational attainment table.
educational_attainment_df = pd.read_excel("EducationalAttainment_2020_data.xlsx")

# Tract code: everything after the first 14 characters of the Geography id,
# kept as text with the leading zeros stripped.
educational_attainment_df["GEO_ID_CensusTract"] = (
    educational_attainment_df["Geography"].str[14:].astype(str).str.lstrip("0")
)

# Insert a decimal point before the last two digits, then remove ".00" so that
# whole-number tracts are written without a decimal part.
_education_tract_code = educational_attainment_df["GEO_ID_CensusTract"]
educational_attainment_df["CensusTract(decimal format)"] = (
    (_education_tract_code.str[:-2] + '.' + _education_tract_code.str[-2:])
    .astype(str)
    .str.replace(".00", "", regex=False)
)

# Tag every tract row with the municipality found in the tract lookup table.
educational_attainment_df['Municipality'] = (
    educational_attainment_df["CensusTract(decimal format)"].map(CensusTracts_to_Municipality_dict)
)

# Per municipality: households, population 25 years and over, and the part of
# that population with less than 9th grade (all summed over tracts).
_education_count_columns = [
    'Total number households',
    'Total population 25 years and over',
    'Total population 25 years and over with less than 9th grade',
]
Municipality_educational_attainment_df = pd.pivot_table(
    educational_attainment_df,
    values=_education_count_columns,
    index=['Municipality'],
    aggfunc={column: np.sum for column in _education_count_columns},
)

# Percentage of the population 25 years and over with less than 9th grade.
_education_ratio = (
    Municipality_educational_attainment_df["Total population 25 years and over with less than 9th grade"]
    / Municipality_educational_attainment_df["Total population 25 years and over"]
)
Municipality_educational_attainment_df["%Total population 25 years and over with less than 9th grade"] = _education_ratio * 100



#### Merge the socioeconomic and demographic statistics of municipalities to the bridge dataset

In [47]:
# Bridge records (PA22) together with the Allegheny County municipality of each bridge.
Bridges_df = pd.read_csv("PA22 - PA22_Municipalities.csv")

# Attach the four municipality-level tables one after another. Every join is a
# left join on "Municipality", so each bridge row is kept even when no
# statistics exist for its municipality.
Bridges_Municipalities_Demographics_df = Bridges_df
for _municipality_table in (
    Municipality_educational_attainment_df,
    Municipality_poverty_df,
    Municipality_economic_characs_df,
    Municipality_race_df,
):
    Bridges_Municipalities_Demographics_df = Bridges_Municipalities_Demographics_df.merge(
        right=_municipality_table,
        how="left",
        left_on="Municipality",
        right_on="Municipality",
    )

In [48]:
## Keep the row identifier, the municipality name and the socioeconomic and
## demographic statistics of each bridge, then join them to the bridge dataset
## that has already been prepared.
_demographic_columns = [
    "Original_row_number",
    "Municipality",
    # educational attainment
    "Total number households",
    "Total population 25 years and over",
    "Total population 25 years and over with less than 9th grade",
    "%Total population 25 years and over with less than 9th grade",
    # poverty
    "Total population for whom poverty status is determined",
    "Total population for whom poverty status is determined (Below poverty level)",
    "%Total population for whom poverty status is determined (Below poverty level)",
    # economic characteristics
    "Mean household income (dollars)",
    "Total civilian labor force",
    "Total number workers 16 years and over",
    "Total workers that commute by other means",
    "Total workers that commute with car, truck, or van  carpooled",
    "Total workers that commute with car, truck, or van  drove alone",
    "Total workers that commute with public transportation (excluding taxicab)",
    "Total workers that walked",
    "Total workers that worked from home",
    "Unemployment Rate among civilian labor force",
    # race (counts)
    "American Indian and Alaska Native alone",
    "Asian alone",
    "Black or African American alone",
    "Hispanic or Latino (of any race)",
    "Native Hawaiian and Other Pacific Islander alone",
    "Some other race alone",
    "Total population",
    "Two or more races",
    "White alone",
    # race (percentages)
    "%Hispanic",
    "%White alone",
    "%Black or African American alone",
    "%American Indian and Alaska Native alone",
    "%Asian alone",
    "%Native Hawaiian and Other Pacific Islander alone",
    "%Some other race alone",
    "%Two or more races",
]
Bridges_Municipalities_Demographics_df = Bridges_Municipalities_Demographics_df[_demographic_columns]

# Build the final merged dataset: the prepared bridge data gets its row position
# as an "index" column, which is matched against Original_row_number.
# The join is an inner join, so only rows present on both sides are kept.
df = pd.read_csv("df.csv").reset_index()
df2 = df.merge(
    right=Bridges_Municipalities_Demographics_df,
    how="inner",
    left_on="index",
    right_on="Original_row_number",
)

df2.describe()

Unnamed: 0,index,TRAFFIC_LANES_ON_028A,ADT_029,DEGREES_SKEW_034,MAIN_UNIT_SPANS_045,MAX_SPAN_LEN_MT_048,STRUCTURE_LEN_MT_049,ROADWAY_WIDTH_MT_051,INSPECT_FREQ_MONTHS_091,HIGHWAY_SYSTEM_104,...,Two or more races,White alone,%Hispanic,%White alone,%Black or African American alone,%American Indian and Alaska Native alone,%Asian alone,%Native Hawaiian and Other Pacific Islander alone,%Some other race alone,%Two or more races
count,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,1022.0,...,733.0,733.0,733.0,733.0,733.0,733.0,733.0,733.0,733.0,733.0
mean,692.612524,0.186609,0.102949,0.261549,0.038188,0.108599,0.05543,0.215934,0.102583,0.422701,...,4853.390177,122336.787176,1.972072,79.614257,11.532155,0.086672,3.774036,0.042328,0.482217,2.496264
std,295.34346,0.153131,0.152009,0.307304,0.084395,0.130351,0.097999,0.128246,0.237846,0.494231,...,6033.761182,145404.43146,1.00118,13.209726,11.535695,0.099104,2.767746,0.197304,0.642771,1.212658
min,182.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.0,623.0,0.0,32.369319,0.115683,0.0,0.0,0.0,0.0,0.330524
25%,437.25,0.142857,0.018251,0.0,0.0,0.02705,0.003118,0.153518,0.04,0.0,...,172.0,6695.0,1.218699,69.570959,1.890595,0.0,1.780314,0.0,0.106662,1.525387
50%,692.5,0.142857,0.049591,0.151515,0.0,0.06847,0.013707,0.196162,0.04,0.0,...,467.0,30781.0,1.977879,80.437956,8.547846,0.095916,4.258003,0.0,0.414232,2.838126
75%,947.75,0.142857,0.119077,0.431818,0.057143,0.140744,0.058706,0.260128,0.04,1.0,...,13197.0,323498.0,3.046302,91.06824,18.923848,0.095916,4.936235,0.019785,0.568829,2.838126
max,1204.0,1.0,1.0,1.0,0.971429,1.0,0.825368,1.0,1.0,1.0,...,13197.0,323498.0,6.200655,98.82664,58.900665,0.951173,16.365213,1.605914,4.293263,8.549971


## Normalization of the socioeconomic and demographic data

In [49]:
df2['Structure_Kind_Aluminum'] #confirm that the dummy variables are still present after the merge

0       0
1       0
2       0
3       0
4       0
       ..
1017    0
1018    0
1019    0
1020    0
1021    0
Name: Structure_Kind_Aluminum, Length: 1022, dtype: int64

In [50]:
def normalize(df, features):
    """Min-max scale the given columns of ``df`` to the [0, 1] range, in place.

    Every column named in ``features`` is replaced by
    ``(x - min) / (max - min)``, with min and max taken over that column.

    A constant column (``max == min``) has no spread to divide by; the plain
    formula would turn each of its entries into NaN (0 / 0). Such a column is
    set to 0.0 instead, and entries that were already missing stay missing.

    Args:
        df: DataFrame whose columns are rescaled. It is modified in place.
        features: Iterable of column labels to rescale.

    Returns:
        The same ``df`` object that was passed in.
    """
    for feature_name in features:
        column = df[feature_name]
        min_value = column.min()
        value_range = column.max() - min_value
        shifted = column - min_value
        if value_range == 0:
            # Constant column: avoid the 0 / 0 division, keep a float result.
            df[feature_name] = shifted.astype(float)
        else:
            df[feature_name] = shifted / value_range
    return df


# Columns to rescale: the socioeconomic and demographic statistics that the
# merge added to df2. They are picked by name (numeric columns of df2 that do
# not exist in df, the bridge frame read from df.csv) instead of by position,
# because a slice of df.columns cannot contain columns that exist only in df2.
# Original_row_number is just the merge key, and the is_numeric_dtype check
# leaves out non-numeric columns such as the municipality name.
continous_var = pd.Index([
    column
    for column in df2.columns
    if column not in df.columns
    and column != "Original_row_number"
    and pd.api.types.is_numeric_dtype(df2[column])
])
df_norm = df2.copy()  # work on a copy: normalize() overwrites the columns it scales
df_norm = normalize(df_norm, continous_var)  # min-max scale the selected columns

In [51]:

# List each column next to its dtype to spot columns that are not numeric yet.
for column_name in df_norm:
    print(column_name, df_norm[column_name].dtypes)


index int64
TRAFFIC_LANES_ON_028A float64
ADT_029 float64
DEGREES_SKEW_034 float64
MAIN_UNIT_SPANS_045 float64
MAX_SPAN_LEN_MT_048 float64
STRUCTURE_LEN_MT_049 float64
ROADWAY_WIDTH_MT_051 float64
INSPECT_FREQ_MONTHS_091 float64
HIGHWAY_SYSTEM_104 int64
PERCENT_ADT_TRUCK_109 float64
BRIDGE_CONDITION int64
DECK_AREA float64
Toll int64
Urban int64
Deck int64
Age float64
Maintenance_County int64
Maintenance_Municipal int64
Maintenance_Private int64
Maintenance_Railroad int64
Maintenance_State int64
Function_Collector int64
Function_Interstate Principal Arterial int64
Function_Major Collector int64
Function_Minor Arterial int64
Function_Minor Collector int64
Function_Municipal int64
Function_Other Principal Arterial int64
Function_Other Principal Arterial Freeways int64
Structure_Kind_Aluminum int64
Structure_Kind_Concrete int64
Structure_Kind_Concrete continuous int64
Structure_Kind_Masonary int64
Structure_Kind_Other int64
Structure_Kind_Prestressed concrete int64
Structure_Kind_Prestres

In [52]:
# The municipality name has object dtype, so it is removed before the
# classification model is fitted.
df_norm.drop('Municipality', axis=1, inplace=True)

In [53]:

# List each column next to its dtype again to confirm that only numeric columns remain.
for column_name in df_norm:
    print(column_name, df_norm[column_name].dtypes)


index int64
TRAFFIC_LANES_ON_028A float64
ADT_029 float64
DEGREES_SKEW_034 float64
MAIN_UNIT_SPANS_045 float64
MAX_SPAN_LEN_MT_048 float64
STRUCTURE_LEN_MT_049 float64
ROADWAY_WIDTH_MT_051 float64
INSPECT_FREQ_MONTHS_091 float64
HIGHWAY_SYSTEM_104 int64
PERCENT_ADT_TRUCK_109 float64
BRIDGE_CONDITION int64
DECK_AREA float64
Toll int64
Urban int64
Deck int64
Age float64
Maintenance_County int64
Maintenance_Municipal int64
Maintenance_Private int64
Maintenance_Railroad int64
Maintenance_State int64
Function_Collector int64
Function_Interstate Principal Arterial int64
Function_Major Collector int64
Function_Minor Arterial int64
Function_Minor Collector int64
Function_Municipal int64
Function_Other Principal Arterial int64
Function_Other Principal Arterial Freeways int64
Structure_Kind_Aluminum int64
Structure_Kind_Concrete int64
Structure_Kind_Concrete continuous int64
Structure_Kind_Masonary int64
Structure_Kind_Other int64
Structure_Kind_Prestressed concrete int64
Structure_Kind_Prestres

In [54]:
# Discard every row that still has a missing value, then report how many rows are left.
df_norm.dropna(axis=0, how="any", inplace=True)
print(df_norm.shape[0])

733


In [61]:
# Store the cleaned dataset locally, without writing the row index.
df_norm.to_csv(path_or_buf='df2.csv', index=False)