In [74]:
#Importing the necessary packages
import pandas as pd
import numpy as np
import glob
import time

#### Function to create one dataframe from all individual monthly files

In [75]:
def create_df():
    
    #Total excel files = 204
    #Individual File shape = (852, 6)
    #Final Dataframe shape = (173808, 8)
    
    column_names = ["row_no", "min_K", "max_K", "mean_K", "geoid", "name"] #List of column names of individual files
    path = "narr_urban_county_data/air.sfc"
    all_files = glob.glob(path + "/*.csv")
    data = pd.DataFrame(columns = column_names) #Defining an empty dataframe
    for filename in all_files:
        temp_df = pd.read_csv(filename, index_col=None, header=0, names=column_names) #Individual files in a temp dataframe
        year = filename[-11:-7]
        month = filename[-6:-4]
        temp_df.insert(6, "year", year) #Adding year and month from filename for further use
        temp_df.insert(7, "month", month)
        temp_df["date"] = pd.to_datetime(temp_df[['year','month']].assign(DAY=1))
        data = data.append(temp_df, ignore_index=True) #The temp dataframe is not appended to final df
    idx = pd.date_range('1999-01-01', '2015-12-01', freq='MS') #Specifying the datetime index
    data = data[["name", "date", "year", "month", "max_K", "mean_K", "min_K", "geoid"]] #Re-ordering the columns
    
    return data

## Creating the dataframe

In [76]:
# Temperature Data

start = time.process_time() #To calculate the time taken to read 204 files
data = create_df()   
print("Dataframe created in ", time.process_time() - start, "seconds")
print("Shape of the Dataframe: ", data.shape)
data.head(5)

Dataframe created in  12.022556000000009 seconds
Shape of the Dataframe:  (173808, 8)


Unnamed: 0,name,date,year,month,max_K,mean_K,min_K,geoid
0,Lancaster,2005-04-01,2005,4,287.734802,287.612427,287.537933,31109
1,Minnehaha,2005-04-01,2005,4,287.369446,286.63326,285.855377,46099
2,Allen,2005-04-01,2005,4,284.91684,284.776489,284.636108,39003
3,Beaver,2005-04-01,2005,4,283.967102,283.80363,283.577515,42007
4,Chatham,2005-04-01,2005,4,288.692871,288.243927,287.893127,37037


In [78]:
#WNV Incedence Rate (IR) data

wnv = pd.read_csv("WNV_NI_NNI_1999to2015_prevalence_incidence_final_20180530.csv", encoding="ISO-8859-1")
print("Shape of the Dataframe: ", wnv.shape)
wnv.head()

Shape of the Dataframe:  (3108, 153)


Unnamed: 0,GEOID10,Select_County,STATEFP10,STATENS,STUSPS,STNAME,COUNTYFP10,COUNTYNS10,CTYNAME,POPESTIMATE1999,...,NIIR_2011_Z,NIIR_2012_Z,NIIR_2013_Z,NIIR_2014_Z,NIIR_2015_Z,NIIR_EST_AVG_Z,NIIR_EST_MED_Z,County_WNV_Class,County_WNV_Class2,filter_$
0,1001,1,1,1779775,AL,Alabama,1,161526,Autauga County,42963,...,-0.172668378274114,-0.313041876784145,-0.237005866057391,-0.211466937746909,-0.197767338621807,-0.230659201920693,-0.231569016088473,2,NI-S,0
1,1003,1,1,1779775,AL,Alabama,3,161527,Baldwin County,137555,...,-0.172668378274114,0.0148439683883608,-0.177634573907632,-0.211466937746909,-0.197767338621807,-0.177866829768146,-0.186748999098691,4,NI-M-pre12,0
2,1005,0,1,1779775,AL,Alabama,5,161528,Barbour County,28866,...,-0.172668378274114,-0.313041876784145,-0.237005866057391,-0.211466937746909,-0.197767338621807,-0.220130592240443,-0.221517976917691,2,NI-S-NNI,0
3,1007,0,1,1779775,AL,Alabama,7,161529,Bibb County,20560,...,,,,,,,,3,NI-S,0
4,1009,0,1,1779775,AL,Alabama,9,161530,Blount County,50237,...,,,,,,,,1,NNI-S,0
