### Import Libraries

In [54]:
#Libs
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

#Model Creation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

#evaluation
#evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score


#hyperparameter tuning 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


### Import Data

In [44]:
#data
dim_cd_df = pd.read_csv('../data/interim_data/dim_cd_df.csv',index_col=0)
dim_qq_df = pd.read_csv('../data/interim_data/dim_qq_df.csv',index_col=0)
FACT_df = pd.read_csv('../data/interim_data/FACT_df.csv',index_col=0)

### Subset and Merge Data

In [45]:
#Subset the Fact table for only data from the US for 2020 and remove rows where the answer was NA
us_data = pd.merge(FACT_df,dim_cd_df[['Account_Num','Country','geometry']],on='Account_Num')

us_data.drop_duplicates(inplace=True)
us_data=us_data.query("Country == 'United States of America' & \
            Year ==2020 & \
            Answer not in ['Question not applicable','nan']")


In [46]:
#merge with dim_qq_df for the question section
us_data = pd.merge(us_data,dim_qq_df[['Question_ID','Sect']],on='Question_ID')

#drop null from answer
us_data.dropna(inplace=True)

In [47]:
#sample 5000 rows
us_data_sample = us_data.sample(5000).reset_index(drop=True)

us_data_sample.shape

(5000, 7)

In [48]:
us_data_sample.head()

Unnamed: 0,Year,Account_Num,Question_ID,Answer,Country,geometry,Sect
0,2020,58668,8385,Heavy snows greatly hamper fishing and harbor ...,United States of America,POINT (-70.9342 41.6362),Climate Hazards
1,2020,74531,10301,Local,United States of America,POINT (-106 35.5),Mitigation Actions
2,2020,31108,7851,"Fluctuating socio-economic conditions, Increas...",United States of America,POINT (-95.3694 29.7602),Climate Hazards
3,2020,43909,20414,Stormwater management (natural or man-made inf...,United States of America,POINT (-81.3792 28.5383),Water Supply Management
4,2020,53959,8531,Significantly challenges,United States of America,POINT (-94.1719 36.0821),Climate Hazards


In [49]:
#Analysis 7: Classification to predict geometry
us_data_sample_w_city =us_data_sample.merge(dim_cd_df[['Account_Num','City']],on='Account_Num').drop_duplicates()
us_data_sample_w_city
X=us_data_sample_w_city[['Question_ID','Answer','Sect']]
y= us_data_sample_w_city[['City']]

In [50]:
#check for class imbalance
y.value_counts()

City            
Houston             128
San Francisco       123
West Palm Beach     122
Washington, DC      105
Lakewood             91
                   ... 
Toledo                6
South Barrington      5
Roanoke               4
Natchez               3
Wellfleet             3
Length: 116, dtype: int64

In [51]:
X

Unnamed: 0,Question_ID,Answer,Sect
0,8385,Heavy snows greatly hamper fishing and harbor ...,Climate Hazards
3,9619,0,Mitigation Actions
6,8389,"New Bedford has over 11 miles of coastline, mo...",Climate Hazards
9,8504,Supports,Climate Hazards
12,1399,0,Adaptation Actions
...,...,...,...
14818,4168,611075,City-wide GHG Emissions Data
14821,4428,Stayed the same,City-wide GHG Emissions Data
14824,19174,99,Water Supply
14827,17327,"Other, please specify",Finance and Economic Opportunities


In [52]:
X.Question_ID=X.Question_ID.astype('category')
X.Sect=X.Sect.astype('category')

In [55]:
# We will not set stratify=y and test moethods 1,3,4 and 5 listed above. 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.30, random_state=0)

#create unique lists for our different transformed columns
cat_cols = list(X_train.select_dtypes(include='category').columns)
text_cols = ('Answer')

#add the transformers
preprocessor = ColumnTransformer( 
    transformers = [
  
        ('cat',OneHotEncoder(handle_unknown = 'ignore'),cat_cols), #encode the categorical features
        ('Answer',TfidfVectorizer(stop_words='english',ngram_range=(1,2),lowercase=True),text_cols),    #freq counts for words
    ], remainder='passthrough')


#fit and transform our data
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)




#create our lm model
rf_model = RandomForestClassifier()
print("Basic RF Model W/O Class Balancing")
rf_model.fit(X_train_trans,y_train)
rf_ypred=rf_model.predict(X_test_trans) 
f1_rf = f1_score(y_test,rf_ypred,average='weighted')
print(f1_rf)
print('----------------')

#create our knn model
param_grid = {'n_neighbors':np.arange(2,50)}

#model
knn_model = KNeighborsClassifier()
knn_cv_model= GridSearchCV(knn_model,param_grid,cv=5)

#fit
knn_cv_model.fit(X_train_trans,y_train)
knn_ypred = knn_cv_model.predict(X_test_trans)

f1_knn=f1_score(y_test,knn_ypred,average='weighted')
print("KNN Model W CV")
print(f1_knn)


Basic RF Model W/O Class Balancing
0.09811116581952363
----------------
KNN Model W CV
0.027127365018392947


In [58]:
#confusion_matrix
cm= confusion_matrix(y_test, rf_ypred)
cm

array([[ 3,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  1,  0, ..., 18,  2,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0]])

In [59]:
#classication report
print(classification_report(y_test,rf_ypred))


                  precision    recall  f1-score   support

        Abington       0.21      0.17      0.19        18
         Alameda       0.00      0.00      0.00        14
           Alton       0.00      0.00      0.00         4
       Anchorage       0.08      0.11      0.10         9
       Ann Arbor       0.06      0.04      0.05        23
       Arlington       0.00      0.00      0.00         9
       Asheville       0.00      0.00      0.00         8
           Aspen       0.18      0.11      0.13        19
          Aurora       0.00      0.00      0.00         0
          Austin       0.00      0.00      0.00         9
       Baltimore       0.00      0.00      0.00        21
      Blacksburg       0.00      0.00      0.00        12
     Bloomington       0.33      0.23      0.27        26
          Boston       0.08      0.12      0.10        25
         Boulder       0.05      0.04      0.05        24
  Boulder County       0.00      0.00      0.00         8
   Boynton Be