In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the policy database export from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Sample Data Framework - Policy Database.csv')

In [3]:
# Discard the row labelled 0 (the first row read from the CSV).
# NOTE(review): presumably a secondary header/description row — confirm against the file.
df = df.drop(labels=0, axis='index')

In [4]:
# Drop the title, page-reference, link, notes, summary and purpose columns;
# none of them is used in the rest of the notebook.
df = df.drop(
    labels=[
        'Policy Title',
        '...Page #',
        '...Page #.1',
        'Link',
        'Notes',
        'Summary',
        'Purpose',
    ],
    axis='columns',
)

In [5]:
# Replace every missing cell with the literal string 'None', so that a missing
# value later becomes its own category / keyword instead of NaN.
df = df.fillna(value='None')

In [6]:
# Only use the most recent year: overwrite the 'Year' cell of three rows
# (addressed by position) with a single year string.
# NOTE(review): presumably these rows list several years in the raw CSV — confirm.
#
# The write goes through one .iloc call on the frame itself. The chained form
# df['Year'].iloc[i] = ... assigns into a temporary Series, which triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
year_col = df.columns.get_loc('Year')
for row_pos, latest_year in ((48, '2018'), (49, '2017'), (79, '2017')):
    df.iloc[row_pos, year_col] = latest_year

In [7]:
# Turn each comma-separated 'Key Words' string into a list of keywords and make
# 'Year' an integer column.
#
# Whole-column assignment replaces the former per-row loop, which wrote through
# the chained form df[col].iloc[i] = ... (SettingWithCopyWarning; a silent no-op
# under pandas Copy-on-Write) and left 'Year' as an object column of Python ints.
# The split is on ',' only, with no whitespace stripping, exactly as before.
df['Key Words'] = df['Key Words'].str.split(',')
df['Year'] = df['Year'].astype(int)

In [8]:
# Label lists for the encoded policy columns, one entry per line.
# Order is significant: the prediction cell at the end of the notebook uses
# primary_incentives[code] to turn a predicted integer code back into text.
# NOTE(review): the lists presumably mirror the sorted unique values of the
# corresponding DataFrame columns — verify against the data before relying on them.
primary_incentives = [
    'Diplomatic',
    'Financial - grants',
    'Financial - subsidies',
    'Financial - subsidies,Financial - grants',
    'Financial - subsidies,Legal',
    'Financial - tax break',
    'Financial - trade',
    'Legal',
    'None',
    'Political',
]

primary_disincentives = [
    'Financial - fines',
    'Financial - fines,Legal',
    'Financial - fines,Legal,Imprisonment and Fines',
    'Financial - fines,Legal,Political',
    'Financial - fines,Political,Procedures/Guidelines',
    'Imprisonment and Fines',
    'Legal',
    'None',
    'Political',
    'Procedures/Guidelines',
]

motivations = [
    'Agricultural Development',
    'Climate Change Action',
    'Conservation',
    'Define ecologic-economical zoning of legal Amazon',
    'Establish Amazon Fund',
    'Establishes Forest Service and promotes forest management principles',
    'Establishes system of Nature Conservation Units',
    'Fund research on GHG mitigation and promote REDD+ projects',
    'International Agreement/Conference',
    'Mitigate and address the environmental impacts of climate change',
    'None',
    'Previous Supreme Court Ruling',
    'Promote conservation activites ',  # spelling and trailing space kept as-is
]

land_use = [
    'Farmed land',
    'Forest land',
    'Forest land,Farmed land',
    'Forest land,Farmed land,Protective land and buffers',
    'Forest land,Protective land and buffers',
    'None',
    'Protective land and buffers',
]

In [9]:
# Columns holding categorical text that is converted to integer codes below.
cat_cols = [
    'Country',
    'Jurisdiction',
    'Bottom-up:\nCommunity Engagement',
    'Bottom-up:\nIndigenous Rights',
    'Biome-Specific',
    'Hectare Quantity',
    'Motivation',
    'Primary Incentive...',
    'Primary Disincentive...',
    'ROAM+: Land Use',
    'ROAM+: Category',
    'ROAM+: Intervention',
    'Enacted By Law',
    'Enacted By Exec.',
    'Voluntary/NDC',
    'Passed',
    'Active',
    'Executing Ministry',
    'Enforcement Mechanism',
]

# Replace each categorical column with its integer category code.
# The code -> label mapping would otherwise be lost once the column is
# overwritten, so it is kept per column: category_labels[col][code] gives back
# the original text for that code.
# Not idempotent: re-running this cell encodes the already-encoded integers.
category_labels = {}
for col in cat_cols:
    as_category = df[col].astype('category')
    category_labels[col] = list(as_category.cat.categories)
    df[col] = as_category.cat.codes

In [10]:
# Keyword vocabulary; each entry becomes one 0/1 indicator column of the
# feature matrix, in this order. Entries are laid out one initial letter per row.
# Strings are matched verbatim, so the trailing space in 'land ' is significant.
key_words = [
    'afforestation', 'agriculture', 'animal welfare', 'artificial regeneration',
    'biodiversity', 'biological resources', 'biome', 'board',
    'clean', 'coconut', 'conservation', 'control',
    'database',
    'enforcement', 'environment',
    'farm', 'financing', 'forest', 'forest protection', 'funding', 'fundraising',
    'land ', 'land use', 'landholder', 'law',
    'measuring', 'mobilization', 'monitor',
    'natural resources',
    'oversight',
    'plant breeders', 'plants', 'pollution', 'preservation', 'produce', 'protection',
    'qualification', 'quality',
    'registry', 'regulation', 'reporting', 'reserve', 'resource', 'restriction',
    'results-driven', 'rural',
    'safeguard', 'species', 'support', 'sustainable',
    'technical submission', 'threatened species',
    'variety', 'verification',
    'wastewater', 'water', 'watershed', 'wild life',
]

In [11]:
# Add one indicator column per vocabulary keyword: 1 when `word in` the row's
# 'Key Words' value is true, 0 otherwise.
for word in key_words:
    df[word] = [int(word in row_keywords) for row_keywords in df['Key Words']]

In [12]:
# The raw keyword column is no longer needed once the indicator columns exist.
df = df.drop(labels='Key Words', axis='columns')

In [13]:
# Display the resulting column labels (remaining original columns followed by
# the keyword indicator columns).
df.columns

Index(['Year', 'Country', 'Jurisdiction', 'Enacted By Law', 'Enacted By Exec.',
       'Voluntary/NDC', 'Passed', 'Active', 'Executing Ministry',
       'Enforcement Mechanism', 'Motivation', 'Primary Incentive...',
       'Primary Disincentive...', 'Bottom-up:\nCommunity Engagement',
       'Bottom-up:\nIndigenous Rights', 'Biome-Specific', 'ROAM+: Land Use',
       'ROAM+: Category', 'ROAM+: Intervention', 'Hectare Quantity',
       'afforestation', 'agriculture', 'animal welfare',
       'artificial regeneration', 'biodiversity', 'biological resources',
       'biome', 'board', 'clean', 'coconut', 'conservation', 'control',
       'database', 'enforcement', 'environment', 'farm', 'financing', 'forest',
       'forest protection', 'funding', 'fundraising', 'land ', 'land use',
       'landholder', 'law', 'measuring', 'mobilization', 'monitor',
       'natural resources', 'oversight', 'plant breeders', 'plants',
       'pollution', 'preservation', 'produce', 'protection', 'qualificati

## Predict the Primary Incentive

In [56]:
# Features: only the keyword indicator columns.
# (An earlier variant, since disabled, instead used every column except
# 'Motivation', 'Primary Incentive...', 'Primary Disincentive...', the two
# 'Bottom-up:' columns, 'Biome-Specific', the three 'ROAM+:' columns and
# 'Hectare Quantity'.)
X = df.loc[:, key_words]

# Target: the integer-coded 'Primary Incentive...' column.
y = df.loc[:, 'Primary Incentive...']

In [57]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

### SVM

In [58]:
from sklearn.svm import SVC
# Support-vector classifier with library defaults, fitted on the training split.
# fit() returns the estimator itself, so the last line still displays its repr.
svm = SVC().fit(X_train, y_train)
svm

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [59]:
from sklearn.metrics import accuracy_score
# Share of held-out rows whose incentive code the SVM predicts exactly.
accuracy_score(y_true=y_test, y_pred=svm.predict(X_test))

0.35714285714285715

### LDA

In [60]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis with library defaults, fitted on the training split.
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
lda



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [61]:
# Held-out accuracy of the LDA model.
accuracy_score(y_true=y_test, y_pred=lda.predict(X_test))

0.2857142857142857

### MLP (multi-layer perceptron)

In [62]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with two hidden layers (5 and 2 units), the L-BFGS
# solver and a fixed seed, fitted on the training split.
mlc = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
).fit(X_train, y_train)
mlc

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [63]:
# Held-out accuracy of the multi-layer perceptron.
accuracy_score(y_true=y_test, y_pred=mlc.predict(X_test))

0.35714285714285715

### Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with a fixed seed, fitted on the training split.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [65]:
# Held-out accuracy of the decision tree.
accuracy_score(y_true=y_test, y_pred=dt.predict(X_test))

0.2857142857142857

### Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(X_train, y_train)
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [67]:
# Held-out accuracy of the random forest.
accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))

0.21428571428571427

### KNN

In [68]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier, fitted on the training split.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [69]:
# Held-out accuracy of the nearest-neighbours model.
accuracy_score(y_true=y_test, y_pred=neigh.predict(X_test))

0.14285714285714285

### Train the final Random Forest model on the full dataset

In [70]:
# Refit a random forest (100 trees, fixed seed) on ALL rows, not just the
# training split. This rebinds `rf`: from here on `rf` has seen the test rows,
# so re-running the held-out accuracy cell afterwards would be misleading.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
).fit(X, y)
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [71]:
# Hand-built example input: one 0/1 flag per feature column (58 in total), laid
# out ten per row. The comments give the positions covered by each row.
try_me = np.array([
    0, 1, 0, 0, 1, 0, 0, 0, 0, 0,  # positions 0-9
    1, 1, 0, 0, 0, 1, 0, 1, 0, 0,  # positions 10-19
    0, 1, 1, 0, 1, 0, 0, 0, 0, 0,  # positions 20-29
    0, 0, 1, 0, 0, 0, 0, 0, 0, 1,  # positions 30-39
    0, 0, 1, 0, 0, 0, 0, 0, 1, 0,  # positions 40-49
    0, 0, 1, 0, 0, 1, 0, 0,        # positions 50-57
])

In [74]:
# Predict the primary incentive for the hand-built keyword vector.
# The vector is wrapped in a one-row DataFrame labelled with key_words so that
# its columns line up by name with the frame the forest was fitted on; a vector
# of the wrong length now fails here instead of being scored silently, and
# newer scikit-learn versions do not warn about missing feature names.
try_me_row = pd.DataFrame([try_me], columns=key_words)

# rf outputs an integer category code, which is used as a position in
# primary_incentives to recover the label text.
# NOTE(review): assumes primary_incentives lists the labels in the same order
# the codes were assigned — verify against the data.
primary_incentives[rf.predict(try_me_row)[0]]

'Legal'