# This notebook is to predict a Policy Paper's Incentives/Disincentives/Motivations based on keyword frequencies. 

## Current Supported Languages: English, Spanish
## Training Data: India(English), Mexico(Spanish)
## Best Model: Random Forest

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import OCR_pipeline as OCR
import TopicModeling as TM
import jsonpickle
import pandas as pd
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/catharinewu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import mysql.connector

mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="XXX", #Please replace this with your own mySQL password
  database="forestry"
)
mycursor = mydb.cursor()

def getExtractedTexts():
    mycursor.execute("SELECT filename, extractedText FROM PolicyPapers")
    myresult = mycursor.fetchall()
    return [x for x in myresult]

def getExtractedTextsFromCountries(countries):
    sqlStatement = "SELECT filename, extractedText FROM PolicyPapers WHERE"
    
    for i in range(len(countries)):
        whereClause = " country = '%s' " % countries[i]
        sqlStatement += whereClause
        if len(countries) != 1 and i < len(countries) - 1:
            sqlStatement += "or"
            
    mycursor.execute(sqlStatement)
    myresult = mycursor.fetchall()
    return [x for x in myresult]

In [4]:
#dictionary for texts in terms of sentences
import re
import gc
textDictionary = {}

India = getExtractedTextsFromCountries(['India'])
Brazil = getExtractedTextsFromCountries(['Brazil'])
Mexico = getExtractedTextsFromCountries(['Mexico'])

IndiaPapers = {}
BrazilPapers = {}
MexicoPapers = {}

for paper in India:
    IndiaPapers.update({paper[0].replace('.pdf','') : re.sub(r'(.) ', r'\1', paper[1])})

for paper in Brazil:
    BrazilPapers.update({paper[0].replace('.pdf','') : re.sub(r'(.) ', r'\1', paper[1])})

for paper in Mexico:
    MexicoPapers.update({paper[0].replace('.pdf','') : re.sub(r'(.) ', r'\1', paper[1])})


In [5]:
India_df = pd.Series(IndiaPapers).to_frame()
Brazil_df = pd.Series(BrazilPapers).to_frame()
Mexico_df = pd.Series(MexicoPapers).to_frame()

# English(India)

In [6]:
Eng_key_words = ['afforestation','agriculture','animal welfare','artificial regeneration','biodiversity',
             'biological resources','biome','board','clean','coconut','conservation','control','database',
             'enforcement','environment','farm','financing','forest','forest protection','funding','fundraising',
             'land ','land use','landholder','law','measuring','mobilization','monitor','natural resources',
             'oversight','plant breeders','plants','pollution','preservation','produce','protection','qualification',
             'quality','registry','regulation','reporting','reserve','resource','restriction','results-driven',
             'rural','safeguard','species' ,'support','sustainable','technical submission','threatened species',
             'variety','verification','wastewater','water','watershed','wild life']

In [7]:
#Add keyword columns with each entry its frequency
india_freq = []
for word in Eng_key_words:
    India_df[word] = np.zeros(len(India_df))
    for i in range(len(India_df)):
        India_df[word][i] = India_df[0][i].count(word)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
India_test = India_df.loc['_MineralConservationandDevelopmentRules1988-converted',:]
India_df = India_df.drop(index=['_MineralConservationandDevelopmentRules1988-converted'])

In [9]:
df = pd.read_csv('Sample Data Framework - Policy Database.csv')
df = df.drop(index=0)

In [10]:
just_india = df[df['Country'] == 'IN'][['Policy Title','Primary Incentive...','Primary Disincentive...','Motivation']]

In [11]:
india_reordered = just_india.iloc[[7,17,9,4,2,3,0,20,1,12,14,8,18,19,13,5,11,15,16],:]

In [12]:
india_reordered = india_reordered.reset_index().drop(columns = 'index')

In [13]:
India_df = India_df[Eng_key_words].reset_index().drop(columns='index')

## Predict Incentives

In [14]:
incentives = ['Diplomatic','Financial-grants','Financial-subsidies','Fianncial-trade','Legal']

In [15]:
X = India_df
y = india_reordered['Primary Incentive...'].astype('category').cat.codes

In [16]:
#Remove NaNs
X = X.drop(index=y[y==-1].index)
y = y.drop(index=y[y==-1].index)

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict the incentive of 'Mineral Conservation and Development Rules 1988-converted'

In [18]:
incentives[rf.predict(India_test.drop(0).values.reshape(1,-1))[0]]

'Diplomatic'

## Predict Disincentives

In [19]:
disincentives = ['Financial-fines', 'Imprisonment and Fines', 'Procesures/Guidelines']

In [20]:
X = India_df
y = india_reordered['Primary Disincentive...'].astype('category').cat.codes

In [21]:
#Remove NaNs
X = X.drop(index=y[y==-1].index)
y = y.drop(index=y[y==-1].index)

In [22]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict the disincentive of 'Mineral Conservation and Development Rules 1988-converted'

In [23]:
disincentives[rf.predict(India_test.drop(0).values.reshape(1,-1))[0]]

'Imprisonment and Fines'

## Predict Motivations

In [24]:
motivations = ['Agricultural Development','Climate Change Action','Conservation','International Agreement/Conference','Previous Supreme Court Ruling']

In [25]:
X = India_df
y = india_reordered['Motivation'].astype('category').cat.codes

In [26]:
#Remove NaNs
X = X.drop(index=y[y==-1].index)
y = y.drop(index=y[y==-1].index)

In [27]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict the motivation of 'Mineral Conservation and Development Rules 1988-converted'

In [28]:
motivations[rf.predict(India_test.drop(0).values.reshape(1,-1))[0]]

'Conservation'

# Spanish(Mexico)

In [29]:
spanish_key_words = ['repoblación forestal', 'agricultura', 'bienestar de los animales', 'regeneración artificial', 
                     'biodiversidad', 'recursos biologicos', 'bioma', 'tablero', 'limpiar', 'Coco', 'conservación',
                     'controlar', 'base de datos', 'aplicación', 'ambiente', 'granja', 'financiación', 'bosque', 
                     'protección forestal', 'fondos', 'recaudación de fondos', 'tierra', 'uso del suelo', 
                     'terrateniente', 'ley', 'medición', 'movilización', 'monitor', 'recursos naturales', 'vigilancia',
                     'fitomejoradores', 'plantas', 'contaminación', 'preservación', 'Produce', 'proteccion',
                     'calificación', 'calidad', 'registro', 'regulación', 'reportando', 'reserva', 'recurso', 
                     'restricción', 'impulsado por resultados', 'rural', 'salvaguardia', 'especies', 'apoyo', 
                     'sostenible', 'sumisión técnica', 'especies amenazadas', 'variedad', 'verificación', 
                     'aguas residuales', 'agua', 'cuenca', 'fauna silvestre']

In [30]:
#Add keyword columns with each entry its frequency
mexico_freq = []
for word in spanish_key_words:
    Mexico_df[word] = np.zeros(len(Mexico_df))
    for i in range(len(Mexico_df)):
        Mexico_df[word][i] = Mexico_df[0][i].count(word)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
Mexico_test = Mexico_df.loc['Mexico Forestry Law',:]
Mexico_df = Mexico_df.drop(index=['Mexico Forestry Law'])

In [32]:
just_mexico = df[df['Country'] == 'MX'][['Policy Title','Primary Incentive...','Primary Disincentive...','Motivation']].reset_index().drop(columns=['index'])

In [33]:
mexico_reordered = just_mexico.iloc[[6,10,16,5,13,21,17,8,22,2,1,12,9,20,14,23,15,19,7,3,4,18,24,11],:]

In [34]:
mexico_reordered = mexico_reordered.reset_index().drop(columns = 'index')

In [35]:
Mexico_df = Mexico_df[spanish_key_words].reset_index().drop(columns='index')

## Predict Incentives

In [36]:
incentives = ['Diplomatic','Financial-grants','Financial-subsidies','Fianncial-tax break','Legal','Political']

In [37]:
X = Mexico_df
y = mexico_reordered['Primary Incentive...'].astype('category').cat.codes

In [38]:
#Remove NaNs
X = X.drop(index=y[y==-1].index)
y = y.drop(index=y[y==-1].index)

In [39]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict the incentive of 'Mexico Forestry Law'

In [40]:
incentives[rf.predict(Mexico_test.drop(0).values.reshape(1,-1))[0]]

'Legal'

## Predict Disincentives

In [41]:
disincentives = ['Financial-fines', 'Financial-fines, Legal, Imprisonment and Fines', 'Financial-fines, Legal, Political', 'Procedures/Guidelines']

In [42]:
X = Mexico_df
y = mexico_reordered['Primary Disincentive...'].astype('category').cat.codes

In [43]:
#Remove NaNs
X = X.drop(index=y[y==-1].index)
y = y.drop(index=y[y==-1].index)

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict the disincentive of 'Mexico Forestry Law'

In [45]:
disincentives[rf.predict(Mexico_test.drop(0).values.reshape(1,-1))[0]]

'Procedures/Guidelines'

## Predict Motivations

In [46]:
motivations = ['Agricultural Development','Climate Change Action','Conservation','International Agreement/Conference']

In [47]:
X = Mexico_df
y = mexico_reordered['Motivation'].astype('category').cat.codes

In [48]:
#Remove NaNs
X = X.drop(index=y[y==-1].index)
y = y.drop(index=y[y==-1].index)

In [49]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict the motivation of 'Mexico Forestry Law'

In [50]:
motivations[rf.predict(Mexico_test.drop(0).values.reshape(1,-1))[0]]

'Agricultural Development'