# Import Statements

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import OCR_pipeline as OCR
import TopicModeling as TM
import jsonpickle
import gensim

# Cleaning And Saving Extracted Text

In [None]:
import mysql.connector

# Open the connection to the local `forestry` MySQL database and create the
# module-level cursor that getExtractedTextsFromCountries reads from.
connection_settings = {
    "host": "localhost",
    "user": "root",
    "passwd": "XXX",
    "database": "forestry",
}
mydb = mysql.connector.connect(**connection_settings)
mycursor = mydb.cursor()

def getExtractedTextsFromCountries(countries):
    """Fetch the extracted page texts of all policy papers from the given countries.

    Args:
        countries: Sequence of country names, matched exactly against
            ``PolicyPapers.country``.

    Returns:
        The fetched rows regrouped by ``combinePages``: one
        ``[filename, pages]`` entry per paper. An empty list when
        ``countries`` is empty.
    """
    if not countries:
        # An empty filter would otherwise build "... WHERE GROUP BY ...",
        # which is a SQL syntax error.
        return []

    # One %s placeholder per country. The driver binds the values, so a quote
    # inside a country name can neither break the statement nor inject SQL.
    placeholders = ", ".join(["%s"] * len(countries))
    selectClause = "SELECT P1.filename, P1.pid, P2.country, P1.text from PolicyPaperPages P1 " \
                   "INNER JOIN PolicyPapers P2 " \
                   "on P1.filename = P2.filename "
    whereClause = "WHERE P2.country IN (" + placeholders + ") "
    orderClause = "GROUP BY filename, pid ORDER BY filename, pid"
    sqlStatement = selectClause + whereClause + orderClause

    mycursor.execute(sqlStatement, tuple(countries))

    myresult = mycursor.fetchall()
    return combinePages(myresult)

def combinePages(myresult):
    """Group page rows into one ``[filename, pages]`` entry per file.

    Args:
        myresult: Rows whose first element is the filename and whose fourth
            element is the page text. Rows of the same file must be adjacent
            (the calling query orders by filename, pid).

    Returns:
        A list of ``[filename, [page_text, ...]]`` lists in the order the
        filenames first appear. Empty when ``myresult`` is empty.
    """
    file_texts = []
    for result in myresult:
        filename = result[0]
        # Open a new group whenever the filename changes (or on the first row).
        if not file_texts or file_texts[-1][0] != filename:
            file_texts.append([filename, []])
        # Always keep the row's text: the row that opens a group is itself a
        # page and must not be skipped.
        file_texts[-1][1].append(str(result[3]))
    return file_texts

In [None]:
# English keyword list, grouped here by initial letter for readability.
# It is the list registered under language_keywords['english'] and passed to
# TM.toSentences further down.
# NOTE(review): 'land ' carries a trailing space — presumably deliberate; confirm.
key_words = [
    'afforestation', 'agriculture', 'animal welfare', 'artificial regeneration',
    'biodiversity', 'biological resources', 'biome', 'board',
    'clean', 'coconut', 'conservation', 'control',
    'database',
    'enforcement', 'environment',
    'farm', 'financing', 'forest', 'forest protection', 'funding', 'fundraising',
    'land ', 'land use', 'landholder', 'law',
    'measuring', 'mobilization', 'monitor',
    'natural resources',
    'oversight',
    'plant breeders', 'plants', 'pollution', 'preservation', 'produce', 'protection',
    'qualification', 'quality',
    'registry', 'regulation', 'reporting', 'reserve', 'resource', 'restriction',
    'results-driven', 'rural',
    'safeguard', 'species', 'support', 'sustainable',
    'technical submission', 'threatened species',
    'variety', 'verification',
    'wastewater', 'water', 'watershed', 'wild life',
]

In [None]:
# Spanish keyword list; it appears to mirror key_words entry by entry.
# NOTE(review): 'Coco' and 'Produce' are capitalised, and 'recursos biologicos'
# and 'proteccion' lack accents — confirm whether the matching code is case-
# and accent-sensitive before relying on these entries.
spanish_key_words = [
    'repoblación forestal', 'agricultura', 'bienestar de los animales',
    'regeneración artificial', 'biodiversidad', 'recursos biologicos',
    'bioma', 'tablero', 'limpiar',
    'Coco', 'conservación', 'controlar',
    'base de datos', 'aplicación', 'ambiente',
    'granja', 'financiación', 'bosque',
    'protección forestal', 'fondos', 'recaudación de fondos',
    'tierra', 'uso del suelo', 'terrateniente',
    'ley', 'medición', 'movilización',
    'monitor', 'recursos naturales', 'vigilancia',
    'fitomejoradores', 'plantas', 'contaminación',
    'preservación', 'Produce', 'proteccion',
    'calificación', 'calidad', 'registro',
    'regulación', 'reportando', 'reserva',
    'recurso', 'restricción', 'impulsado por resultados',
    'rural', 'salvaguardia', 'especies',
    'apoyo', 'sostenible', 'sumisión técnica',
    'especies amenazadas', 'variedad', 'verificación',
    'aguas residuales', 'agua', 'cuenca',
    'fauna silvestre',
]

In [None]:
#dictionary for texts in terms of sentences
import re
import gc
# Maps each paper's filename to the value TM.toSentences returns for its text.
textDictionary = {}
countries = ['India']
policyPapers = getExtractedTextsFromCountries(countries)

# Keyword lists by language; only the English list is used in the loop below.
language_keywords = {'english': key_words, 'spanish': spanish_key_words}

for filename, pages in policyPapers:
    # Every entry is [filename, pages]; the pages are joined into one string
    # before being handed to the cleaner.
    textDictionary[filename] = TM.toSentences(
        ' '.join(pages), 'english', language_keywords['english']
    )

# LDA

In [None]:
from pprint import pprint
# Pool the cleaned text of every paper into one flat list for topic modelling.
lemmatized_sents = [
    sentence
    for filename in textDictionary
    for sentence in textDictionary[filename]
]

# TM.LDA returns the corpus together with the model; print the model's topics.
corpus, lda_model = TM.LDA(lemmatized_sents)
pprint(lda_model.print_topics(-1))
doc_lda = lda_model[corpus]
#Next Step: Store LDA data in Policy Paper Object & other central file

In [None]:
# Save LDA
# lda_model.save("lda_model")

In [None]:
# Load LDA
# Reads the model stored under the name "lda_model". The matching save call in
# the "Save LDA" cell is commented out, so the file presumably has to exist
# from an earlier run.
lda_saved = gensim.models.ldamodel.LdaModel.load("lda_model")

In [None]:
# Display the topics of the model that was loaded from disk.
lda_saved.print_topics(-1)

In [None]:
# Keep the printed topics in `output` (reused by a later cell) and look at the
# first entry: element [1] is its term string, split here on "+".
output = lda_model.print_topics(-1)
test = output[0][1]
test.split("+")

In [None]:
import operator

def top_three_topics(lda_model, text):
    """Return up to three of the most probable topics for a piece of text.

    Args:
        lda_model: Trained gensim LDA model. Its ``id2word`` attribute must be
            the gensim ``Dictionary`` the model was trained with.
        text: Raw text, for example one page of a policy paper.

    Returns:
        A list of at most three ``(topic_id, probability)`` tuples, most
        probable first. The list is shorter (possibly empty) when the model
        reports fewer than three topics for the text.
    """
    tokens = simple_preprocess(text)
    # Token ids must come from the model's own vocabulary. A dictionary built
    # from `text` alone numbers its tokens independently, so those ids would
    # refer to unrelated words of the model.
    bow = lda_model.id2word.doc2bow(tokens)

    document_topics = lda_model.get_document_topics(bow)
    # Stable descending sort: topics with equal probability keep the order in
    # which the model returned them.
    ranked = sorted(document_topics, key=operator.itemgetter(1), reverse=True)
    return ranked[:3]

In [None]:
# Human-readable label for each LDA topic id; a label's position in this list
# is its topic id.
# NOTE(review): the labels presumably describe one particular trained model
# and need revisiting after retraining — confirm.
_topic_labels = [
    "Market",
    "Government",
    "Climate Change",
    "Forestry Policy",
    "Land/Water Management",
    "Organization",
    "States",
    "Area",
    "Crops",
    "Forest Conservation",
    "Legal",
    "Forest",
    "Project/Mission",
    "Agriculture",
    "Mining Regulation",
]
topic_map = dict(enumerate(_topic_labels))

def insertPaperTopics(mycursor, filename, page_topics):
    """Insert one PaperTopics row per (page, topic) pair of a paper, then commit.

    Args:
        mycursor: Cursor used to execute the INSERT statements.
        filename: Value stored in the ``filename`` column of every row.
        page_topics: Mapping of page id to an iterable of topic values.

    The commit is issued on the module-level ``mydb`` connection.
    """
    sql_insert = "INSERT INTO PaperTopics (filename, pid, topic) VALUES (%s, %s, %s)"
    # Lazily flatten the mapping into (filename, pid, topic) rows.
    rows = (
        (filename, pid, topic)
        for pid, page_labels in page_topics.items()
        for topic in page_labels
    )
    for row in rows:
        mycursor.execute(sql_insert, row)
    mydb.commit()
    
def inDatabase(mycursor, filename):
    """Return the PaperTopics rows already stored for ``filename``.

    Args:
        mycursor: Cursor on which the lookup is executed.
        filename: File name to look for.

    Returns:
        The result of ``mycursor.fetchall()`` for the lookup. Callers use its
        truthiness: an empty result means the file has no rows yet.
    """
    # Bind the filename as a parameter instead of formatting it into the SQL:
    # a quote character in a file name would otherwise break the statement or
    # allow injection.
    mycursor.execute("SELECT filename FROM PaperTopics WHERE filename=%s", (filename,))
    return mycursor.fetchall()

In [None]:
# Preparation cell for get_document topics. Preprocessing
from gensim.corpora.dictionary import Dictionary
from gensim.utils import simple_preprocess, simple_tokenize

# Label every page of each paper that has no PaperTopics rows yet with the
# names of its top three topics.
for doc in policyPapers:
    filename, pages = doc
    # Use a dedicated cursor. Rebinding and closing the module-level
    # `mycursor` here would leave getExtractedTextsFromCountries with a
    # closed cursor on its next call.
    topic_cursor = mydb.cursor()
    try:
        if not inDatabase(topic_cursor, filename):
            page_topics = {}
            # Page ids are the 1-based positions of the pages within the paper.
            for page_number, page_text in enumerate(pages, start=1):
                topics = top_three_topics(lda_model, page_text)
                page_topics[page_number] = [topic_map[t[0]] for t in topics]
            insertPaperTopics(topic_cursor, filename, page_topics)
    finally:
        # Release the cursor even when topic inference or the insert raises.
        topic_cursor.close()

In [None]:
# Retrieve the three topics for this specific document
# NOTE(review): `top_three` is not assigned at notebook level in any visible
# cell (it is only a local variable inside top_three_topics), so this cell
# presumably depends on leftover kernel state — confirm or assign it first.
# `output` comes from the earlier `output = lda_model.print_topics(-1)` cell.
topics = []
for item in top_three:
    topics.append(output[item[0]])
topics

In [None]:
# Term string of the first printed topic; the next cell pulls the words out of it.
my = lda_model.print_topics()[0][1]

In [None]:
# Keep only the runs of ASCII letters in `my`, dropping digits and punctuation.
re.findall(r"[a-zA-Z]+",my)

## Extract sentences by Keywords

### TODO: Have a feature on the web app so that the user can extract text using keywords

In [None]:
#Try with the first document
# policyPapers entries are [filename, pages], so [0][1] is the list of page
# texts of the first paper.
policy_0 = policyPapers[0][1]
sentences = TM.ReturnSentence(policy_0)

In [None]:
#This can be used as a preview feature
# Print every extracted sentence that contains the substring 'land'.
for s in filter(lambda sentence: 'land' in sentence, sentences):
    print(s)

## Use Key Words to classify documents

In [None]:
# Keyword list for the keyword-based classification section. It repeats the
# entries of the `key_words` cell near the top of the notebook.
key_words = [
    'afforestation', 'agriculture', 'animal welfare', 'artificial regeneration',
    'biodiversity', 'biological resources', 'biome', 'board',
    'clean', 'coconut', 'conservation', 'control',
    'database', 'enforcement', 'environment', 'farm',
    'financing', 'forest', 'forest protection', 'funding',
    'fundraising', 'land ', 'land use', 'landholder',
    'law', 'measuring', 'mobilization', 'monitor',
    'natural resources', 'oversight', 'plant breeders', 'plants',
    'pollution', 'preservation', 'produce', 'protection',
    'qualification', 'quality', 'registry', 'regulation',
    'reporting', 'reserve', 'resource', 'restriction',
    'results-driven', 'rural', 'safeguard', 'species',
    'support', 'sustainable', 'technical submission', 'threatened species',
    'variety', 'verification', 'wastewater', 'water',
    'watershed', 'wild life',
]

In [None]:
# NOTE(review): `cleanWhitespaces` is not defined in any visible cell —
# presumably a leftover from an earlier version of the notebook; confirm.
'artificial regeneration' in cleanWhitespaces

# Visualizations

### pyLDAvis

`pip install pyldavis`

Code source: https://markroxor.github.io/gensim/static/notebooks/gensim_news_classification.html
<br>
The area of each circle represents the prevalence of its topic. The length of each bar on the right represents the membership of a term in a particular topic.

In [None]:
import pyLDAvis.gensim
# Enable inline rendering, then build the visualisation from the trained
# model, its corpus and the model's own id2word dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
# p = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
# pyLDAvis.save_html(p, 'pyLDAvis.html')

### WordCloud

`pip install wordcloud`

Code source: https://medium.com/@rudyb2001/making-sense-of-the-news-using-machine-learning-e13ed5f96cf0
<br>
WordCloud of Top 10 words in each topic with the sizes of the words proportional to the weights.

In [None]:
# Number of topics show_topics returns with its default arguments; the grid
# below indexes into this same list, one subplot per entry.
len(lda_model.show_topics(formatted=False))

In [None]:
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
from wordcloud import WordCloud, STOPWORDS
# One colour per subplot, taken from matplotlib's Tableau palette.
cols = list(mcolors.TABLEAU_COLORS.values())

# A single WordCloud object is reused for every topic. Its colour callback
# looks up the loop index `i` when it is called, so each topic is drawn in
# cols[i].
cloud = WordCloud(
    background_color='white',
    width=2500,
    height=1800,
    max_words=10,
    colormap='tab10',
    color_func=lambda *args, **kwargs: cols[i],
    prefer_horizontal=1.0,
)

topics = lda_model.show_topics(formatted=False)

# 2 x 5 grid: one panel for each of the first ten entries of `topics`.
fig, axes = plt.subplots(2, 5, figsize=(15,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    panel = plt.gca()
    # Element [1] of a topic entry holds its (word, weight) pairs.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    panel.imshow(cloud)
    panel.set_title(f'Topic {i}', fontdict={'size': 16})
    panel.axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

# Storage