In [1]:
import pandas as pd
import numpy as np

from gensim import corpora, models
import ast

import plotly.express as px
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()



In [42]:
lda = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Full-TMs/Models/tm_09')

topicsDict = {}
for topicNum in range(lda.num_topics):
    topicWords = [w for w, p in lda.show_topic(topicNum)]
    topicsDict['Topic_{}'.format(topicNum)] = topicWords

pd.DataFrame(topicsDict)

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8
0,energy,think,community,food,state,oil,policy,city,climate
1,solar,know,work,farmer,water,energy,economy,car,climate_change
2,power,thing,tree,farm,bill,company,economic,green,emission
3,wind,go,water,organic,environmental,price,job,bike,carbon
4,electricity,way,food,eat,epa,gas,need,home,percent
5,cost,world,local,crop,coal,industry,cost,day,global_warming
6,technology,right,grow,agriculture,obama,china,market,get,country
7,plant,ve,good,meat,president,coal,money,good,report
8,use,good,way,corn,republican,percent,system,go,scientist
9,coal,get,plant,grow,group,world,tax,work,greenhouse_gas


In [37]:
env = pd.read_csv('../Data/Environmental Discourse/env_processed_text.csv', index_col=0)

In [38]:
env['tokens']= env.text_processed.apply(lambda x: x.split())

In [39]:
dictionary = corpora.Dictionary.load('../Data/Environmental Discourse/Full-TMs/dictionary')
#dictionary = corpora.Dictionary([i for i in env.tokens])

In [43]:
env['lda_topics'] = [lda[dictionary.doc2bow(l)] for l in env['tokens']]

#Dict to temporally hold the probabilities
topicsProbDict = {i : [0] * len(env) for i in range(lda.num_topics)}

#Load them into the dict
for index, topicTuples in enumerate(env['lda_topics']):
    for topicNum, prob in topicTuples:
        topicsProbDict[topicNum][index] = prob

#Update the DataFrame
for topicNum in range(lda.num_topics):
    env['topic_{}'.format(topicNum)] = topicsProbDict[topicNum]

KeyboardInterrupt: 

In [None]:
env['date'] = pd.to_datetime(env.date)
env['year'] = env.date.dt.year

In [None]:
labels_09 = ['renewables',
             'knowledge',
             'community',
             'food',
             'politics',
             'fossil fuels',
             'economy',
             'urbanism',
             'climate change']
env.columns = ['url', 'title', 'date', 'text_processed', 'tokens', 'lda_topics'] + labels_09 + ['year']

labels_dic = {'topic_{}'.format(i):labels_09[i] for i in range(9)}

px.line(env.groupby('year').mean(), labels=labels_dic)

In [31]:
env.to_pickle('../Data/Environmental Discourse/env_tm09_loadings.pkl')

In [41]:
for topic in labels_09:
    print('Topic:', topic)
    titles = env.sort_values(by=topic, ascending=False).title.head(3)
    for title in titles:
        print(' - ', title.strip())
    print('')

Topic: renewables
 -  How (Not) to Run a Modern Society on Solar and Wind Power Alone
 -  Enabling wind, sun to be our main power supplies
 -  Enabling Wind, Sun To Be Our Main Power Supplies: Quest for Storage — “Holy Grail” of New Energy Economy — Nears Goal

Topic: knowledge
 -  Human Nature
 -  About that novel
 -  Review: “Let Us Be Human: Christianity for a Collapsing Culture”

Topic: community
 -  Creating a cottage garden
 -  Transition Essentials: No.1 – Food
 -  Sticking to the plot: A celebration of permaculture allotment projects

Topic: food
 -  U.K. organic milk better for you than conventional, thanks to cows' grass-based diet
 -  While global GMO acreage surges, herbicide-resistent weeds thrive
 -  For first time, GM soybeans may be losing favor among farmers

Topic: politics
 -  Semana del 27/5/12
 -  Semana del 19/11/2006
 -  Samana del 12/11/06

Topic: fossil fuels
 -  Trends in world oil supply/consumption and net exports/imports
 -  What the new 2011 EIA oil supply

In [20]:
lda_07_04 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_07_04')
lda_07_06 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_07_06')
lda_07_08 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_07_08')
lda_07_10 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_07_10')

lda_13_04 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_13_04')
lda_13_06 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_13_06')
lda_13_08 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_13_08')
lda_13_10 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_13_10')

lda_19_04 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_19_04')
lda_19_06 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_19_06')
lda_19_08 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_19_08')
lda_19_10 = models.ldamodel.LdaModel.load('../Data/Environmental Discourse/Single-Year-TMs/Models/tm_19_10')

In [3]:
dictionary = corpora.Dictionary.load('../Data/Environmental Discourse/Single-Year-TMs/dictionary')

In [4]:
corpus = corpora.MmCorpus('../Data/Environmental Discourse/Single-Year-TMs/bow_corpus.mm')

In [5]:
p = pyLDAvis.gensim_models.prepare(lda_07_04, corpus, dictionary)
p

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [9]:
pyLDAvis.gensim_models.prepare(lda_07_06, corpus, dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [10]:
pyLDAvis.gensim_models.prepare(lda_07_08, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [11]:
pyLDAvis.gensim_models.prepare(lda_07_10, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [12]:
pyLDAvis.gensim_models.prepare(lda_13_04, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [21]:
pyLDAvis.gensim_models.prepare(lda_13_06, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [15]:
pyLDAvis.gensim_models.prepare(lda_13_08, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [14]:
pyLDAvis.gensim_models.prepare(lda_13_10, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [16]:
pyLDAvis.gensim_models.prepare(lda_19_04, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [22]:
pyLDAvis.gensim_models.prepare(lda_19_06, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [18]:
pyLDAvis.gensim_models.prepare(lda_19_08, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [19]:
pyLDAvis.gensim_models.prepare(lda_19_10, corpus, dictionary)

  default_term_info = default_term_info.sort_values(


In [20]:
q = pyLDAvis.gensim_models.prepare(lda_07_08, corpus, dictionary)
q

  default_term_info = default_term_info.sort_values(


In [16]:
p.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.011685,-0.012273,1,1,33.756711
4,-0.002989,0.042494,2,1,21.156279
2,-0.055069,-0.051953,3,1,19.308472
1,0.07258,0.014091,4,1,10.623029
3,0.030479,-0.036484,5,1,8.302211
5,-0.056687,0.044124,6,1,6.853298


In [21]:
q.topic_coordinates

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,-0.009587,-0.029304,1,1,27.538426
7,0.03087,0.039386,2,1,23.67765
4,-0.024889,0.031667,3,1,11.095867
1,-0.081875,0.02658,4,1,8.957132
2,0.050141,0.024891,5,1,8.285369
3,-0.007852,0.003238,6,1,7.905693
0,0.06127,-0.018889,7,1,7.523246
6,-0.018079,-0.077568,8,1,5.016617


In [25]:
coh = pd.read_pickle('../Data/Environmental Discourse/Single-Year-TMs/coherence_scores.pkl')
coh['year'] = [2007] * 4 + [2013] * 4 + [2019] * 4
coh['topics'] = [4, 6, 8, 10] * 3

In [59]:
coh['year_str'] = coh.year.apply(str)

fig = px.scatter(coh, x='topics', y='coherence', color='year_str',
              labels={'coherence':'Coherence Score'},
        width=800)

fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20),
    font_family='Times New Roman',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis = dict(
        tickmode='linear',
        tick0=4,
        dtick=2,
        showline=True,
        mirror=True,
        linecolor='black'
    ),
    yaxis = dict(
        showline=True,
        mirror=True,
        linecolor='black'
        
    )
)

### <font color=blue> Setting up for dtm

In [10]:
import pandas as pd
from gensim import corpora, models
import ast

In [16]:
path = 'Environmental Discourse'

In [None]:
env = pd.read_csv('../Data/'+path+'/env_processed_tokens.csv', 
                  index_col=0, 
                  converters={'tokens': ast.literal_eval})
env['date'] = pd.to_datetime(env.date)
env['year'] = env.date.dt.year

In [11]:
dictionary = corpora.Dictionary.load('../Data/Environmental Discourse/Full-TMs/dictionary')
corpus = corpora.MmCorpus('../Data/Environmental Discourse/Single-Year-TMs/bow_corpus.mm')

In [19]:
docs_per_time_slice = list(env.groupby('year').agg({'year':'count'}).year)

In [20]:
ldaseq = models.ldaseqmodel.LdaSeqModel(corpus=corpus,
                                        id2word=dictionary, 
                                        time_slice=docs_per_time_slice, 
                                        num_topics=9)

[2405,
 3130,
 4054,
 4645,
 3803,
 3672,
 4037,
 3834,
 3421,
 2564,
 2370,
 1777,
 1313,
 1235,
 1432,
 1661,
 1472]

In [None]:
model.save('../Data/' + path + '/Full-TMs/Models/dtm_09')

In [14]:
data = pd.DataFrame({'genres':['Drama', 'Horror,Thriller', 'Documentary', 'Drama', 'Documentary']})

In [4]:
Documentary = data.genres.apply(lambda x: int('Documentary' in x))
Documentary

0    0
1    0
2    1
3    0
4    1
Name: genres, dtype: int64

In [10]:
data = data.iloc[0:1, :]

In [13]:
def return_series():
    return pd.Series(['a', 'b', 'c'])

In [15]:
data[['a', 'b','e']] = data.genres.apply(return_series)

TypeError: return_series() takes 0 positional arguments but 1 was given

In [26]:
movie = json.loads(movie.text, strict=False)

In [27]:
movie.get('Rated')

'N/A'

In [30]:
movie.get('BoxOffice')

'N/A'

In [22]:
import requests
import json

In [35]:
def get_movie_data(imdb_id, api_key):
    '''
    Gets movie data for one movie via OMDb API, using the movie's IMDb ID #,

    Note that omdb api key needs to be filled above in order for this to work.

    '''
    movie = requests.get('http://www.omdbapi.com/?i=' + imdb_id + '&plot=full&apikey={}'.format(api_key), timeout=None)
    movie = json.loads(movie.text, strict=False)

    # Get Desired Data entries from OMDb dictionary:
    rated = movie.get('Rated')
    plot = movie.get('Plot')
    metascore = movie.get('Metascore')
    box_office = movie.get('BoxOffice')
    awards = movie.get('Awards')

    # Return Entries as a Series to be added as new DataFrame rows
    return pd.Series([rated, plot, metascore, box_office, awards])

In [25]:
imdb_id = 'tt0170651'
api_key = '8d66f32a'

In [34]:
get_movie_data(imdb_id)

0    N/A
1    N/A
2    N/A
3    N/A
4    N/A
dtype: object

In [24]:
dictionary.num_pos

20256820

In [29]:
dictionary.token2id

{'able': 0,
 'act': 1,
 'action': 2,
 'actual': 3,
 'advantage': 4,
 'affected': 5,
 'ago': 6,
 'agreement': 7,
 'agricultural': 8,
 'ambitious': 9,
 'area': 10,
 'aspect': 11,
 'audience': 12,
 'aviation': 13,
 'awareness': 14,
 'away': 15,
 'barely': 16,
 'baseline': 17,
 'basically': 18,
 'beautiful': 19,
 'benefit': 20,
 'billion': 21,
 'biodiversity': 22,
 'biomass': 23,
 'bold': 24,
 'brag': 25,
 'brand': 26,
 'bring': 27,
 'build': 28,
 'burning': 29,
 'business': 30,
 'busy': 31,
 'call': 32,
 'care': 33,
 'case': 34,
 'catch': 35,
 'cc': 36,
 'change': 37,
 'choose': 38,
 'claim': 39,
 'clean_energy': 40,
 'clear': 41,
 'climate': 42,
 'climate_crisis': 43,
 'coal': 44,
 'commitment': 45,
 'common': 46,
 'communication': 47,
 'company': 48,
 'compensate': 49,
 'completely': 50,
 'condition': 51,
 'content': 52,
 'continue': 53,
 'convince': 54,
 'costume': 55,
 'course': 56,
 'create': 57,
 'credit': 58,
 'crisis': 59,
 'crucial': 60,
 'currently': 61,
 'cut': 62,
 'direction'