# Import data

In [1]:
# Libraries for text manipulation
import gensim
import string
from nltk.corpus import stopwords
# The stop-word list has to be downloaded the first time this runs
import nltk
nltk.download('stopwords')

from gensim import corpora, models


# Mount Google Drive at /content/drive (Colab only); the dataset is read from there
from google.colab import drive
drive.mount('/content/drive')
import gc


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import io
import pandas as pd

# Read the news-headlines CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    '/content/drive/My Drive/Jupyter COLAB/MC886/Assigment 3/news_headlines.csv',
    encoding='utf-8',
)

In [3]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030303,unhooked brakes to blame for taiwan train disa...
1,20030918,oldest prisoner in tas released citing health
2,20030913,nine reportedly dead in portuguese plane crash
3,20031031,nurses welcome medicare rebate plan
4,20030930,un cuts its iraq staff


# First, let's create Day, Month and Year variables

In [0]:
# `publish_date` holds values shaped like YYYYMMDD (e.g. 20030303); keep a
# string copy so the components can be sliced out by position.
df['str_publish_date'] = df['publish_date'].astype(str)

# Slice with the vectorized `.str` accessor rather than a per-row `apply`,
# then cast each component to an integer column.
df['year'] = df['str_publish_date'].str[:4].astype(int)
df['month'] = df['str_publish_date'].str[4:6].astype(int)
df['day'] = df['str_publish_date'].str[6:8].astype(int)

In [5]:
df.head()

Unnamed: 0,publish_date,headline_text,str_publish_date,year,month,day
0,20030303,unhooked brakes to blame for taiwan train disa...,20030303,2003,3,3
1,20030918,oldest prisoner in tas released citing health,20030918,2003,9,18
2,20030913,nine reportedly dead in portuguese plane crash,20030913,2003,9,13
3,20031031,nurses welcome medicare rebate plan,20031031,2003,10,31
4,20030930,un cuts its iraq staff,20030930,2003,9,30


# How many headlines do we have per year?

In [6]:
# Number of headlines per year (`day` is only the column being counted).
df.groupby('year')['day'].count().reset_index(name='count')

Unnamed: 0,year,count
0,2003,59343
1,2004,65975
2,2005,66320
3,2006,61568
4,2007,69431
5,2008,71591
6,2009,68867
7,2010,67715
8,2011,69919
9,2012,78547


# Let's apply some text preprocessing

- lower case
- remove punctuation
- remove stopwords
- lemmatize
- stem

In [0]:
# First, convert every headline to lower case
# (this overwrites the `headline_text` column in place).

df['headline_text'] = df['headline_text'].str.lower()

In [0]:
# Now remove all punctuation characters.
#
# `str.translate` deletes every character of `string.punctuation` literally.
# A regex-based `str.replace('[...]', '')` is fragile here: pandas >= 2.0
# treats the pattern as a plain string unless `regex=True` is passed (so
# nothing would be removed), and the unescaped character class reads `\]` as
# an escaped bracket, which leaves backslashes in the text.

df['headline_text'] = df['headline_text'].str.translate(
    str.maketrans('', '', string.punctuation)
)

In [0]:
# Remove stop words: tokenize each headline, drop the stop words and join the
# remaining tokens back into a single string.

stop_words = stopwords.words('english')

def tokenize(string_series):
  """Split a text into a list of tokens using gensim's tokenizer.

  Despite the parameter name, the callers in this notebook pass one headline
  string at a time (through `Series.apply`).
  """
  return [token for token in gensim.utils.tokenize(string_series)]

def join_tokens(tokens):
  """Return the tokens concatenated into one string, separated by single spaces."""
  separator = ' '
  return separator.join(tokens)

def remove_sw(string_series, stopwords):
  """Return the text with every token contained in `stopwords` removed.

  The text is tokenized with `tokenize`, filtered, and re-assembled with
  `join_tokens` (single spaces between the surviving tokens).
  """
  kept = []
  for word in tokenize(string_series):
    if word not in stopwords:
      kept.append(word)
  return join_tokens(kept)

In [0]:
# Strip the stop words from every headline and store the result in a new column

df['new_headline_text'] = df['headline_text'].apply(
    lambda headline: remove_sw(headline, stop_words)
)

In [11]:
df.head()

Unnamed: 0,publish_date,headline_text,str_publish_date,year,month,day,new_headline_text
0,20030303,unhooked brakes to blame for taiwan train disa...,20030303,2003,3,3,unhooked brakes blame taiwan train disaster
1,20030918,oldest prisoner in tas released citing health,20030918,2003,9,18,oldest prisoner tas released citing health
2,20030913,nine reportedly dead in portuguese plane crash,20030913,2003,9,13,nine reportedly dead portuguese plane crash
3,20031031,nurses welcome medicare rebate plan,20031031,2003,10,31,nurses welcome medicare rebate plan
4,20030930,un cuts its iraq staff,20030930,2003,9,30,un cuts iraq staff


In [12]:
# Lemmatize and then stem the data

# The functions defined below extend `remove_sw` with lemmatization and stemming

from nltk.stem import WordNetLemmatizer, PorterStemmer,SnowballStemmer
  
# WordNetLemmatizer needs the WordNet corpus to be available locally
nltk.download('wordnet')
  
lemmatizer = WordNetLemmatizer()
stemmer1 = PorterStemmer()  # NOTE(review): not used in the visible part of the notebook
stemmer2 = SnowballStemmer('english')

def text_cleaner(string_series,stopwords):
  """Clean one text and return it as a single space-separated string.

  The cleaning steps (tokenize, drop tokens found in `stopwords`, lemmatize,
  stem) are performed by `tokens_cleaner`; this function only joins the
  resulting tokens. Delegating keeps the string form and the token-list form
  produced by one shared pipeline instead of two copies that could drift.
  """
  return join_tokens(tokens_cleaner(string_series, stopwords))

def tokens_cleaner(string_series, stopwords):
  """Clean one text and return the surviving tokens as a list.

  Each token that is not in `stopwords` is lemmatized with `lemmatizer` and
  then stemmed with `stemmer2`; token order is preserved.
  """
  cleaned = []
  for word in tokenize(string_series):
    if word in stopwords:
      continue
    cleaned.append(stemmer2.stem(lemmatizer.lemmatize(word)))
  return cleaned

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Apply the cleaning pipeline once per headline.
#
# `tokens_cleaner` does the expensive work (stop-word removal, lemmatizing,
# stemming). The string column is then built by joining those tokens, which
# gives the same text `text_cleaner` would return without running the
# lemmatizer and stemmer a second time over every headline.

df['tokens'] = df['headline_text'].apply(tokens_cleaner, args=(stop_words,))
df['new_headline_text'] = df['tokens'].apply(join_tokens)

gc.collect()

0

In [14]:
df.head()

Unnamed: 0,publish_date,headline_text,str_publish_date,year,month,day,new_headline_text,tokens
0,20030303,unhooked brakes to blame for taiwan train disa...,20030303,2003,3,3,unhook brake blame taiwan train disast,"[unhook, brake, blame, taiwan, train, disast]"
1,20030918,oldest prisoner in tas released citing health,20030918,2003,9,18,oldest prison ta releas cite health,"[oldest, prison, ta, releas, cite, health]"
2,20030913,nine reportedly dead in portuguese plane crash,20030913,2003,9,13,nine report dead portugues plane crash,"[nine, report, dead, portugues, plane, crash]"
3,20031031,nurses welcome medicare rebate plan,20031031,2003,10,31,nurs welcom medicar rebat plan,"[nurs, welcom, medicar, rebat, plan]"
4,20030930,un cuts its iraq staff,20030930,2003,9,30,un cut iraq staff,"[un, cut, iraq, staff]"


In [0]:
from gensim import corpora, models

# Build the vocabulary (token <-> integer id) from the cleaned token lists.
dictionary = gensim.corpora.Dictionary(df['tokens'].values)

# Encode every headline as a bag of words using that vocabulary.
bow_corpus = list(map(dictionary.doc2bow, df['tokens'].values))

In [17]:
# Train a 12-topic LDA model on the bag-of-words corpus (2 passes, 4 workers).
# `random_state` seeds the model so the topics do not change arbitrarily
# between runs. NOTE(review): with several workers gensim may still not be
# bit-for-bit reproducible -- confirm if exact repeatability matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=12,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)

# Show the top words of every topic (-1 asks for all topics).
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.031*"australia" + 0.024*"win" + 0.020*"world" + 0.019*"day" + 0.014*"final" + 0.013*"open" + 0.013*"cup" + 0.009*"first" + 0.009*"afl" + 0.009*"australian"
Topic: 1 Word: 0.028*"elect" + 0.014*"new" + 0.011*"bill" + 0.011*"vote" + 0.011*"labor" + 0.010*"senat" + 0.008*"say" + 0.008*"refuge" + 0.008*"challeng" + 0.008*"parliament"
Topic: 2 Word: 0.018*"countri" + 0.017*"canberra" + 0.016*"fund" + 0.014*"govern" + 0.014*"new" + 0.010*"hope" + 0.010*"tasmanian" + 0.009*"wa" + 0.008*"centr" + 0.008*"hill"
Topic: 3 Word: 0.038*"polic" + 0.038*"man" + 0.023*"charg" + 0.022*"woman" + 0.021*"court" + 0.016*"murder" + 0.015*"crash" + 0.014*"face" + 0.013*"death" + 0.012*"car"
Topic: 4 Word: 0.022*"rural" + 0.022*"nation" + 0.019*"health" + 0.017*"call" + 0.013*"news" + 0.012*"park" + 0.012*"minist" + 0.010*"busi" + 0.009*"liber" + 0.008*"polit"
Topic: 5 Word: 0.012*"lead" + 0.012*"head" + 0.011*"victorian" + 0.011*"race" + 0.011*"futur" + 0.010*"brisban" + 0.009*"violenc" + 0.0