In [2]:
import pandas as pd

import warnings
import os

import spacy
from sklearn.feature_extraction import _stop_words

### Uncomment it to download necessary packages
# ! python -m spacy download en_core_web_sm
# import nltk
# nltk.download(['wordnet', 'omw-1.4'])

from nltk.stem.wordnet import WordNetLemmatizer

import string
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, lsimodel, ldamodel

# from jupyterthemes import jtplot
# jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

warnings.filterwarnings('ignore')
%matplotlib inline



  from imp import reload


In [3]:
df = pd.DataFrame(columns=['text', 'label'])

path = 'bbc/'

for root, _, files in os.walk(path):
    for filename in files:
        with open(os.path.join(root, filename)) as f:
            text = f.read()
            current_df = pd.DataFrame({'text': [text], 'label': root.split('/')[-1]})
            df = df.append(current_df, ignore_index=True)

In [4]:
df.head()

Unnamed: 0,text,label
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
nlp = spacy.load('en_core_web_sm')

stopwords = _stop_words.ENGLISH_STOP_WORDS
lemmatizer = WordNetLemmatizer()

def clean(doc):
    text_no_namedentities = []
    document = nlp(doc)
    ents = [e.text for e in document.ents]
    for item in document:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    doc = (" ".join(text_no_namedentities))

    doc = doc.lower().strip()
    doc = doc.replace("</br>", " ") 
    doc = doc.replace("-", " ") 
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    doc = " ".join([token for token in doc.split() if token not in stopwords and len(token)>3])    
    doc = "".join([lemmatizer.lemmatize(word) for word in doc])
    return doc

In [6]:
cleaned_df = df.copy()
cleaned_df['text'] = cleaned_df['text'].apply(clean)
cleaned_df.head()

Unnamed: 0,text,label
0,sales boost time warner profit profits media g...,business
1,dollar gains speech dollar highest level euro ...,business
2,unit buyer faces loan claim owners embattled g...,business
3,high fuel prices profits british airways blame...,business
4,takeover talk lifts domecq shares drinks food ...,business


In [7]:
def get_topics(df_cat):

    texts = [df_cat.text.iloc[i].split() for i in range(len(df_cat))]
    
    dictionary = Dictionary(texts)
    
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=15,
                        alpha='auto', eta='auto', iterations = 20, passes = 5)

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    return vis_data

In [8]:
vis_data_business = get_topics(cleaned_df[cleaned_df['label'] == 'business'])
pyLDAvis.display(vis_data_business)

In [9]:
vis_data_politics = get_topics(cleaned_df[cleaned_df['label'] == 'politics'])
pyLDAvis.display(vis_data_politics)

In [10]:
vis_data_sport = get_topics(cleaned_df[cleaned_df['label'] == 'sport'])
pyLDAvis.display(vis_data_sport)

In [11]:
vis_data_entertainment = get_topics(cleaned_df[cleaned_df['label'] == 'entertainment'])
pyLDAvis.display(vis_data_entertainment)

In [12]:
vis_data_tech = get_topics(cleaned_df[cleaned_df['label'] == 'tech'])
pyLDAvis.display(vis_data_tech)