# Latent Dirichlet Allocation
### *It's an unsupervised method used for topic modelling
### *The assumption is that each document may contain several topics and each topic may be represented by a group of words. Intuitively, we can think of this as two probabilities
### 1.Each document is a distribution of categories. For example, an article that talks about the features of a new phone might have content about the "technology" used in its development, "availability of features", "ease of use".Each of these becomes one category and we can define a distribution of these topics over the document(which category is most talked about is the one with highest probability and so on)
### 2.Now to talk about technology, the words that we use would be technical/scientific. So the probability of "LED" to be in the context is higher than for the word say "elections". So, what is derived out is which words are highly probable for the categories

In [1]:
import numpy as np
import pandas as pd

### *predicting the Genre by using summary can done.
### * we may not able to predict the genre by using topic modelling but we can the most important words related to each topic and we can able to relate the topic based on words.

In [2]:
data=pd.read_csv("moviesdatase.csv",usecols=["overview"])
#data=np.array(data).flatten()

In [3]:
data=data.iloc[range(10000),:]

In [4]:
data.fillna(" ",inplace=True)

In [5]:
data=np.array(data).flatten()

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [7]:
def process(doc):
    import re
    stop=stopwords.words('english')
    stemmer= PorterStemmer() 
    doc=re.sub("[\d_]+", " ",doc)
    doc=re.sub("[\W_]+"," ",doc)
    x= word_tokenize(doc)
    x=[i for i in x if len(i)>5]
    #stemmed=[stemmer.stem(word) for word in x]
    clean=[word for word in x if not word in stop]
    joined= ' '.join(clean)
    return joined

In [8]:
tfidf=TfidfVectorizer(preprocessor=process,decode_error = 'ignore') # Instantiating the tfidf vectorizer


In [9]:
Doc_term= tfidf.fit_transform(data)
#Doc_term= tfidf.fit_transform(data.astype("U"))

In [10]:
tfidf.vocabulary_

{'happily': 17240,
 'birthday': 12001,
 'brings': 12322,
 'Lightyear': 5563,
 'Afraid': 103,
 'losing': 19181,
 'circumstances': 12974,
 'separate': 23523,
 'eventually': 15795,
 'learns': 18946,
 'differences': 14674,
 'siblings': 23774,
 'discover': 14807,
 'enchanted': 15493,
 'magical': 19262,
 'unwittingly': 26207,
 'invite': 18567,
 'trapped': 25625,
 'inside': 18293,
 'living': 19110,
 'freedom': 16616,
 'finish': 16338,
 'proves': 21854,
 'running': 23098,
 'rhinoceroses': 22922,
 'monkeys': 19848,
 'terrifying': 25261,
 'creatures': 13944,
 'family': 16129,
 'wedding': 26621,
 'reignites': 22464,
 'ancient': 11119,
 'neighbors': 20147,
 'fishing': 16355,
 'buddies': 12376,
 'Meanwhile': 6230,
 'sultry': 24802,
 'Italian': 4774,
 'divorcée': 15011,
 'restaurant': 22780,
 'alarming': 10971,
 'locals': 19125,
 'interested': 18408,
 'seafood': 23385,
 'cooking': 13756,
 'Cheated': 1863,
 'mistreated': 19785,
 'stepped': 24457,
 'holding': 17506,
 'breath': 12284,
 'waiting': 26519

In [11]:
Docterms=Doc_term.todense()

In [12]:
lda = LatentDirichletAllocation(n_components=24, max_iter=200, 
        learning_method='online', random_state=0)

In [13]:
lda.fit_transform(Docterms)

array([[0.00977284, 0.00977284, 0.00977284, ..., 0.00977284, 0.10613613,
        0.00977284],
       [0.00825314, 0.00825314, 0.00825314, ..., 0.00825314, 0.00825314,
        0.00825314],
       [0.00828531, 0.00828531, 0.00828531, ..., 0.10566384, 0.00828531,
        0.00828531],
       ...,
       [0.00821896, 0.00821896, 0.00821896, ..., 0.00821896, 0.06869629,
        0.00821896],
       [0.00703666, 0.00703666, 0.00703666, ..., 0.00703666, 0.00703666,
        0.00703666],
       [0.01394548, 0.01394548, 0.01394548, ..., 0.01394548, 0.01394548,
        0.01394548]])

In [14]:
n_top_words = 20
feature_names = tfidf.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                    for i in topic.argsort()\
                  [:-n_top_words - 1:-1]]))

Topic 1:
convince vampire church castle vampires pursuing circle Aliens Cornell infatuated impress Woolrich conservative destiny centered dysfunctional Gregory alcohol Antoine stages
Topic 2:
interview boring romantically singers fought objections nights Quentin introduced entirely speaking communist Government Shirley choices Kurosawa discuss proper active biopic
Topic 3:
Freddy historical suburb necessary ticket Barney monstrous Beijing satanic unbelievable Othello Almodovar measures persona Astronauts cartoon entertain Geoffrey Muriel unpaid
Topic 4:
Godzilla terrorizing unknowing marshal protects gunfighter Enterprise devices ravaged factions dystopian Federation Picard barbershop Virgil heists systems recognizes terminate hijacks
Topic 5:
employer Steven engage closed artistic Debbie Initially customer Simone Donovan concerts adored freight presided adapts representative Cleopatra Addams Wednesday Factory
Topic 6:
eating stakes trilogy Junior upcoming anonymous federal operations 

In [15]:
lda.components_.shape

(24, 26923)

# ***LDA using Gensim

In [16]:
!pip install gensim



In [17]:
import gensim
from gensim import corpora

In [18]:
def process_text(doc):
    stop = stopwords.words('english')
    import re
    input_string=re.sub("[\d_]+", " ",doc)
    input_string=re.sub("[\W_]+"," ",doc)
    Words= [word.lower() for word in word_tokenize(input_string)]
    words= [w for w in Words if not w  in stop]
    return words

In [19]:
texts=[]
for para in data:
    texts.append(process_text(para))

In [20]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [21]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [22]:
print(lda_model.print_topics())

[(0, '0.000*"hobnobbing" + 0.000*"runyonesque" + 0.000*"beggar" + 0.000*"bootlegger" + 0.000*"boozy" + 0.000*"brassy" + 0.000*"née" + 0.000*"runyon" + 0.000*"gruver" + 0.000*"buttons"'), (1, '0.000*"hobnobbing" + 0.000*"runyonesque" + 0.000*"beggar" + 0.000*"bootlegger" + 0.000*"boozy" + 0.000*"brassy" + 0.000*"née" + 0.000*"runyon" + 0.000*"gruver" + 0.000*"buttons"'), (2, '0.000*"hobnobbing" + 0.000*"runyonesque" + 0.000*"beggar" + 0.000*"bootlegger" + 0.000*"boozy" + 0.000*"brassy" + 0.000*"née" + 0.000*"runyon" + 0.000*"gruver" + 0.000*"buttons"'), (3, '0.109*"world" + 0.095*"father" + 0.091*"years" + 0.085*"three" + 0.057*"set" + 0.057*"back" + 0.037*"living" + 0.033*"discover" + 0.032*"whose" + 0.031*"evil"'), (4, '0.189*"man" + 0.126*"find" + 0.068*"way" + 0.050*"women" + 0.049*"even" + 0.047*"good" + 0.045*"leads" + 0.031*"without" + 0.021*"various" + 0.021*"detective"'), (5, '0.097*"live" + 0.082*"place" + 0.066*"eventually" + 0.065*"heart" + 0.038*"owner" + 0.033*"led" + 0.03

In [23]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        #print(row_list)
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        #print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

### * we can get the information regarding 
### 1.what is the Document Number
### 2. Which topic that particular Document belongs to
### 3. what is the contribution of that record to that particular topic.
### 4. keywords in that record.

In [27]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(30)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5.0,0.9367,"live, place, eventually, heart, owner, led, br...","[led, woody, andy, toys, live, happily, room, ..."
1,1,3.0,0.7921,"world, father, years, three, set, back, living...","[siblings, judy, peter, discover, enchanted, b..."
2,2,12.0,0.8265,"time, local, brother, away, still, john, busin...","[family, wedding, reignites, ancient, feud, ne..."
3,3,4.0,0.8375,"man, find, way, women, even, good, leads, with...","[cheated, mistreated, stepped, women, holding,..."
4,4,6.0,0.7731,"young, family, town, home, wife, daughter, fri...","[george, banks, recovered, daughter, wedding, ..."
5,5,4.0,0.7025,"man, find, way, women, even, good, leads, with...","[obsessive, master, thief, neil, mccauley, lea..."
6,6,12.0,0.8643,"time, local, brother, away, still, john, busin...","[ugly, duckling, undergone, remarkable, change..."
7,7,6.0,0.9604,"young, family, town, home, wife, daughter, fri...","[mischievous, young, boy, tom, sawyer, witness..."
8,8,3.0,0.8205,"world, father, years, three, set, back, living...","[international, action, superstar, jean, claud..."
9,9,15.0,0.8812,"must, mysterious, head, leader, prevent, james...","[james, bond, must, unmask, mysterious, head, ..."
