# Topic Modeling using LDA

In [1]:
import gensim

In [2]:
# Sample passage about the 1940 Ford Standard, used as the raw input text.
# Written one sentence per line; adjacent literals are concatenated into a single string.
corpus = (
    "In terms of unforgettable looks, and enduring desire from enthusiasts who may have grown up gluing together the AMT 3-in-1 model kit that it inspired, the 1940 models stand today as some of the most iconic, instantly recognizable automobiles that the Ford Motor Company ever produced. "
    "That year, Fords were produced in two series: Standard and Deluxe. "
    "The easiest way to tell them apart is to look for a cleaner one-piece grille on Standard models, while the Deluxe version has a three-piece grille assembly. "
    "Both cars also had slightly different pieces of hood trim. "
    "This 1940 Ford Standard Tudor sedan was a very popular model that year–around 151,000 of them were built and sold. "
    "This Standard has been under the same California ownership since 1994, after the seller bought it from an owner in Texas. "
    "The seller describes the car as being entirely original, though the age of the finish and status of any restoration or refresh are unknown."
)

In [3]:
from nltk import sent_tokenize

# Split the raw passage into sentences; each sentence is tokenised separately in the next cell.
list_of_sentence = sent_tokenize(corpus)
list_of_sentence

['In terms of unforgettable looks, and enduring desire from enthusiasts who may have grown up gluing together the AMT 3-in-1 model kit that it inspired, the 1940 models stand today as some of the most iconic, instantly recognizable automobiles that the Ford Motor Company ever produced.',
 'That year, Fords were produced in two series: Standard and Deluxe.',
 'The easiest way to tell them apart is to look for a cleaner one-piece grille on Standard models, while the Deluxe version has a three-piece grille assembly.',
 'Both cars also had slightly different pieces of hood trim.',
 'This 1940 Ford Standard Tudor sedan was a very popular model that year–around 151,000 of them were built and sold.',
 'This Standard has been under the same California ownership since 1994, after the seller bought it from an owner in Texas.',
 'The seller describes the car as being entirely original, though the age of the finish and status of any restoration or refresh are unknown.']

In [4]:
# Tokenise every sentence with gensim's simple_preprocess: accent removal is
# enabled (deacc=True) and tokens shorter than 3 characters are dropped (min_len=3).
list_of_simple_preprocess_data = [
    gensim.utils.simple_preprocess(sentence, deacc=True, min_len=3)
    for sentence in list_of_sentence
]

# `texts` is just a second name for the same list, displayed as the cell output.
texts = list_of_simple_preprocess_data
texts

[['terms',
  'unforgettable',
  'looks',
  'and',
  'enduring',
  'desire',
  'from',
  'enthusiasts',
  'who',
  'may',
  'have',
  'grown',
  'gluing',
  'together',
  'the',
  'amt',
  'model',
  'kit',
  'that',
  'inspired',
  'the',
  'models',
  'stand',
  'today',
  'some',
  'the',
  'most',
  'iconic',
  'instantly',
  'recognizable',
  'automobiles',
  'that',
  'the',
  'ford',
  'motor',
  'company',
  'ever',
  'produced'],
 ['that',
  'year',
  'fords',
  'were',
  'produced',
  'two',
  'series',
  'standard',
  'and',
  'deluxe'],
 ['the',
  'easiest',
  'way',
  'tell',
  'them',
  'apart',
  'look',
  'for',
  'cleaner',
  'one',
  'piece',
  'grille',
  'standard',
  'models',
  'while',
  'the',
  'deluxe',
  'version',
  'has',
  'three',
  'piece',
  'grille',
  'assembly'],
 ['both',
  'cars',
  'also',
  'had',
  'slightly',
  'different',
  'pieces',
  'hood',
  'trim'],
 ['this',
  'ford',
  'standard',
  'tudor',
  'sedan',
  'was',
  'very',
  'popular',
  

In [5]:
# Train a gensim Phrases (bigram) model on the tokenised sentences.
# NOTE(review): the printed repr shows the defaults min_count=5, threshold=10.0, and the
# processed token lists shown later contain no merged tokens, so on this small corpus the
# model appears to leave sentences unchanged — consider lowering both if bigrams are wanted.
bigram = gensim.models.Phrases(list_of_simple_preprocess_data) 
print(bigram)

Phrases<224 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [6]:
#from gensim.utils import lemmatize
from nltk.corpus import stopwords

# English stop words from NLTK, held as a set; process_texts filters tokens against it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stops = set(stopwords.words('english')) 

def process_texts(texts, stop_words=None, phraser=None):
    """Remove stop words from tokenised lines, then apply a phrase (bigram) model.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents, one list of tokens per line.
    stop_words : collection of str, optional
        Tokens to drop from every line. Defaults to the notebook-level
        ``stops`` set when omitted.
    phraser : optional
        Object that is indexed with a token list (``phraser[tokens]``) to
        merge detected phrases. Defaults to the notebook-level ``bigram``
        model when omitted.

    Returns
    -------
    list
        One processed token sequence per input line, in input order.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working; passing the arguments explicitly removes the hidden dependency.
    if stop_words is None:
        stop_words = stops
    if phraser is None:
        phraser = bigram

    filtered = [[word for word in line if word not in stop_words] for line in texts]
    return [phraser[line] for line in filtered]

In [7]:
# Run stop-word removal and the bigram model over every tokenised sentence.
train_texts = process_texts(list_of_simple_preprocess_data)
train_texts

[['terms',
  'unforgettable',
  'looks',
  'enduring',
  'desire',
  'enthusiasts',
  'may',
  'grown',
  'gluing',
  'together',
  'amt',
  'model',
  'kit',
  'inspired',
  'models',
  'stand',
  'today',
  'iconic',
  'instantly',
  'recognizable',
  'automobiles',
  'ford',
  'motor',
  'company',
  'ever',
  'produced'],
 ['year', 'fords', 'produced', 'two', 'series', 'standard', 'deluxe'],
 ['easiest',
  'way',
  'tell',
  'apart',
  'look',
  'cleaner',
  'one',
  'piece',
  'grille',
  'standard',
  'models',
  'deluxe',
  'version',
  'three',
  'piece',
  'grille',
  'assembly'],
 ['cars', 'also', 'slightly', 'different', 'pieces', 'hood', 'trim'],
 ['ford',
  'standard',
  'tudor',
  'sedan',
  'popular',
  'model',
  'year',
  'around',
  'built',
  'sold'],
 ['standard',
  'california',
  'ownership',
  'since',
  'seller',
  'bought',
  'owner',
  'texas'],
 ['seller',
  'describes',
  'car',
  'entirely',
  'original',
  'though',
  'age',
  'finish',
  'status',
  'rest

In [8]:
from gensim.models import LdaModel
#from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

# Build the gensim Dictionary (token <-> integer id mapping) from the processed sentences.
dictionary = Dictionary(train_texts)

print(dictionary)

Dictionary<75 unique tokens: ['amt', 'automobiles', 'company', 'desire', 'enduring']...>


In [9]:
# Convert each processed sentence into its bag-of-words form: a list of (token id, count) pairs.
# NOTE: this rebinds `corpus`, which held the raw text string until this cell runs.
corpus = list(map(dictionary.doc2bow, train_texts))

print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(19, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1)], [(17, 1), (26, 1), (29, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 1), (43, 1)], [(44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)], [(7, 1), (16, 1), (29, 1), (31, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1)], [(29, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1)], [(61, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1)]]


In [10]:
# Fit a 6-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic; pinning random_state makes the learned topics
# reproducible across kernel restarts instead of changing on every run.
ldamodel = LdaModel(corpus=corpus, num_topics=6, id2word=dictionary, random_state=42)

In [11]:
# Display the learned topics as (topic id, "weight*word + ...") pairs.
ldamodel.show_topics()

[(0,
  '0.040*"year" + 0.040*"ford" + 0.040*"sedan" + 0.040*"tudor" + 0.040*"around" + 0.040*"hood" + 0.040*"sold" + 0.040*"popular" + 0.040*"standard" + 0.040*"model"'),
 (1,
  '0.013*"standard" + 0.013*"pieces" + 0.013*"fords" + 0.013*"california" + 0.013*"finish" + 0.013*"cars" + 0.013*"year" + 0.013*"deluxe" + 0.013*"produced" + 0.013*"bought"'),
 (2,
  '0.037*"models" + 0.028*"piece" + 0.025*"today" + 0.025*"inspired" + 0.025*"ford" + 0.024*"enduring" + 0.024*"recognizable" + 0.024*"produced" + 0.024*"gluing" + 0.024*"looks"'),
 (3,
  '0.049*"grille" + 0.042*"piece" + 0.032*"way" + 0.031*"models" + 0.026*"deluxe" + 0.026*"look" + 0.026*"tell" + 0.025*"standard" + 0.025*"apart" + 0.025*"assembly"'),
 (4,
  '0.037*"seller" + 0.037*"entirely" + 0.037*"status" + 0.037*"age" + 0.037*"restoration" + 0.037*"standard" + 0.037*"car" + 0.037*"year" + 0.037*"refresh" + 0.037*"describes"'),
 (5,
  '0.057*"standard" + 0.057*"seller" + 0.057*"owner" + 0.057*"texas" + 0.057*"bought" + 0.057*"own

In [12]:
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook mode so prepared visualisations display as cell output.
pyLDAvis.enable_notebook()

In [14]:
# Prepare the pyLDAvis visualisation from the fitted model, the bag-of-words corpus and the dictionary.
pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)


  default_term_info = default_term_info.sort_values(


## Wait for future updates...