In [1]:
import os
import tarfile
from six.moves import urllib
DOWNLOAD_ROOT = "https://ai.stanford.edu/~amaas/data/sentiment/"
IMDB_PATH = os.path.join("C:/Users/venka", "dummy_folder")
IMDB_URL = DOWNLOAD_ROOT + "aclImdb_v1.tar.gz"


def fetch_imdb_data(imdb_url=IMDB_URL, imdb_path=IMDB_PATH):
    if not os.path.isdir(imdb_path):
        os.makedirs(imdb_path)
    tgz_path = os.path.join(imdb_path, "aclImdb_v1.tar.gz")
    urllib.request.urlretrieve(imdb_url, tgz_path)
    imdb_tgz = tarfile.open(tgz_path)
    imdb_tgz.extractall(path=imdb_path)
    imdb_tgz.close()
        
fetch_imdb_data()

In [2]:
from sklearn.datasets import load_files
reviews_train = load_files("C:/Users/venka/dummy_folder/aclImdb/train/")
# load_files returns a bunch, containing training texts and training labels
text_train, y_train = reviews_train.data, reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))
print("text_train[6]:\n{}".format(text_train[6]))

type of text_train: <class 'list'>
length of text_train: 75000
text_train[6]:
b'Gloomy Sunday - Ein Lied von Liebe und Tod directed by Rolf Sch\xc3\xbcbel in 1999 is a romantic, absorbing, beautiful, and heartbreaking movie. It started like Jules and Jim; it ended as one of Agatha Christie\'s books, and in between it said something about love, friendship, devotion, jealousy, war, Holocaust, dignity, and betrayal, and it did better than The Black Book which is much more popular. It is not perfect, and it made me, a cynic, wonder in the end on the complexity of the relationships and sensational revelations, and who is who to whom but the movie simply overwhelmed me. Perfect or not, it is unforgettable. All four actors as the parts of the tragic not even a triangle but a rectangle were terrific. I do believe that three men could fell deeply for one girl as beautiful and dignified as Ilona in a star-making performance by young Hungarian actress Erica Marozs\xc3\xa1n and who would not? The 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=10000, max_df=.15) 
X = vect.fit_transform(text_train)

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=25, random_state=0)
# We build the model and transform the data in one step
# Computing transform takes some time,
# and we can save time by doing both at once
document_topics = lda.fit_transform(X)


In [13]:
print("lda.components_.shape: {}".format(lda.components_.shape))

print(lda.components_)

lda.components_.shape: (10, 10000)
[[1.00021129e-01 9.29456044e+01 1.79458626e+03 ... 3.28228272e+01
  1.68159341e+01 1.00024257e-01]
 [4.05195782e+00 9.58419899e+01 4.91995836e+02 ... 1.00036075e-01
  6.36472775e+01 1.00007600e-01]
 [4.97005773e+00 2.09183682e+02 1.86078564e+02 ... 9.22679698e+01
  1.00033937e-01 1.00006726e-01]
 ...
 [1.00017799e-01 2.49781601e+01 1.59593400e+03 ... 1.00021068e-01
  1.00015076e-01 1.00004843e-01]
 [1.33298066e+02 2.91134266e+02 1.26917672e+02 ... 5.07346909e+01
  3.06587269e+00 1.00007153e-01]
 [1.63812890e+00 4.41363833e+01 3.30718331e+02 ... 1.00019367e-01
  1.00029691e-01 1.25099928e+02]]


In [25]:
import numpy as np
# For each topic (a row in the components_), sort the features (ascending)
# Invert rows with [:, ::-1] to make sorting descending
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# print(lda.components_[:, sorting, axis=1])
# print(sorting)

# Get the feature names from the vectorizer
feature_names = np.array(vect.get_feature_names_out())

print(feature_names[sorting])

[['action' 'game' 'effects' ... 'garson' 'antoinette' 'bettie']
 ['guy' 're' 'around' ... 'sadako' 'asoka' 'eklavya']
 ['war' 'world' 'american' ... 'inuyasha' 'sadako' 'timon']
 ...
 ['funny' 'comedy' 'girl' ... 'izo' 'naschy' 'eklavya']
 ['father' 'wife' 'woman' ... 'cena' 'gundam' 'blackadder']
 ['role' 'cast' 'john' ... 'ringu' 'sadako' 'inuyasha']]


In [33]:
print(feature_names[sorting][0, :10])

topic_0 = topic_7[0, :10]

print("topic_0 **********************")
for i in topic_0:
 # show first two sentences
 print(b".".join(text_train[i].split(b".")[:2]) + b".\n")
    
topic_7 = sorting[7, :10]

print("topic_7 **********************") 
for i in topic_7:
 # show first two sentences
 print(b".".join(text_train[i].split(b".")[:2]) + b".\n")


['action' 'game' 'effects' 'special' 'fight' 'fi' 'sci' '10' 'star'
 'alien']
topic_0 **********************
b"In-crappy-credible. That's the word that comes to mind.\n"
b'Joan Crawford in doing The Last of Mrs. Cheyney had to stand comparison with not one, but two previous actresses who essayed the part of a crooked adventuress who discovers she has a chance at love.\n'
b'This movie was inspiring to me, as it was about a woman who would not give up. She is one of my favorite actresses, appearing on ER in the U.\n'
b"This isn't the worst comedy of all-time, but that is about the best thing that I can say about this pathetic film. I didn't laugh once, or even smile once during this bomb.\n"
b'Chris Kattan is a great sketch actor on Saturday Night Live..\n'
b'008: The Thief of Bagdad (1924) - released 3/18/1924, viewed 7/28/05.<br /><br />George Gerswin\'s "Rhapsody in Blue" is performed for the first time in NYC.\n'
b'As a cop this actor is pathetic but fascinating. A serial killer is a