In [None]:
!pip install -U scipy==1.10.0 numpy==1.23.5

In [None]:
!pip install -U git+https://github.com/stephenhky/PyShortTextCategorization

In [3]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import gensim
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from nltk.stem import*
stemmer = PorterStemmer()

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
%cd /content/drive/My Drive/ColabData

/content/drive/My Drive/ColabData


In [None]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'MovieReviews' # Folder Name
filelists = PlaintextCorpusReader(corpus_root, '.*',encoding='latin-1')  # wildcard is read all files in the folder
filelists.fileids()  # Get the filenames

In [25]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Function to read text files from a directory
def read_files_from_directory(directory, encoding='utf-8'):
    file_texts = []
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding=encoding) as file:
                file_names.append(filename)
                file_texts.append(file.read())
    return file_names, file_texts

# Read text files from the directory
directory = '/content/drive/My Drive/ColabData/MovieReviews'  # Update with your directory path
file_names, file_texts = read_files_from_directory(directory, encoding='latin-1')

# Create a DataFrame
data = {'File_Name': file_names, 'Text': file_texts}
df = pd.DataFrame(data)
df.head()


Unnamed: 0,File_Name,Text
0,17144.txt,"WHITE BALLOON, THE (director: Jafar Panahi; ca..."
1,17111.txt,WAKING NED DEVINE (director: Kirk Jones (III);...
2,17255.txt,SHE'S SO LOVELY (director: Nick Cassavetes; ca...
3,17185.txt,JUNK MAIL (director: Pal Sletaune; cast: Rober...
4,17109.txt,NOSTALGHIA (director: Andrei Tarkovsky; cast: ...


In [32]:
data_sample = df[df['File_Name'] == '17255.txt']
data_sample

Unnamed: 0,File_Name,Text
2,17255.txt,SHE'S SO LOVELY (director: Nick Cassavetes; ca...


In [35]:
doc_sample = data_sample.values[0][1]
doc_sample

'SHE\'S SO LOVELY (director: Nick Cassavetes; cast: Sean Penn (Eddie), Robin Wright Penn (Maureen), John Travolta (Joey), Harry Dean Stanton (Tony \'Shorty\' Russo), Debi Mazar (Georgie), Gena Rowlands (Mrs. Green), James Gandolfini (Kiefer, the neighbor), Susan Traylor (Lucinda), Kelsey Mulrooney (Jennie,Eddie\'s daughter), 1997)\nWe have to wait until the near end of this love triangle, a bleak drama about lowlife angst, to hear the words of the Cole Porter tune, of which this film takes its title from, spoken by Eddie (Sean) to Joey (Travolta), "She doesn\'t love you. She doesn\'t love me. She\'s...delovely." But even these words don\'t make sense, as we find out soon afterwards, that their wife definetly loves one of them more than the other. Maureen (Robin) is the ex-wife of Eddie and the current wife of Joey. She is also the real-life wife of Sean.\nMaureen and Eddie are a couple, who love each other madly, but have many problems such as booze, drugs, and Eddie\'s habit of disapp

In [26]:
len(gensim.parsing.preprocessing.STOPWORDS)

337

In [None]:
gensim.parsing.preprocessing.STOPWORDS

In [8]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [28]:
data_text = df.dropna(subset=['Text'])

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [53]:
processed_docs = data_text['Text'].map(preprocess)
processed_docs[:11]

0     [white, balloon, director, jafar, panahi, cast...
1     [wake, devin, director, kirk, jone, cast, bann...
2     [love, director, nick, cassavet, cast, sean, p...
3     [junk, mail, director, sletaun, cast, robert, ...
4     [nostalghia, director, andrei, tarkovski, cast...
5     [denni, schwartz, movi, review, poetri, unmak,...
6     [modern, director, alan, rudolph, cast, keith,...
7     [shakespear, love, director, john, madden, cas...
8     [scenic, rout, director, mark, rappaport, cast...
9     [brilliant, witti, mock, documentari, jean, se...
10    [central, station, director, walter, sall, cas...
Name: Text, dtype: object

In [38]:
#create a dictionary from ‘processed_docs’ containing the number of times a word appears in the document set
dictionary = gensim.corpora.Dictionary(processed_docs)

In [39]:
#Filter out tokens that appear in
 #less than 15 documents (absolute number) or
 #more than 0.5 documents (fraction of total corpus size, not absolute number).
 #after the above two steps, keep only the first 100000 most frequent tokens.

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[['File_Name'] == '17255.txt']

In [None]:
bow_doc_4310 = bow_corpus[['File_Name'] == '17255.txt']
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], dictionary[bow_doc_4310[i][0]], bow_doc_4310[i][1]))

In [45]:
#run LDA using bag of words
#Because LDA uses randomness in both training and inference steps, setting a random state is very important to control the randomness to make answers repeatable.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2,random_state=2023)



In [None]:
#for each topic, we will explore the words occuring in that topic and its relative weight.

for idx, topic in lda_model.print_topics(-1): # print all the topics. (-1) means all the topics.
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [47]:
lda_model[bow_corpus[['File_Name'] == '17255.txt']]

[(4, 0.99528533)]

In [56]:
top_topics = lda_model.top_topics(corpus=bow_corpus, topn=11)

i = 0
for words, coherence in top_topics:
    print('Topic: {} -> Top Words: {}'.format(i, words))
    i += 1


AttributeError: 'LdaMulticore' object has no attribute 'top_ten'

In [55]:
for i, corpus_item in zip(range(len(bow_corpus)), bow_corpus):
    print(data['File_Name'][i], ':', lda_model[corpus_item])

17144.txt : [(4, 0.99528635)]
17111.txt : [(3, 0.010904675), (6, 0.2976358), (8, 0.6863852)]
17255.txt : [(3, 0.99071836)]
17185.txt : [(7, 0.9936147)]
17109.txt : [(0, 0.043015044), (3, 0.661759), (8, 0.28437066)]
16748.txt : [(1, 0.01375727), (4, 0.68755615), (8, 0.2931729)]
17147.txt : [(1, 0.98874635)]
17303.txt : [(0, 0.3042248), (8, 0.65554255), (9, 0.034539554)]
17150.txt : [(1, 0.6377174), (7, 0.3570521)]
17108.txt : [(1, 0.19764349), (7, 0.60092676), (8, 0.19539325)]
17300.txt : [(1, 0.96850663), (8, 0.02372374)]
17239.txt : [(4, 0.9537108), (8, 0.03199912)]
17119.txt : [(0, 0.36659294), (1, 0.6294256)]
17146.txt : [(1, 0.42375776), (4, 0.2804612), (5, 0.06913816), (9, 0.2234503)]
17118.txt : [(1, 0.8216808), (4, 0.1700693)]
17116.txt : [(1, 0.6868116), (6, 0.024643673), (7, 0.2852725)]
17139.txt : [(7, 0.9839845), (8, 0.010608218)]
17280.txt : [(8, 0.99342823)]
17110.txt : [(0, 0.5373454), (7, 0.4575247)]
17117.txt : [(1, 0.5111784), (8, 0.48154655)]
17254.txt : [(1, 0.990422