In [8]:
!pip install gensim 

Collecting gensim
  Downloading gensim-4.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: gensim
Successfully installed gensim-4.3.2
[0m

In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('./data/hot_data.db')
text_data = pd.read_sql('SELECT * FROM sentences', conn)

labels = ['neutral', 'surprise', 'sadness', 'joy', 'fear', 'disgust', 'anger']
neutral = ['neutral']
negative = ['sadness', 'fear', 'disgust', 'anger']
positive = ['joy', 'surprise']

def get_label(sentiment):
	if sentiment in neutral:
		return 'neutral'
	elif sentiment in negative:
		return 'negative'
	elif sentiment in positive:
		return 'positive'
	else:
		return 'unknown'

text_data['label'] = text_data['emotion_label'].apply(get_label)
text_data.to_sql('sentences', conn, if_exists='replace', index=False)

text_data = pd.read_sql('SELECT * FROM sentences', conn)
text_data

Unnamed: 0,id,emotion_label,emotion_score,text,label
0,f0016236-f87f-4ab4-8689-42ce66fa6295,joy,0.770712,Hey Slugs!,positive
1,67e23947-8c13-4427-ad59-ae1398f2f225,fear,0.786226,Be aware of common craigslist housing scams!,negative
2,77d4e3cb-b557-47e7-bd79-39cee78d3e02,neutral,0.577710,"Of course not all post are scams, but you shou...",neutral
3,def31337-ba79-49a8-8b07-910b355aaa56,neutral,0.802858,1.,neutral
4,2bfb56cf-db29-4ab8-87d6-db696fda292c,neutral,0.643948,The Listing Has No Photos\n2.,neutral
...,...,...,...,...,...
17511,b682018a-ce03-461b-9853-d6130e91dbb0,neutral,0.962796,The underline Unix system on Macs are just far...,neutral
17512,fa4d2c10-d175-4071-b137-e21ab1fba328,neutral,0.731793,>Like why wouldn’t u just start out with a win...,neutral
17513,6e81543b-d7a4-4dc5-bbdf-02bc7324cddf,disgust,0.369341,Windows computers are much more annoying to us...,negative
17514,b4347747-ebe5-4593-9d62-4c25511a1c12,neutral,0.516326,"CS student, but windows surface has been doing...",neutral


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models
import nltk
nltk.download('stopwords')

class TextAnalyzer:
	def __init__(self, data_file):
		#self.data = pd.read_csv(data_file)
		conn = sqlite3.connect('./data/hot_data.db')
		self.data = pd.read_sql('SELECT * FROM sentences', conn)
		self.labels = ['neutral', 'surprise', 'sadness', 'joy', 'fear', 'disgust', 'anger']
		self.categories = {
			'all': self.labels,
			'neutral': ['neutral'],
			'negative': ['sadness', 'fear', 'disgust', 'anger'],
			'positive': ['joy', 'surprise']
		}
		self.label_word_freq = {}
		self.label_word_tfidf = {}

	def get_word_frequencies(self, text_data):
		vectorizer = CountVectorizer(stop_words='english')
		word_count = vectorizer.fit_transform(text_data)
		sum_words = word_count.sum(axis=0)
		word_freq = [(word, int(sum_words[0, idx])) for word, idx in vectorizer.vocabulary_.items()]
		return sorted(word_freq, key=lambda x: x[1], reverse=True)

	def get_tfidf_word_frequencies(self, text_data):
		vectorizer = TfidfVectorizer(stop_words='english')
		tfidf_matrix = vectorizer.fit_transform(text_data.dropna())
		feature_names = vectorizer.get_feature_names_out()
		dense = tfidf_matrix.todense()
		denselist = dense.tolist()
		df = pd.DataFrame(denselist, columns=feature_names)
		return df.mean(axis=0).sort_values(ascending=False).reset_index().rename(columns={0: 'score'})
	
	def perform_lda_topic_modeling(self, num_topics=5, num_words=5):
		# Filter out non-text data
		text_data = self.data['text'].dropna().tolist()

		# Tokenize the documents
		tokenized_data = [nltk.word_tokenize(doc.lower()) for doc in text_data]

		# Create a dictionary representation of the documents
		dictionary = corpora.Dictionary(tokenized_data)

		# Filter extremes to remove very rare and very common words
		dictionary.filter_extremes(no_below=5, no_above=0.5)

		# Vectorize data
		corpus = [dictionary.doc2bow(doc) for doc in tokenized_data]

		# Perform LDA
		lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

		topics = lda_model.print_topics(num_words=num_words)
		return topics

	def analyze(self):
		for key in self.categories.keys():
			filtered_data = self.data[self.data['emotion_label'].isin(self.categories[key])]['text']
			self.label_word_freq[key] = self.get_word_frequencies(filtered_data)
			self.label_word_tfidf[key] = self.get_tfidf_word_frequencies(filtered_data)

		print("Done loading data")

#if __name__ == "__main__":
#	text_analyzer = TextAnalyzer("data/hot_results.csv")
#	text_analyzer.analyze()

#	print("Word frequencies:")
#	print(text_analyzer.label_word_freq)
	


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
text_analyzer = TextAnalyzer("data/hot_results.csv")
text_analyzer.analyze()

print("Word frequencies:")
print(text_analyzer.label_word_freq)

Word frequencies:
{}


In [15]:
import nltk
nltk.download('punkt')

topics = text_analyzer.perform_lda_topic_modeling(num_topics=20, num_words=20)
for topic in topics:
    print(topic)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(0, '0.116*"," + 0.078*"are" + 0.059*"people" + 0.047*"and" + 0.047*"who" + 0.042*"the" + 0.038*"students" + 0.033*"\'\'" + 0.033*"``" + 0.028*"for" + 0.024*"why" + 0.021*"other" + 0.020*"those" + 0.019*"all" + 0.018*"with" + 0.014*"like" + 0.013*"cs" + 0.013*"him" + 0.012*"at" + 0.011*"ucsc"')
(1, '0.069*"of" + 0.062*"the" + 0.052*"," + 0.051*"and" + 0.041*"a" + 0.022*"classes" + 0.020*"but" + 0.019*"is" + 0.019*"work" + 0.018*"in" + 0.018*"very" + 0.017*"with" + 0.017*"were" + 0.017*"lot" + 0.015*"are" + 0.015*"on" + 0.015*"good" + 0.015*"pretty" + 0.013*"most" + 0.011*"at"')
(2, '0.231*"!" + 0.060*"lol" + 0.043*"always" + 0.042*"everyone" + 0.039*"his" + 0.037*"up" + 0.036*"after" + 0.033*"said" + 0.030*"hope" + 0.026*"check" + 0.022*"look" + 0.021*"wrong" + 0.020*"out" + 0.020*"taken" + 0.019*"change" + 0.018*"this" + 0.017*"started" + 0.017*"called" + 0.016*"system" + 0.014*"exactly"')
(3, '0.123*"they" + 0.090*"the" + 0.054*"them" + 0.041*"of" + 0.034*"their" + 0.034*"to" + 0.033

In [4]:
words_dict = {}
for item in text_analyzer.label_word_freq['all']:
	words_dict[item[0]] = {}
for category in text_analyzer.label_word_freq.keys():
	for item in text_analyzer.label_word_freq[category]:
		if item[0] in words_dict.keys():
			words_dict[item[0]][f'{category}_freq'] = item[1]

for category in text_analyzer.label_word_tfidf.keys():
	for item in text_analyzer.label_word_tfidf[category].values:
		if item[0] in words_dict.keys():
			words_dict[item[0]][f'{category}_tfidf'] = item[1]	

words_dict

{'just': {'all_freq': 1152,
  'neutral_freq': 814,
  'negative_freq': 203,
  'positive_freq': 135,
  'all_tfidf': 0.01437462399562577,
  'neutral_tfidf': 0.01590428671664255,
  'negative_tfidf': 0.013324846517668982,
  'positive_tfidf': 0.011869431534327183},
 'like': {'all_freq': 921,
  'neutral_freq': 566,
  'negative_freq': 213,
  'positive_freq': 142,
  'all_tfidf': 0.010953785160353773,
  'neutral_tfidf': 0.01073927634668481,
  'negative_tfidf': 0.013020167613550881,
  'positive_tfidf': 0.011442585210984398},
 'people': {'all_freq': 864,
  'neutral_freq': 517,
  'negative_freq': 234,
  'positive_freq': 113,
  'all_tfidf': 0.010286351540233054,
  'neutral_tfidf': 0.009907842041192438,
  'negative_tfidf': 0.013723451705505546,
  'positive_tfidf': 0.009617470364526856},
 'don': {'all_freq': 823,
  'neutral_freq': 514,
  'negative_freq': 219,
  'positive_freq': 90,
  'all_tfidf': 0.011563342965185347,
  'neutral_tfidf': 0.010685472072039758,
  'negative_tfidf': 0.01776409442652829,
  

In [5]:
words_df = pd.DataFrame.from_dict(words_dict, orient='index')
words_df.index.names = ['word']
words_df.to_sql('words', conn, if_exists='replace', index=True)
words_df = pd.read_sql('SELECT * FROM words', conn)
words_df

Unnamed: 0,word,all_freq,neutral_freq,negative_freq,positive_freq,all_tfidf,neutral_tfidf,negative_tfidf,positive_tfidf
0,just,1152,814.0,203.0,135.0,0.014375,0.015904,0.013325,0.011869
1,like,921,566.0,213.0,142.0,0.010954,0.010739,0.013020,0.011443
2,people,864,517.0,234.0,113.0,0.010286,0.009908,0.013723,0.009617
3,don,823,514.0,219.0,90.0,0.011563,0.010685,0.017764,0.009163
4,campus,764,533.0,106.0,125.0,0.009893,0.010675,0.007557,0.011096
...,...,...,...,...,...,...,...,...,...
13579,dell,1,1.0,,,0.000030,0.000046,,
13580,proofing,1,1.0,,,0.000024,0.000036,,
13581,mplabx,1,1.0,,,0.000029,0.000044,,
13582,underline,1,1.0,,,0.000024,0.000036,,


In [6]:
# loop of words_df
sentence_df = pd.read_sql('SELECT * FROM sentences', conn)

for index, row in words_df.iterrows():
	result = sentence_df[sentence_df["text"].str.contains(row['word'], na=False)]
	result = result['label'].value_counts().to_dict()
	words_df.loc[index, 'neutral_num'] = result.get('neutral', 0)
	words_df.loc[index, 'positive_num'] = result.get('positive', 0)
	words_df.loc[index, 'negative_num'] = result.get('negative', 0)

words_df.to_sql('words', conn, if_exists='replace', index=True)
words_df = pd.read_sql('SELECT * FROM words', conn)
words_df
	

Unnamed: 0,word,all_freq,neutral_freq,negative_freq,positive_freq,all_tfidf,neutral_tfidf,negative_tfidf,positive_tfidf,neutral_num,positive_num,negative_num
0,just,1152,814.0,203.0,135.0,0.014375,0.015904,0.013325,0.011869,687.0,122.0,194.0
1,like,921,566.0,213.0,142.0,0.010954,0.010739,0.013020,0.011443,597.0,145.0,209.0
2,people,864,517.0,234.0,113.0,0.010286,0.009908,0.013723,0.009617,454.0,107.0,207.0
3,don,823,514.0,219.0,90.0,0.011563,0.010685,0.017764,0.009163,554.0,103.0,212.0
4,campus,764,533.0,106.0,125.0,0.009893,0.010675,0.007557,0.011096,481.0,117.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13579,dell,1,1.0,,,0.000030,0.000046,,,0.0,0.0,0.0
13580,proofing,1,1.0,,,0.000024,0.000036,,,1.0,0.0,0.0
13581,mplabx,1,1.0,,,0.000029,0.000044,,,0.0,0.0,0.0
13582,underline,1,1.0,,,0.000024,0.000036,,,1.0,0.0,0.0


In [17]:
### Delete words table
#cursor = conn.cursor()
#cursor.execute("DROP TABLE words")
#conn.commit()

words_df.to_sql('words', conn, if_exists='replace', index=True)
words_df = pd.read_sql('SELECT * FROM words', conn)