In [1]:
!pip install textblob
!python -m spacy download en_core_web_sm

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1
[0mCollecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
import spacy
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob

class TextAnalyzer:
	"""Compute per-category text statistics for a labelled CSV corpus.

	The CSV is loaded with ``pd.read_csv``; ``analyze`` reads its ``label``
	and ``text`` columns. Results are stored in ``self.dict`` under keys of
	the form ``"<category>_<metric>"``.
	"""

	def __init__(self, data_file):
		"""Load the corpus and the spaCy pipeline.

		Args:
			data_file: Path (or any other input ``pd.read_csv`` accepts) of
				the CSV file to analyse.
		"""
		self.data = pd.read_csv(data_file)
		self.nlp = spacy.load("en_core_web_sm")
		self.labels = ['neutral', 'surprise', 'sadness', 'joy', 'fear', 'disgust', 'anger']
		# Named groups of label values; 'all' covers every label.
		self.categories = {
			'all': self.labels,
			'neutral': ['neutral'],
			'negative': ['sadness', 'fear', 'disgust', 'anger'],
			'positive': ['joy', 'surprise']
		}
		# Filled by analyze(); keys look like 'negative_word_freq'.
		self.dict = {}

	@staticmethod
	def _clean_texts(text_data):
		"""Return ``text_data`` as a list of strings, dropping missing values.

		``None``/NaN entries (e.g. empty CSV cells) are removed because the
		vectorizers and TextBlob reject them; any other non-string value is
		converted with ``str``.
		"""
		return [
			text if isinstance(text, str) else str(text)
			for text in text_data
			if isinstance(text, str) or not pd.isna(text)
		]

	def get_word_frequencies(self, text_data):
		"""Count how often each non-stop-word occurs across ``text_data``.

		Args:
			text_data: Iterable of documents.

		Returns:
			List of ``(word, count)`` tuples sorted by count, highest first;
			an empty list when there are no usable documents.
		"""
		texts = self._clean_texts(text_data)
		if not texts:
			# CountVectorizer raises ValueError when given no documents.
			return []
		vectorizer = CountVectorizer(stop_words='english')
		word_count = vectorizer.fit_transform(texts)
		sum_words = word_count.sum(axis=0)
		word_freq = [(word, int(sum_words[0, idx])) for word, idx in vectorizer.vocabulary_.items()]
		return sorted(word_freq, key=lambda x: x[1], reverse=True)

	def get_sentiment(self, text_data):
		"""Return the mean TextBlob polarity over ``text_data``.

		Args:
			text_data: Iterable of documents.

		Returns:
			The average polarity, or ``0`` when there are no usable documents.
		"""
		texts = self._clean_texts(text_data)
		if not texts:
			return 0
		return sum(TextBlob(text).sentiment.polarity for text in texts) / len(texts)

	def get_named_entities(self, text_data):
		"""Extract named entities from every document with spaCy.

		Args:
			text_data: Iterable of documents.

		Returns:
			List of ``(entity_text, entity_label)`` tuples in document order.
		"""
		texts = self._clean_texts(text_data)
		entities = []
		for doc in self.nlp.pipe(texts, disable=["tagger", "parser"]):
			entities.extend((ent.text, ent.label_) for ent in doc.ents)
		return entities

	def get_tfidf_word_frequencies(self, text_data):
		"""Rank words by their mean TF-IDF weight across ``text_data``.

		Args:
			text_data: Iterable of documents.

		Returns:
			List of numpy records with fields ``index`` (the word) and
			``score`` (its mean TF-IDF weight), sorted by score, highest
			first; an empty list when there are no usable documents.
		"""
		import numpy as np  # local import: numpy is not imported at file level

		texts = self._clean_texts(text_data)
		if not texts:
			# TfidfVectorizer raises ValueError when given no documents.
			return []
		vectorizer = TfidfVectorizer(stop_words='english')
		tfidf_matrix = vectorizer.fit_transform(texts)
		feature_names = vectorizer.get_feature_names_out()
		# Average each column on the sparse matrix directly; converting it to
		# a dense documents x vocabulary table first is needlessly expensive.
		mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
		data = (
			pd.Series(mean_scores, index=feature_names)
			.sort_values(ascending=False)
			.reset_index()
			.rename(columns={0: 'score'})
		)
		return list(data.to_records(index=False))

	def analyze(self):
		"""Run every metric for every category and store the results.

		For each entry of ``self.categories`` the rows whose ``label`` is in
		that category are selected and their ``text`` values are passed to
		the four ``get_*`` methods. Results are written to ``self.dict`` as
		``<category>_word_freq``, ``<category>_word_tfidf``,
		``<category>_sentiment`` and ``<category>_named_entities``.
		"""
		for key, labels in tqdm(self.categories.items()):
			filtered_data = self.data.loc[self.data['label'].isin(labels), 'text']
			self.dict[f'{key}_word_freq'] = self.get_word_frequencies(filtered_data)
			self.dict[f'{key}_word_tfidf'] = self.get_tfidf_word_frequencies(filtered_data)
			self.dict[f'{key}_sentiment'] = self.get_sentiment(filtered_data)
			self.dict[f'{key}_named_entities'] = self.get_named_entities(filtered_data)

		print("Done loading data")

In [3]:
# Build the analyzer: reads hot_results.csv and loads the spaCy model.
text_analyzer = TextAnalyzer("hot_results.csv")
# Fill text_analyzer.dict with the per-category metrics.
# The recorded run of this cell took about 5m47s for the four categories.
text_analyzer.analyze()

100%|██████████| 4/4 [05:47<00:00, 86.89s/it]

Done loading data





In [25]:
# Display the full results dictionary filled in by analyze().
text_analyzer.dict

{'all_word_freq': [('just', 3474),
  ('people', 2802),
  ('like', 2773),
  ('don', 2459),
  ('know', 1871),
  ('ucsc', 1826),
  ('class', 1610),
  ('think', 1567),
  ('campus', 1561),
  ('time', 1433),
  ('good', 1296),
  ('ve', 1280),
  ('want', 1225),
  ('students', 1206),
  ('year', 1167),
  ('israel', 1106),
  ('need', 1079),
  ('really', 1064),
  ('ll', 1018),
  ('school', 993),
  ('make', 957),
  ('https', 950),
  ('work', 933),
  ('right', 923),
  ('classes', 894),
  ('going', 887),
  ('did', 886),
  ('say', 866),
  ('got', 837),
  ('way', 834),
  ('said', 831),
  ('sure', 819),
  ('looking', 797),
  ('student', 796),
  ('didn', 782),
  ('major', 752),
  ('housing', 739),
  ('doesn', 720),
  ('lot', 702),
  ('better', 697),
  ('does', 688),
  ('quarter', 686),
  ('pretty', 683),
  ('lol', 678),
  ('uc', 673),
  ('post', 645),
  ('let', 633),
  ('yes', 623),
  ('years', 609),
  ('yeah', 609),
  ('actually', 608),
  ('saying', 604),
  ('college', 600),
  ('job', 592),
  ('bus', 58

In [28]:
# Display the full results dictionary filled in by analyze() (same expression as the In [25] cell).
text_analyzer.dict

{'all_word_freq': [('just', 3474),
  ('people', 2802),
  ('like', 2773),
  ('don', 2459),
  ('know', 1871),
  ('ucsc', 1826),
  ('class', 1610),
  ('think', 1567),
  ('campus', 1561),
  ('time', 1433),
  ('good', 1296),
  ('ve', 1280),
  ('want', 1225),
  ('students', 1206),
  ('year', 1167),
  ('israel', 1106),
  ('need', 1079),
  ('really', 1064),
  ('ll', 1018),
  ('school', 993),
  ('make', 957),
  ('https', 950),
  ('work', 933),
  ('right', 923),
  ('classes', 894),
  ('going', 887),
  ('did', 886),
  ('say', 866),
  ('got', 837),
  ('way', 834),
  ('said', 831),
  ('sure', 819),
  ('looking', 797),
  ('student', 796),
  ('didn', 782),
  ('major', 752),
  ('housing', 739),
  ('doesn', 720),
  ('lot', 702),
  ('better', 697),
  ('does', 688),
  ('quarter', 686),
  ('pretty', 683),
  ('lol', 678),
  ('uc', 673),
  ('post', 645),
  ('let', 633),
  ('yes', 623),
  ('years', 609),
  ('yeah', 609),
  ('actually', 608),
  ('saying', 604),
  ('college', 600),
  ('job', 592),
  ('bus', 58

In [40]:
# Display the full results dictionary filled in by analyze() (same expression as the In [25] cell).
text_analyzer.dict

{'all_word_freq': [('just', 3474),
  ('people', 2802),
  ('like', 2773),
  ('don', 2459),
  ('know', 1871),
  ('ucsc', 1826),
  ('class', 1610),
  ('think', 1567),
  ('campus', 1561),
  ('time', 1433),
  ('good', 1296),
  ('ve', 1280),
  ('want', 1225),
  ('students', 1206),
  ('year', 1167),
  ('israel', 1106),
  ('need', 1079),
  ('really', 1064),
  ('ll', 1018),
  ('school', 993),
  ('make', 957),
  ('https', 950),
  ('work', 933),
  ('right', 923),
  ('classes', 894),
  ('going', 887),
  ('did', 886),
  ('say', 866),
  ('got', 837),
  ('way', 834),
  ('said', 831),
  ('sure', 819),
  ('looking', 797),
  ('student', 796),
  ('didn', 782),
  ('major', 752),
  ('housing', 739),
  ('doesn', 720),
  ('lot', 702),
  ('better', 697),
  ('does', 688),
  ('quarter', 686),
  ('pretty', 683),
  ('lol', 678),
  ('uc', 673),
  ('post', 645),
  ('let', 633),
  ('yes', 623),
  ('years', 609),
  ('yeah', 609),
  ('actually', 608),
  ('saying', 604),
  ('college', 600),
  ('job', 592),
  ('bus', 58