In [1]:
import pandas as pd
import numpy as np

import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

import nltk
nltk.download("wordnet")
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\isofr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\isofr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isofr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv ('dataset.csv')

df.head(5)

Unnamed: 0,Genre,Reports,Age,Gpa,Year,Count,Gender,Nationality
0,Academic Support and Resources,The limited access to research databases and m...,27,2.18,2,1,M,Egypt
1,Academic Support and Resources,I'm having trouble finding the course material...,23,3.11,2,1,F,Egypt
2,Academic Support and Resources,It's frustrating to have limited access to res...,20,3.68,2,1,F,Egypt
3,Academic Support and Resources,I'm really struggling in one of my classes but...,20,1.3,2,1,F,Egypt
4,Academic Support and Resources,I am really struggling with understanding the...,26,2.5,2,1,F,Egypt


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005 entries, 0 to 1004
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Genre        1005 non-null   object 
 1   Reports      1005 non-null   object 
 2   Age          1005 non-null   int64  
 3   Gpa          1005 non-null   float64
 4   Year         1005 non-null   int64  
 5   Count        1005 non-null   int64  
 6   Gender       1005 non-null   object 
 7   Nationality  1005 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 62.9+ KB


In [4]:
df_text = df['Reports']
df_text.head(5)

0    The limited access to research databases and m...
1    I'm having trouble finding the course material...
2    It's frustrating to have limited access to res...
3    I'm really struggling in one of my classes but...
4     I am really struggling with understanding the...
Name: Reports, dtype: object

In [5]:
df_text = df_text.str.lower()

In [6]:
df_text = df_text.apply(word_tokenize)

lemmatizer = WordNetLemmatizer()
df_text = df_text.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

df_text.head(5)

0    [the, limited, access, to, research, database,...
1    [i, 'm, having, trouble, finding, the, course,...
2    [it, 's, frustrating, to, have, limited, acces...
3    [i, 'm, really, struggling, in, one, of, my, c...
4    [i, am, really, struggling, with, understandin...
Name: Reports, dtype: object

In [7]:
stop_w = set(stopwords.words('english'))
stop_w.add('student')
stop_w.add('university')
stop_w.add('academic')


df_text = df_text.apply(lambda x: [word for word in x if word not in stop_w])
print ('Stop Words: \n', stop_w)
df_text.head(5)

Stop Words: 
 {'hadn', 'won', 'under', 'very', 'll', 'university', 'a', 'we', "wasn't", 'most', 'such', 'this', 'over', 're', "hadn't", 'to', "you'd", 'you', 'can', 'y', 'have', 'down', 'nor', 'for', 'ain', 'once', 'there', 'between', 'both', 'mightn', "mightn't", 'through', 'further', 'his', 'ourselves', 'that', 'their', 'herself', 'up', 'where', 'too', 'wouldn', 'what', 'hers', 'now', 'not', 'didn', 's', 'are', 'aren', 'again', 'yourself', 'them', "you've", 'had', 'weren', 'each', 'some', 'other', "she's", 'all', 'own', 'i', 'yourselves', 'am', 'is', 'in', 'at', 'doesn', 'him', 'during', 'were', 'the', 'so', "didn't", 'about', 'yours', 'on', "hasn't", "shouldn't", "don't", 'having', "isn't", 'ma', 'been', 'shouldn', "needn't", "you're", 'they', 't', 'those', "mustn't", "you'll", 'here', "haven't", 'before', 'after', "shan't", 'was', 'haven', 'when', 'do', 'whom', 'as', 'then', 'and', 'of', 'mustn', 'doing', "won't", 'by', 've', 'couldn', 'which', 'does', 'out', 'from', 'no', 'until',

0    [limited, access, research, database, material...
1    ['m, trouble, finding, course, material, need,...
2    ['s, frustrating, limited, access, research, d...
3    ['m, really, struggling, one, class, ca, n't, ...
4    [really, struggling, understanding, instructio...
Name: Reports, dtype: object

In [8]:
df_text.map(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
df_text.head(5)

0    [limited, access, research, database, material...
1    ['m, trouble, finding, course, material, need,...
2    ['s, frustrating, limited, access, research, d...
3    ['m, really, struggling, one, class, ca, n't, ...
4    [really, struggling, understanding, instructio...
Name: Reports, dtype: object

In [9]:
vectorizer = CountVectorizer()

BoW_df = vectorizer.fit_transform(df_text.apply(' '.join))
print(BoW_df)

  (0, 845)	1
  (0, 50)	2
  (0, 1203)	1
  (0, 372)	1
  (0, 892)	1
  (0, 237)	1
  (0, 868)	1
  (0, 622)	1
  (0, 108)	1
  (0, 950)	1
  (0, 185)	1
  (0, 43)	1
  (0, 1392)	1
  (0, 45)	1
  (1, 892)	1
  (1, 950)	1
  (1, 1493)	1
  (1, 590)	1
  (1, 348)	1
  (1, 263)	1
  (1, 839)	1
  (1, 448)	1
  (1, 197)	1
  (1, 106)	1
  (1, 1320)	1
  :	:
  (1000, 17)	1
  (1000, 406)	1
  (1000, 1504)	1
  (1001, 91)	1
  (1001, 589)	1
  (1001, 603)	1
  (1001, 18)	1
  (1001, 1504)	1
  (1002, 589)	1
  (1002, 500)	1
  (1002, 603)	1
  (1002, 19)	1
  (1002, 1504)	1
  (1003, 589)	1
  (1003, 685)	1
  (1003, 603)	1
  (1003, 20)	1
  (1003, 975)	1
  (1003, 1504)	1
  (1004, 589)	1
  (1004, 603)	1
  (1004, 21)	1
  (1004, 340)	1
  (1004, 842)	1
  (1004, 1504)	1


In [10]:
tfidf = TfidfVectorizer()

tfidf_df = tfidf.fit_transform(df_text.apply(' '.join))
print(tfidf_df)

  (0, 45)	0.3252247923429595
  (0, 1392)	0.25741850865204857
  (0, 43)	0.24295241412656934
  (0, 185)	0.26883449140412025
  (0, 950)	0.179118589434489
  (0, 108)	0.34126123364405325
  (0, 622)	0.3326683160962006
  (0, 868)	0.22651921937777184
  (0, 237)	0.21877997278731787
  (0, 892)	0.1982816608793846
  (0, 372)	0.2617085959978892
  (0, 1203)	0.2617085959978892
  (0, 50)	0.36449043915658164
  (0, 845)	0.17067064888403657
  (1, 1320)	0.42952067611588185
  (1, 106)	0.25220653114239194
  (1, 197)	0.42952067611588185
  (1, 448)	0.31659571411452875
  (1, 839)	0.347633858084929
  (1, 263)	0.20367075211317565
  (1, 348)	0.20727313949930995
  (1, 590)	0.2509584780563873
  (1, 1493)	0.3280864611855377
  (1, 950)	0.2025143045975075
  (1, 892)	0.22418037566175486
  :	:
  (1000, 902)	0.36837529742985026
  (1000, 589)	0.2645371375263222
  (1000, 950)	0.24068387058592397
  (1001, 1504)	0.5607737519989152
  (1001, 18)	0.5466535351712615
  (1001, 603)	0.36121997439374565
  (1001, 589)	0.3235050350614

In [11]:
lsa = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=15, random_state=42)
result = lsa.fit_transform(tfidf_df)

for i in range(result.shape[1]):
    df_text[f'LSA Topic {i}'] = result[:, i]
print (result)

[[ 0.34379758  0.33695002 -0.15366756 -0.23511651]
 [ 0.21621306  0.09355922  0.05778494 -0.03554458]
 [ 0.28779584  0.21206322 -0.17589586 -0.12715711]
 ...
 [ 0.05710566 -0.09809273 -0.00124078 -0.07676064]
 [ 0.0559712  -0.10071119 -0.00840856 -0.09287893]
 [ 0.04579816 -0.07891523 -0.00262315 -0.066166  ]]


In [12]:
n_top_words = 4
topics = []
feature_names = vectorizer.get_feature_names_out()

topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in lsa.components_]

for i, topic in enumerate(topics):
    print(f"Topic {i+1}: {', '.join(topic)}")

Topic 1: limited, access, course, difficult
Topic 2: access, limited, material, research
Topic 3: health, time, mental, stress
Topic 4: job, opportunity, internship, online


In [13]:
lda = LatentDirichletAllocation(n_components=4, doc_topic_prior=0.9, topic_word_prior=0.9)
result = lda.fit_transform(tfidf_df)

for i in range(result.shape[1]):
    df_text[f'LDA Topic {i}'] = result[:, i]
print (result)

[[0.1471959  0.14705986 0.14497843 0.56076581]
 [0.18877185 0.18901574 0.18013836 0.44207405]
 [0.1515487  0.14710754 0.1541851  0.54715866]
 ...
 [0.22884907 0.32069491 0.24777776 0.20267826]
 [0.22274333 0.3612834  0.21963031 0.19634297]
 [0.23517858 0.3275834  0.23600734 0.20123068]]


In [14]:
n_top_words = 4
topics = []
feature_names = vectorizer.get_feature_names_out()

topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in lda.components_]

for i, topic in enumerate(topics):
    print(f"Topic {i+1}: {', '.join(topic)}")

Topic 1: available, pay, financial, option
Topic 2: option, food, campus, sport
Topic 3: health, time, mental, wish
Topic 4: access, limited, course, online
