# Latent Dirichlet Allocation (LDA)

In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the preprocessed combined texts. keep_default_na=False stops pandas from
# turning empty strings (and tokens such as "NA") into NaN.
df = pd.read_csv(
    '../raw_data/combined_text_preprocessed.csv',
    keep_default_na=False,
)

In [3]:
# Show the loaded frame; pandas abbreviates the output to the first and last rows.
df

Unnamed: 0,combined_preprocessed,answered_percent,text_length
0,would love think kind intellectual either dumb...,100.0,1565
1,chef mean workaholic love cook regardless whet...,60.0,815
2,im ashamed much write public text online date ...,90.0,3728
3,work library go school read thing write old de...,70.0,330
4,hey hows go currently vague profile know come ...,50.0,496
...,...,...,...
59941,vibrant expressive care optimist love people t...,100.0,1040
59942,im nick never know write im sure hand im south...,100.0,1634
59943,hello enjoy travel watch movie hang friend rul...,100.0,1189
59944,world ball integrity one take either away momm...,100.0,1122


## Finding two topics

### Vectorizing

In [4]:
%%time
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(min_df=0.1, ngram_range=(1,2))

# Training it on the texts
combined_weighted_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(df.combined_preprocessed.values).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

combined_weighted_words

CPU times: user 24.5 s, sys: 1.97 s, total: 26.5 s
Wall time: 26.4 s


Unnamed: 0,actually,adventure,almost,along,also,always,amaze,american,anything,area,...,woman,word,work,world,would,write,year,yes,yet,youre
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125752,0.000000,...,0.000000,0.103616,0.140483,0.000000,0.133960,0.000000,0.064378,0.000000,0.106134,0.0
1,0.0,0.000000,0.000000,0.000000,0.000000,0.117395,0.000000,0.000000,0.107094,0.000000,...,0.000000,0.000000,0.079760,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,0.0,0.000000,0.000000,0.000000,0.081198,0.000000,0.000000,0.000000,0.043892,0.000000,...,0.000000,0.000000,0.032689,0.049425,0.000000,0.057229,0.089881,0.000000,0.000000,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.187798,0.000000,0.000000,0.328780,0.000000,0.000000,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.212428,0.000000,0.000000,...,0.000000,0.000000,0.385680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.0,0.000000,0.000000,0.142332,0.078259,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.139426,0.126022,0.285816,0.000000,0.110315,0.000000,0.000000,0.000000,0.0
59942,0.0,0.084036,0.193807,0.000000,0.056493,0.066949,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.045486,0.000000,0.000000,0.159267,0.062534,0.102665,0.103094,0.0
59943,0.0,0.000000,0.103404,0.000000,0.060283,0.000000,0.000000,0.000000,0.065172,0.181377,...,0.000000,0.000000,0.048538,0.000000,0.069426,0.000000,0.133459,0.000000,0.110010,0.0
59944,0.0,0.000000,0.000000,0.000000,0.000000,0.079786,0.000000,0.000000,0.218355,0.000000,...,0.112628,0.000000,0.000000,0.081961,0.155072,0.000000,0.000000,0.000000,0.000000,0.0


### Training LDA

In [5]:
%%time

# Instantiate the LDA
n_components = 2
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter=100, random_state=42)

# Fit the LDA on the vectorized documents
lda_model.fit(combined_weighted_words)

CPU times: user 15min 8s, sys: 220 ms, total: 15min 8s
Wall time: 15min 8s


### Document topic mixture

In [8]:
%%time
document_topic_mixture = lda_model.transform(combined_weighted_words)

document_topic_mixture

CPU times: user 5.75 s, sys: 0 ns, total: 5.75 s
Wall time: 5.16 s


array([[0.27832158, 0.72167842],
       [0.4749332 , 0.5250668 ],
       [0.28276668, 0.71723332],
       ...,
       [0.70885421, 0.29114579],
       [0.31247611, 0.68752389],
       [0.23743553, 0.76256447]])

In [9]:
# Split the (documents x 2) mixture into one weight vector per topic and put
# them next to the text each row was computed from.
topic_0_weights, topic_1_weights = document_topic_mixture.T

two_topics_df = pd.DataFrame(
    {
        'topic_0_from_two': topic_0_weights,
        'topic_1_from_two': topic_1_weights,
        'original_text': df.combined_preprocessed,
    }
)

In [11]:
# Show the per-document topic weights alongside the source text
two_topics_df

Unnamed: 0,topic_0_from_two,topic_1_from_two,original_text
0,0.278322,0.721678,would love think kind intellectual either dumb...
1,0.474933,0.525067,chef mean workaholic love cook regardless whet...
2,0.282767,0.717233,im ashamed much write public text online date ...
3,0.261315,0.738685,work library go school read thing write old de...
4,0.433234,0.566766,hey hows go currently vague profile know come ...
...,...,...,...
59941,0.603836,0.396164,vibrant expressive care optimist love people t...
59942,0.201254,0.798746,im nick never know write im sure hand im south...
59943,0.708854,0.291146,hello enjoy travel watch movie hang friend rul...
59944,0.312476,0.687524,world ball integrity one take either away momm...


### Topic means

In [None]:
%%time

topic_means = two_topics_df.mean()
print(topic_means)

### Most relevant words per topic

In [None]:
# Topic-by-word weight table: one row per topic, one column per vocabulary term
vocabulary = tf_idf_vectorizer.get_feature_names_out()
topic_word_mixture = pd.DataFrame(data=lda_model.components_, columns=vocabulary)

topic_word_mixture

In [None]:
# Highest-weighted words for every topic (previously only topic 0 was shown),
# laid out side by side: each topic gets a 'word' and a 'weight' column.
TOP_N_WORDS = 10

top_words_per_topic = pd.concat(
    {
        f'topic_{topic_idx}': (
            weights.sort_values(ascending=False)
            .head(TOP_N_WORDS)
            .rename_axis('word')
            .reset_index(name='weight')
        )
        for topic_idx, weights in topic_word_mixture.iterrows()
    },
    axis=1,
)

top_words_per_topic

### Saving and loading the model with pickle

In [None]:
# Save the fitted LDA model so it can be reloaded without refitting.
# The target directory is created first because open(..., 'wb') does not
# create missing parent directories.
# NOTE(review): only the LDA model is pickled; the fitted tf_idf_vectorizer is
# not saved here — confirm it is persisted elsewhere if the model must be
# applied to new text.
model_path = Path('../pkl/model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(lda_model, f)

In [None]:
# Read the pickled model back from disk into `clf2`
with open('../pkl/model.pkl', mode='rb') as f:
    clf2 = pickle.load(f)