In [115]:
import pandas as pd

In [116]:
# Load the dataset with correct encoding
df = pd.read_csv('Articles.csv', encoding='ISO-8859-1')

# Keep only the 'Article' column
df = df[['Article']]

# Optional: Show first few rows
df.head()


Unnamed: 0,Article
0,KARACHI: The Sindh government has decided to b...
1,HONG KONG: Asian markets started 2015 on an up...
2,HONG KONG: Hong Kong shares opened 0.66 perce...
3,HONG KONG: Asian markets tumbled Tuesday follo...
4,NEW YORK: US oil prices Monday slipped below $...


In [117]:
df['Article'][0]

'KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling.Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.                        \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n'

In [118]:
len(df)

2692

# Preprocessing

In [119]:
from sklearn.feature_extraction.text import CountVectorizer

In [120]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [121]:
dtm = cv.fit_transform(df['Article'])                 # dtm is a varible full form is document-terms matrix

In [122]:
dtm

<2692x16067 sparse matrix of type '<class 'numpy.int64'>'
	with 332553 stored elements in Compressed Sparse Row format>

# LDA(Latent Dirichlet Allocation)

In [123]:
from sklearn.decomposition import LatentDirichletAllocation

In [124]:
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [125]:
LDA.fit(dtm)

# Showing Stored Words

In [126]:
len(cv.get_feature_names_out())

16067

In [127]:
type(cv.get_feature_names_out())

numpy.ndarray

In [128]:
cv.get_feature_names_out()[16000]

'younus'

In [129]:
cv.get_feature_names_out()[15000]

'type'

In [130]:
import random

In [131]:
random_word_id = random.randint(0,16067)
print(cv.get_feature_names_out()[random_word_id])

centenario


In [132]:
for i in range(10):
    random_word_id = random.randint(0,16067)
    print(cv.get_feature_names_out()[random_word_id])

partnering
ruling
assurances
viable
driven
roelof
cover
staging
monumental
expenditures


# Grab the topics

In [133]:
len(LDA.components_)

7

In [134]:
type(LDA.components_)

numpy.ndarray

In [135]:
LDA.components_.shape

(7, 16067)

In [136]:
len(LDA.components_[0])

16067

In [137]:
LDA.components_

array([[1.44597589e-01, 6.60667355e+00, 1.14174243e+00, ...,
        1.42857147e-01, 1.42857145e-01, 1.42857163e-01],
       [1.68862238e+00, 1.73688834e+02, 1.42857169e-01, ...,
        1.42862889e-01, 1.42857144e-01, 1.42857155e-01],
       [1.42998716e-01, 2.03733099e+02, 1.49819595e-01, ...,
        1.42857146e-01, 1.42859620e-01, 1.42857159e-01],
       ...,
       [1.43720072e-01, 2.12322547e+01, 1.42990751e-01, ...,
        1.42857147e-01, 6.14284523e+00, 1.42857161e-01],
       [1.32722791e+01, 3.55220258e+01, 1.13640660e+00, ...,
        6.14284131e+00, 1.42866569e-01, 2.14285704e+00],
       [3.11768072e+01, 5.63931683e+01, 1.43326276e-01, ...,
        1.42857147e-01, 1.42857145e-01, 1.42857163e-01]])

In [138]:
single_topic = LDA.components_[0]

In [139]:
single_topic.argsort()

array([15558,  8989, 12813, ..., 12372,  8341, 13519], dtype=int64)

In [140]:
# Only for understanding for argsort
import numpy as np 
arr = np.array([10,200,1])
arr

array([ 10, 200,   1])

In [141]:
arr.argsort()

array([2, 0, 1], dtype=int64)

In [142]:
single_topic.argsort()[-10:]

array([15719,  2040, 10327,  7438, 15760,  9033,  2914, 12372,  8341,
       13519], dtype=int64)

In [143]:
top_word_indices = single_topic.argsort()[-10:]

In [144]:
for index in top_word_indices:
    print(cv.get_feature_names_out()[index])

west
balls
pakistan
india
wickets
match
captain
runs
lanka
sri


In [145]:
# For 20 word
top_word_indices = single_topic.argsort()[-20:]

In [146]:
for index in top_word_indices:
    print(cv.get_feature_names_out()[index])

stadium
fours
team
world
twenty20
bangladesh
wicket
mohammad
indies
overs
west
balls
pakistan
india
wickets
match
captain
runs
lanka
sri


# Grab highest probability words per topic

In [147]:
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['bangladesh', 'wicket', 'mohammad', 'indies', 'overs', 'west', 'balls', 'pakistan', 'india', 'wickets', 'match', 'captain', 'runs', 'lanka', 'sri']


THE TOP 15 WORDS FOR TOPIC #1
['demand', 'output', 'million', 'production', 'barrel', 'opec', 'global', 'week', 'year', 'percent', 'market', 'crude', 'prices', 'said', 'oil']


THE TOP 15 WORDS FOR TOPIC #2
['world', 'president', 'karachi', 'company', 'country', 'china', 'minister', 'government', 'bank', 'strong', 'billion', 'million', 'new', 'pakistan', 'said']


THE TOP 15 WORDS FOR TOPIC #3
['growth', 'power', 'strong', '2015', 'rs', 'country', 'pakistan', 'minister', 'percent', 'government', 'tax', 'million', 'billion', 'year', 'said']


THE TOP 15 WORDS FOR TOPIC #4
['zealand', 'south', 'pakistan', 'australia', 'match', 'new', 'runs', 'wicket', 'innings', 'day', 'series', 'ball', 'second', 'test', 'england']


THE TOP 15 WORDS FOR TOPIC #5
['final', 'test', 'play', 'england', 'india', 'match', 'players'

In [148]:
dtm

<2692x16067 sparse matrix of type '<class 'numpy.int64'>'
	with 332553 stored elements in Compressed Sparse Row format>

In [149]:
dtm.shape

(2692, 16067)

In [150]:
len(df1)

2692

In [151]:
topic_results = LDA.transform(dtm)

In [152]:
topic_results

array([[2.20176726e-03, 2.20670832e-03, 2.80663192e-01, ...,
        2.20299771e-03, 2.20070716e-03, 2.20213504e-03],
       [2.73209844e-04, 1.86474048e-01, 2.73688208e-04, ...,
        2.73085591e-04, 2.73224240e-04, 8.12159048e-01],
       [4.61298038e-03, 4.61810467e-03, 4.61340835e-03, ...,
        4.61502081e-03, 4.61402172e-03, 9.72314127e-01],
       ...,
       [8.17888399e-04, 8.18263865e-04, 9.95090133e-01, ...,
        8.20051687e-04, 8.18562970e-04, 8.17782182e-04],
       [9.73214222e-04, 9.75080778e-04, 7.63928688e-01, ...,
        9.73461734e-04, 1.76398518e-02, 7.95178010e-02],
       [3.97089757e-04, 3.96677032e-04, 2.15435199e-01, ...,
        3.96690500e-04, 2.72703908e-01, 3.97159066e-04]])

# NMF(NON NEGATIVE MATRIX FACTORIZATION)

In [153]:
import pandas as pd

In [154]:
df1 = pd.read_csv('Articles.csv', encoding='ISO-8859-1')
df1 = df[['Article']]
df1.head()

Unnamed: 0,Article
0,KARACHI: The Sindh government has decided to b...
1,HONG KONG: Asian markets started 2015 on an up...
2,HONG KONG: Hong Kong shares opened 0.66 perce...
3,HONG KONG: Asian markets tumbled Tuesday follo...
4,NEW YORK: US oil prices Monday slipped below $...


In [155]:
df1.head()

Unnamed: 0,Article
0,KARACHI: The Sindh government has decided to b...
1,HONG KONG: Asian markets started 2015 on an up...
2,HONG KONG: Hong Kong shares opened 0.66 perce...
3,HONG KONG: Asian markets tumbled Tuesday follo...
4,NEW YORK: US oil prices Monday slipped below $...


In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [157]:
tfidf=TfidfVectorizer(max_df=0.95,min_df=2,stop_words='english')

In [158]:
dtm=tfidf.fit_transform(df1['Article'])

In [159]:
dtm

<2692x16067 sparse matrix of type '<class 'numpy.float64'>'
	with 332553 stored elements in Compressed Sparse Row format>

In [160]:
from sklearn.decomposition import NMF

In [161]:
nmf_model=NMF(n_components=7,random_state=42)

In [162]:
nmf_model.fit(dtm)

In [163]:
len(tfidf.get_feature_names_out())

16067

In [164]:
import random

In [165]:
for i in range(10):
    random_word_id=random.randint(0,16067)
    print(tfidf.get_feature_names_out()[random_word_id])

shaping
partly
mainland
weather
litres
attar
briefed
cci
notable
readings


In [166]:
len(nmf_model.components_)

7

In [167]:
nmf_model.components_

array([[0.        , 0.10595509, 0.00063785, ..., 0.        , 0.        ,
        0.        ],
       [0.00242285, 0.0161716 , 0.        , ..., 0.        , 0.01344721,
        0.        ],
       [0.        , 0.09484366, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00675957, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00256317, 0.0316448 , 0.0002208 , ..., 0.00827725, 0.        ,
        0.00412608],
       [0.03818982, 0.01510753, 0.00041252, ..., 0.        , 0.        ,
        0.        ]])

In [168]:
len(nmf_model.components_[0])

16067

In [169]:
single_topics=nmf_model.components_[0]

In [170]:
single_topics.argsort()

array([    0,  8471,  8467, ...,  9322, 12431, 10327], dtype=int64)

In [171]:
single_topics[16000]

0.016924992720819064

In [172]:
single_topics.argsort()[-10:]

array([14391,  7313,  5024,  3989,  3918,  6574,  2335,  9322, 12431,
       10327], dtype=int64)

In [173]:
top_word_indices=single_topics.argsort()[-10:]

In [174]:
for index in top_word_indices:
    print(tfidf.get_feature_names_out()[index])

tax
imf
economic
cricket
country
government
billion
minister
said
pakistan


In [175]:
for index ,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([[tfidf.get_feature_names_out()[i]for i in topic.argsort()[-15:]]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
[['pcb', 'islamabad', 'finance', 'india', 'china', 'tax', 'imf', 'economic', 'cricket', 'country', 'government', 'billion', 'minister', 'said', 'pakistan']]


THE TOP 15 WORDS FOR TOPIC #1
[['south', 'africa', 'root', 'day', 'lord', 'wicket', 'ball', 'amir', 'cook', 'lanka', 'sri', 'innings', 'series', 'test', 'england']]


THE TOP 15 WORDS FOR TOPIC #2
[['brent', 'iran', 'demand', 'market', 'said', 'million', 'barrels', 'cents', 'output', 'production', 'barrel', 'opec', 'prices', 'crude', 'oil']]


THE TOP 15 WORDS FOR TOPIC #3
[['fed', 'week', 'stocks', 'rose', 'rate', 'shares', 'markets', 'points', 'tokyo', 'bank', 'gold', 'index', 'dollar', 'yen', 'percent']]


THE TOP 15 WORDS FOR TOPIC #4
[['stadium', 'sixes', 'twenty20', 'pakistan', 'wicket', 'fours', 'overs', 'bangladesh', 'match', 'captain', 'balls', 'mohammad', 'india', 'wickets', 'runs']]


THE TOP 15 WORDS FOR TOPIC #5
[['ronaldo', 'players', 'france', 'open', 'win', 'title', 'messi', 'game', '

In [176]:
dtm

<2692x16067 sparse matrix of type '<class 'numpy.float64'>'
	with 332553 stored elements in Compressed Sparse Row format>

In [177]:
dtm.shape

(2692, 16067)

In [178]:
len(df1)

2692

In [179]:
topics_result=nmf_model.transform(dtm)

In [180]:
topics_result.shape

(2692, 7)

In [181]:
topics_result[0]

array([0.0217303 , 0.        , 0.00701783, 0.        , 0.00051753,
       0.        , 0.07753032])

In [182]:
topics_result[0].round(2)

array([0.02, 0.  , 0.01, 0.  , 0.  , 0.  , 0.08])

In [183]:
topics_result[0].argmax

<function ndarray.argmax>

In [184]:
topics_result.argmax(axis=1)

array([6, 3, 3, ..., 5, 0, 5], dtype=int64)

In [185]:
df1['Topic']=topics_result.argmax(axis=1)

In [186]:
df1.head(10)

Unnamed: 0,Article,Topic
0,KARACHI: The Sindh government has decided to b...,6
1,HONG KONG: Asian markets started 2015 on an up...,3
2,HONG KONG: Hong Kong shares opened 0.66 perce...,3
3,HONG KONG: Asian markets tumbled Tuesday follo...,3
4,NEW YORK: US oil prices Monday slipped below $...,2
5,New York: Oil prices tumbled Tuesday to fresh ...,2
6,KARACHI: Strong bulls on Friday pulled the ben...,3
7,"Singapore: Oil fell further in Asia Monday, wi...",2
8,KARACHI: Wholesale market rates for sugar drop...,6
9,SYDNEY: Oil prices fell 1 percent on Wednesday...,2
