In [1]:
import seaborn
import pandas as pd
import numpy as np

In [19]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter


In [18]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
normalizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/dq/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dq/nltk_data...


copy some articles texts and put them into a list

In [4]:
article_1 = '''KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling. Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.'''

article_2 = '''HONG KONG:  Hong Kong shares opened 0.66 percent lower Monday following a tepid lead from Wall Street, as the first full week of the new year kicked off. The benchmark Hang Seng Index dipped 158.63 points to 23,699.19.'''

article_3 = '''KARACHI: Wholesale market rates for sugar dropped to less than Rs 50 per kg following the resumption of sugar cane crushing by sugar mills in Sindh. Within two days, the rate dropped by Rs 1.70 to Rs 49.80 per kg in Karachi Whole Sale Market. According to dealers, the resumption of sugar cane crushing by the mills stabilised the supply to the market with an immediate effect on price as well. Industry experts said that the quality of sugar cane is excellent in Sindh and approximately 100 kg of sugar cane can produce 11 kg of sugar.'''

article_4 = '''ISLAMABAD: Long queues of vehicles on fuel stations were visible in different parts of the country as the petrol became rare commodity on Thursday. Federal Minister for Petroleum Shahid Khaqan Abbasi says "it may take up to ten days to bring the situation to normality". He claimed that northern areas of Pakistan had been facing the petrol shortage. The minister cited the recent decline in petroleum prices and delay in a shipment as reasons for the shortage.He said situation would improve as soon as shipment reached Pakistan. Sources told Geo News hat due to financial restraints the Pakistan State Oil has been unable import petrol.'''

article_5 = '''KARACHI: The final shipment of Chinese manufactured Rail Engines arrived in Pakistan on Friday. Federal Railways Minister, Khwaja Saad Rafique says, the inclusion of the new engines will help ease the shortfall faced by Pakistan Railways. The shipment includes 2000 and 3000-horse-power engines which will be used to pull freight bogeys. Rafique told journalists, the inclusion of 15 new engines has brought Pakistan Railways total strength to 268 engines however more engines are still required.'''

article_6 = '''SYDNEY: Cricket fever has gripped Australia with the World Cup just days away. Fans from around the world have thronged to the country and hotels are capitalising. Prices of rooms have almost doubled to 300 dollars and hotels are experiencing full bookings. Experts estimate that during the mega event Australia will generate 1.5 million US dollars just from hotel bookings. If the cost of internal air travel, taxis and tickets is taken into consideration, Australia stands to generate two million US dollars during the World Cup.'''

article_7 = '''SAN FRANCISCO: Apple Inc aims to begin producing electric vehicles as early as 2020, Bloomberg reported. The report cited people with knowledge of the matter as saying, a seemingly aggressive target for a mobile devices maker with little experience in car manufacture.The iPhone maker is pushing its "car team" of about 200 people to meet that goal. But Apple may decide to scrap its car-making effort, or delay it, if executives grew unhappy with its progress, the news agency said.'''

article_8 = '''LAHORE: Federal Minister for Railways, Khawaja Saad Rafique Tuesday announced good news of pay-raise for the employees of Pakistan Railways. In a media statement, the Minister disclosed that a summary for increase in salaries for the employees of Pakistan Railways has been forwarded to the Prime Minister. He also said that the government had also chalked out a plan to build houses for the Railways workers. Khawaja Saad Rafique said it was expected that the salaries of Railway Police may witness a jump of 20 percent. He also announced the government\x92s plan to launch a new train service between Karachi and Islamabad.'''

article_9 = '''ISLAMABAD: The Federal Cabinet on Tuesday approved the budget strategy paper, sources revealed to Geo News. During the cabinet meeting, Prime Minister Nawaz Sharif said tax rate had to be reduced to increase revenue. He added that people would happily pay taxes if the rate was reduced. The prime minister directed the cabinet to provide maximum relief to people in the budget, emphasising that the economic impact should reach people.'''

article_10 = '''BEIJING: China will keep the yuan basically stable against a basket of currencies and there is no basis for continued yuan depreciation, central bank vice governor Yi Gang said on Sunday. China also will keep foreign exchange reserves at appropriate levels, Yi said.'''

articles = [article_1, article_2, article_3, article_4, article_5, article_6, article_7, article_8, article_9, article_10]

In [5]:
# view article
print(articles[1])

HONG KONG:  Hong Kong shares opened 0.66 percent lower Monday following a tepid lead from Wall Street, as the first full week of the new year kicked off. The benchmark Hang Seng Index dipped 158.63 points to 23,699.19.


some functions to preprocess texts

In [20]:
def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts = Counter()
    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

In [21]:
def preprocess_text(text):
    cleaned = re.sub(r'\W+', ' ', text).lower()
    tokenized = word_tokenize(cleaned)
    normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized if not re.match(r'\d+',token)])
    return normalized

In [22]:
# preprocess articles
processed_articles = [preprocess_text(article) for article in articles]

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [24]:
# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)
print(counts)

  (0, 188)	1
  (0, 77)	1
  (0, 339)	1
  (0, 71)	1
  (0, 325)	1
  (0, 58)	2
  (0, 127)	1
  (0, 207)	1
  (0, 62)	1
  (0, 270)	1
  (0, 333)	1
  (0, 315)	1
  (0, 4)	1
  (0, 66)	1
  (0, 224)	1
  (0, 61)	1
  (0, 186)	1
  (0, 53)	1
  (0, 60)	1
  (0, 42)	2
  (0, 162)	1
  (0, 235)	1
  (0, 75)	1
  (0, 1)	1
  (0, 259)	1
  :	:
  (9, 26)	1
  (9, 51)	1
  (9, 79)	1
  (9, 64)	1
  (9, 28)	1
  (9, 211)	1
  (9, 317)	1
  (9, 70)	1
  (9, 29)	1
  (9, 5)	1
  (9, 292)	1
  (9, 27)	1
  (9, 352)	2
  (9, 170)	2
  (9, 54)	2
  (9, 33)	1
  (9, 11)	1
  (9, 117)	1
  (9, 214)	1
  (9, 13)	1
  (9, 217)	1
  (9, 30)	1
  (9, 343)	2
  (9, 275)	2
  (9, 316)	1


In [25]:
# convert counts to tf-idf
transformer = TfidfTransformer(norm=None)
tfidf_scores_transformed = transformer.fit_transform(counts)
print(tfidf_scores_transformed)

  (0, 169)	7.153829441457081
  (0, 316)	6.0
  (0, 287)	2.2992829841302607
  (0, 132)	6.897848952390782
  (0, 139)	2.9039702474861144
  (0, 74)	2.2992829841302607
  (0, 321)	4.3812407192173
  (0, 39)	2.01160091167848
  (0, 87)	2.7047480922384253
  (0, 242)	5.4094961844768505
  (0, 324)	8.114244276715276
  (0, 110)	10.818992368953701
  (0, 44)	6.03480273503544
  (0, 227)	2.2992829841302607
  (0, 50)	2.7047480922384253
  (0, 89)	2.2992829841302607
  (0, 192)	2.7047480922384253
  (0, 258)	5.4094961844768505
  (0, 152)	5.273814924474138
  (0, 230)	2.2992829841302607
  (0, 239)	2.7047480922384253
  (0, 236)	3.5769147207285403
  (0, 111)	1.6061358035703155
  (0, 129)	2.01160091167848
  (0, 210)	1.6061358035703155
  :	:
  (9, 170)	5.4094961844768505
  (9, 352)	5.4094961844768505
  (9, 27)	2.7047480922384253
  (9, 292)	2.7047480922384253
  (9, 5)	2.7047480922384253
  (9, 29)	2.7047480922384253
  (9, 70)	2.7047480922384253
  (9, 317)	2.7047480922384253
  (9, 211)	2.7047480922384253
  (9, 28)	2.7


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



In [26]:
# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



In [27]:
# check if tf-idf scores are equal
if np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense()):
    print(pd.DataFrame({'Are the tf-idf scores the same?':['Yes :D']}))
else:
    print(pd.DataFrame({'Are the tf-idf scores the same?':['No :(']}))

  Are the tf-idf scores the same?
0                          Yes :D


In [30]:
# get vocabulary of terms
try:
    feature_names = vectorizer.get_feature_names()
except:
    pass

# get article index
try:
    article_index = [ i for i in range(len(articles))]
except:
    pass

# create pandas DataFrame with word counts
try:
    df_word_counts = pd.DataFrame(counts.T.todense(), index=feature_names, columns=article_index)
    print(df_word_counts)
except:
    pass



In [31]:
# create pandas DataFrame(s) with tf-idf scores
try:
    df_tf_idf = pd.DataFrame(tfidf_scores_transformed.T.todense(), index=feature_names, columns=article_index)
    print(df_tf_idf)
except:
    pass

try:
    df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=article_index)
    print(df_tf_idf)
except:
    pass

                      0         1         2         3         4         5  \
abbasi         0.000000  0.000000  0.000000  2.704748  0.000000  0.000000   
abide          2.704748  0.000000  0.000000  0.000000  0.000000  0.000000   
about          0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
accord         0.000000  0.000000  2.704748  0.000000  0.000000  0.000000   
add            2.299283  0.000000  0.000000  0.000000  0.000000  0.000000   
against        0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
agency         0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
aggressive     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
aim            0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
air            0.000000  0.000000  0.000000  0.000000  0.000000  2.704748   
almost         0.000000  0.000000  0.000000  0.000000  0.000000  2.704748   
also           0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

In [32]:
# get highest scoring tf-idf term for each article
for i in range(1, 10):
    print(df_tf_idf[[i]].idxmax())

1    hong
dtype: object
2    sugar
dtype: object
3    petrol
dtype: object
4    engine
dtype: object
5    australia
dtype: object
6    car
dtype: object
7    railway
dtype: object
8    cabinet
dtype: object
9    china
dtype: object
