In [1]:
import pyspark
import os
import re
sc = pyspark.SparkContext.getOrCreate()

In [2]:
import nltk
from nltk import word_tokenize

In [3]:
##nltk.download()

In [4]:
#here we load the data and for every file we get row with key-value set 
#key= the path of the file, value = content of the file

In [5]:
files_rdd = sc.wholeTextFiles("data/*/*.txt")
files_rdd.toDF(["path","text"]).toPandas()

Unnamed: 0,path,text
0,file:/home/edenpa/BigData/project1/data/busine...,Yukos unit buyer faces loan claim\n\nThe owner...
1,file:/home/edenpa/BigData/project1/data/busine...,Soaring oil 'hits world economy'\n\nThe soarin...
2,file:/home/edenpa/BigData/project1/data/busine...,Weak dollar hits Reuters\n\nRevenues at media ...
3,file:/home/edenpa/BigData/project1/data/busine...,Palestinian economy in decline\n\nDespite a sh...
4,file:/home/edenpa/BigData/project1/data/busine...,BMW drives record sales in Asia\n\nBMW has for...
5,file:/home/edenpa/BigData/project1/data/busine...,Metlife buys up Citigroup insurer\n\nUS bankin...
6,file:/home/edenpa/BigData/project1/data/busine...,"Parmalat boasts doubled profits\n\nParmalat, t..."
7,file:/home/edenpa/BigData/project1/data/busine...,India widens access to telecoms\n\nIndia has r...
8,file:/home/edenpa/BigData/project1/data/busine...,Japanese banking battle at an end\n\nJapan's S...
9,file:/home/edenpa/BigData/project1/data/busine...,Ask Jeeves tips online ad revival\n\nAsk Jeeve...


In [6]:
#here we organize the data, every line as such
#[category, article name, article text]

In [7]:
files_rdd = files_rdd.map(lambda (k,v): (k.split('/')[-2],v.split('\n', 1)[0] ,v))
files_rdd.toDF(["category", "article_name", "article_text"]).toPandas()

Unnamed: 0,category,article_name,article_text
0,business,Yukos unit buyer faces loan claim,Yukos unit buyer faces loan claim\n\nThe owner...
1,business,Soaring oil 'hits world economy',Soaring oil 'hits world economy'\n\nThe soarin...
2,business,Weak dollar hits Reuters,Weak dollar hits Reuters\n\nRevenues at media ...
3,business,Palestinian economy in decline,Palestinian economy in decline\n\nDespite a sh...
4,business,BMW drives record sales in Asia,BMW drives record sales in Asia\n\nBMW has for...
5,business,Metlife buys up Citigroup insurer,Metlife buys up Citigroup insurer\n\nUS bankin...
6,business,Parmalat boasts doubled profits,"Parmalat boasts doubled profits\n\nParmalat, t..."
7,business,India widens access to telecoms,India widens access to telecoms\n\nIndia has r...
8,business,Japanese banking battle at an end,Japanese banking battle at an end\n\nJapan's S...
9,business,Ask Jeeves tips online ad revival,Ask Jeeves tips online ad revival\n\nAsk Jeeve...


In [8]:
# 2.a & b: divide every article text to tokens and remove stop words

In [9]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
stop_words_broadcast = sc.broadcast(stop_words)

In [10]:
def tokenizeWithoutStopWords(text):
    filter_tokens = []
    tokens = word_tokenize(re.sub("[^A-Za-z]+"," ",text.lower()))
    for w in tokens:
        if w not in stop_words_broadcast.value and len(w) > 1:
            filter_tokens.append(w)

    return filter_tokens 

In [11]:
#here we separate the content of each file to tokens and remove stop words from the tokens
#every row: [category, article name, tokens]

In [12]:
TEXT = 2
files_devide_to_tokens_rdd = files_rdd.map(lambda row: [row[0], row[1],tokenizeWithoutStopWords(row[TEXT])])

files_devide_to_tokens_rdd.toDF(["category", "article_name", "tokens"]).toPandas()

Unnamed: 0,category,article_name,tokens
0,business,Yukos unit buyer faces loan claim,"[yukos, unit, buyer, faces, loan, claim, owner..."
1,business,Soaring oil 'hits world economy',"[soaring, oil, hits, world, economy, soaring, ..."
2,business,Weak dollar hits Reuters,"[weak, dollar, hits, reuters, revenues, media,..."
3,business,Palestinian economy in decline,"[palestinian, economy, decline, despite, short..."
4,business,BMW drives record sales in Asia,"[bmw, drives, record, sales, asia, bmw, foreca..."
5,business,Metlife buys up Citigroup insurer,"[metlife, buys, citigroup, insurer, us, bankin..."
6,business,Parmalat boasts doubled profits,"[parmalat, boasts, doubled, profits, parmalat,..."
7,business,India widens access to telecoms,"[india, widens, access, telecoms, india, raise..."
8,business,Japanese banking battle at an end,"[japanese, banking, battle, end, japan, sumito..."
9,business,Ask Jeeves tips online ad revival,"[ask, jeeves, tips, online, ad, revival, ask, ..."


In [13]:
#2.d: creating TFtd table


In [14]:
#this function create dictionary of term frequency
def calculate_TFt(tokens_list):
    counter = {}
    for token in tokens_list:
        counter[token] = counter.get(token, 0) + 1
    return counter

In [15]:
files_with_tf_rdd = files_devide_to_tokens_rdd.map(lambda row: [row[0],row[1],calculate_TFt(row[2])])
files_with_tf_rdd.toDF(["category", "article_name", "tf"]).toPandas()

Unnamed: 0,category,article_name,tf
0,business,Yukos unit buyer faces loan claim,"{u'founder': 1, u'obligations': 1, u'money': 1..."
1,business,Soaring oil 'hits world economy',"{u'help': 1, u'supported': 1, u'domestic': 1, ..."
2,business,Weak dollar hits Reuters,"{u'managed': 1, u'gradual': 1, u'executive': 1..."
3,business,Palestinian economy in decline,"{u'particularly': 2, u'unemployment': 1, u'hal..."
4,business,BMW drives record sales in Asia,"{u'operations': 1, u'assembling': 1, u'conside..."
5,business,Metlife buys up Citigroup insurer,"{u'financial': 1, u'deal': 2, u'distributed': ..."
6,business,Parmalat boasts doubled profits,"{u'less': 1, u'advisors': 1, u'years': 1, u'br..."
7,business,India widens access to telecoms,"{u'ignite': 1, u'welcomed': 1, u'executive': 1..."
8,business,Japanese banking battle at an end,"{u'operations': 2, u'yen': 2, u'increasingly':..."
9,business,Ask Jeeves tips online ad revival,"{u'among': 1, u'google': 2, u'excessive': 1, u..."


In [16]:
numOfDocuments = files_with_tf_rdd.count()

In [17]:
#2.e:creating DFt dictionary
#this function implement distinct

In [18]:
def removeDuplicates (tokens_list):
            
    return list(dict.fromkeys(tokens_list))

In [19]:
DFt_rdd = files_devide_to_tokens_rdd.map(lambda row: removeDuplicates(row[2]))
DFt_rdd = DFt_rdd.flatMap(lambda row: row).countByValue()
broadcastDFt = sc.broadcast(DFt_rdd)
broadcastDFt.value

defaultdict(int,
            {u'brownlees': 1,
             u'yellow': 1,
             u'four': 81,
             u'testosterone': 1,
             u'hanging': 2,
             u'conjuring': 1,
             u'cyprus': 1,
             u'towns': 2,
             u'payoff': 1,
             u'increase': 34,
             u'eligible': 5,
             u'electricity': 5,
             u'benitez': 1,
             u'wizardry': 1,
             u'originality': 1,
             u'list': 33,
             u'lord': 21,
             u'ioannidis': 6,
             u'meadows': 2,
             u'digit': 2,
             u'heintze': 1,
             u'kent': 6,
             u'dynamic': 1,
             u'regional': 8,
             u'pierce': 2,
             u'wimax': 2,
             u'holyrood': 6,
             u'stabbed': 2,
             u'bringing': 13,
             u'prize': 27,
             u'wednesday': 41,
             u'viable': 1,
             u'raoul': 1,
             u'hague': 2,
             u'succession'

In [20]:
#2.f Creating Vector table

In [21]:
vector_size = len(broadcastDFt.value.keys())

In [22]:
#this function get TFt dictionary of a document
#using the global DFt dictionary
#this function create the vector of the document as a list of float
#every cell represent coordinate for term t and the value is equal to:
#
#vector[i] = (1 + log(TFtd,10)) * log(numOfDocuments / DFt)  if TFtd > 0
#            0                                               Else
import math
def vectorize(TFt_dict):
    keys = broadcastDFt.value.keys()
    numOfTokens = len(keys)
    vector = [0]*numOfTokens
    i=0
    for token in keys:
        vector[i] = TFt_dict.get(token,0)
        if vector[i] != 0:
            #in case th tf for this token is bigger from 0 than we calculate the weight
            vector[i] = (1 + math.log(vector[i], 10)) * (math.log(numOfDocuments/ broadcastDFt.value.get(token), 10))
        i = i+1
    return vector
        


In [23]:
#this function just calculate the length of the vector and normalize the vector
def normalize(vector):
    sumOfPow2 = 0
    for i in vector:
        sumOfPow2 = sumOfPow2 + i*i
    vectorLength = math.sqrt(sumOfPow2)
    norm_vector = map(lambda x: x / vectorLength, vector)
    return norm_vector

In [24]:
#here we create table that every row: [category, article name, vector]
files_with_vectors_rdd = files_with_tf_rdd.map(lambda row: [row[0],row[1],vectorize(row[2])])
files_with_vectors_rdd.toDF(["category", "article_name", "vector"]).show(5)

+--------+--------------------+--------------------+
|category|        article_name|              vector|
+--------+--------------------+--------------------+
|business|Yukos unit buyer ...|[0, 0, 0, 0, 0, 0...|
|business|Soaring oil 'hits...|[0, 0, 0, 0, 0, 0...|
|business|Weak dollar hits ...|[0, 0, 0, 0, 0, 0...|
|business|Palestinian econo...|[0, 0,, 0, 0, 0, ...|
|business|BMW drives record...|[0, 0, 0, 0, 0, 0...|
+--------+--------------------+--------------------+
only showing top 5 rows



In [25]:
#here we normalize the vector for every document

In [26]:
files_with_norm_vectors_rdd = files_with_vectors_rdd.map(lambda row: [row[0],row[1],normalize(row[2])])
files_with_norm_vectors_rdd.toDF(["category", "article_name", "norm_vector"]).show(5)

+--------+--------------------+--------------------+
|category|        article_name|         norm_vector|
+--------+--------------------+--------------------+
|business|Yukos unit buyer ...|[0.0, 0.0, 0.0, 0...|
|business|Soaring oil 'hits...|[0.0, 0.0, 0.0, 0...|
|business|Weak dollar hits ...|[0.0, 0.0, 0.0, 0...|
|business|Palestinian econo...|[0.0, 0.0, 0.0542...|
|business|BMW drives record...|[0.0, 0.0, 0.0, 0...|
+--------+--------------------+--------------------+
only showing top 5 rows



In [27]:
#3 Finding for every document the 5 documents most similar

In [28]:
files_with_norm_vectors_rdd_broadcast = sc.broadcast(files_with_norm_vectors_rdd.collect())

In [29]:
def calculateCos(v1,v2):
    sizeV = len(v1)
    i = 0
    cos = 0
    while(i<sizeV):
        cos = cos + v1[i] * v2[i]
        i = i+1
    return cos

In [30]:
def confrontListOf5(row,cos,top_5_list):
    distance = abs(1-cos)
    top_5_list.append([row[0],row[1],cos,distance])
    if len(top_5_list)>5:
        maxD = -1
        maxRow = None
        for i in top_5_list:
            if(i[3]>maxD):
                maxD = i[3]
                maxRow = i
        top_5_list.remove(maxRow)
    return top_5_list

In [31]:
def findTopClose5(row):
    top_5_list = []
    counter=0
    for i in files_with_norm_vectors_rdd_broadcast.value:
        if i[1] == row[1]:
            continue
        cos = calculateCos(row[2],i[2])
        top_5_list = confrontListOf5(i,cos,top_5_list)
        print("iteration ",counter," with/",row[1])
        counter = counter +1
    row[2] = map(lambda x: x[:-1],top_5_list)
    print("finish with/",row[1])
    return row

In [32]:
top5_close_article_rdd = files_with_norm_vectors_rdd.map(findTopClose5)

top5_close_article_rdd.take(100)

[[u'business',
  u'Yukos unit buyer faces loan claim',
  [[u'business', u'Yukos accused of lying to court', 0.28030461610659074],
   [u'business', u'Parmalat to return to stockmarket', 0.09071300941278213],
   [u'business', u'China had role in Yukos split-up', 0.3275699602673347],
   [u'business', u'Russia gets investment blessing', 0.11657800664375875],
   [u'business', u'Yukos loses US bankruptcy battle', 0.22488672667476062]]],
 [u'business',
  u"Soaring oil 'hits world economy'",
  [[u'business', u'Industrial output falls in Japan', 0.15403681919859052],
   [u'business', u'German business confidence slides', 0.1287132265420408],
   [u'business', u'Japan economy slides to recession', 0.15576625073183353],
   [u'business', u'Strong demand triggers oil rally', 0.10383409403220573],
   [u'business', u"IMF 'cuts' German growth estimate", 0.1885405513233295]]],
 [u'business',
  u'Weak dollar hits Reuters',
  [[u'business', u'Parmalat boasts doubled profits', 0.08531322385112404],
   [u'b

In [33]:
#4: Search Documents

In [34]:
#this function take query and let it goes through the same process as every document
#to create a vector that represent the query
def calculateNormVector(query):
    tokens = tokenizeWithoutStopWords(query)
    Tft_dict = calculate_TFt(tokens)
    vector = vectorize(Tft_dict)
    return normalize(vector)

In [35]:
#this function get query and return the 10 documents most match to the query
def getTop10MatchDocuments(query):
    norm_vector = calculateNormVector(query)
    files_with_score_rdd = files_with_norm_vectors_rdd.map(lambda row: (calculateCos(norm_vector,row[2]),[row[0],row[1]]))
    top10Documents = files_with_score_rdd.sortByKey( ascending=False).top(10)
    return top10Documents

In [36]:
#here we demonstrate 5 query's results

In [37]:
result = getTop10MatchDocuments('Iranian MPs threaten mobile deal')
result

[(0.1824159334714062, [u'business', u'Iranian MPs threaten mobile deal']),
 (0.09581945879305956, [u'business', u"Turkey-Iran mobile deal 'at risk'"]),
 (0.09031794351730849, [u'tech', u'Iran jails blogger for 14 years']),
 (0.08253258656578713, [u'politics', u'Strike threat over pension plans']),
 (0.06932724186100198, [u'tech', u'Global blogger action day called']),
 (0.06041681644578025, [u'business', u'German growth goes into reverse']),
 (0.050730112629216756,
  [u'entertainment', u"Dutch watch Van Gogh's last film"]),
 (0.04958782009732701, [u'politics', u"EU China arms ban 'to be lifted'"]),
 (0.04746678500041163, [u'politics', u"Visa decision 'every 11 minutes'"]),
 (0.04740701986473111, [u'politics', u"Tories 'would cut number of MPs'"])]

In [38]:
result = getTop10MatchDocuments('apple release new phone')
result

[(0.13122358473408607, [u'tech', u'Apple iPod family expands market']),
 (0.1221101881003855, [u'tech', u'Creator of first Apple Mac dies']),
 (0.10688885716178981, [u'tech', u'Looks and music to drive mobiles']),
 (0.09702065243481324, [u'tech', u'Apple attacked over sources row']),
 (0.07712721790064272, [u'sport', u'Chepkemei joins Edinburgh line-up']),
 (0.07558380554858865, [u'tech', u"Apple laptop is 'greatest gadget'"]),
 (0.07367087468828566, [u'tech', u'The future in your pocket']),
 (0.07348096335376353, [u'tech', u"Mobiles 'not media players yet'"]),
 (0.06321309247495756, [u'tech', u'Moving mobile improves golf swing']),
 (0.05637205584105738, [u'tech', u"'Friends fear' with lost mobiles"])]

In [39]:
result = getTop10MatchDocuments('us economy')
result

[(0.13052158321173868, [u'business', u'US economy shows solid GDP growth']),
 (0.11089557786294216, [u'business', u"IMF 'cuts' German growth estimate"]),
 (0.10315688342552369, [u'business', u'Mixed signals from French economy']),
 (0.09775902160791343, [u'business', u'German growth goes into reverse']),
 (0.09614733272759357, [u'business', u'Japan economy slides to recession']),
 (0.09189225378449573, [u'business', u'Japan narrowly escapes recession']),
 (0.09052721366737784, [u'business', u'Palestinian economy in decline']),
 (0.08969383167401589, [u'business', u'Industrial output falls in Japan']),
 (0.0867721655632652, [u'politics', u"Job cuts 'false economy'  - TUC"])]

In [40]:
result = getTop10MatchDocuments('victoria secrets')
result

[(0.09364395452672605,
  [u'entertainment', u'Bennett play takes theatre prizes']),
 (0.07595090889996216,
  [u'entertainment', u"Artists' secret postcards on sale"]),
 (0.07253421468031208, [u'entertainment', u'West End to honour finest shows']),
 (0.07169090743957085, [u'entertainment', u'Da Vinci film to star Tom Hanks']),
 (0.052704826817009595, [u'politics', u"Nuclear strike 'key terror risk'"]),
 (0.049885607231128094, [u'politics', u'Russian ex-spy on hunger strike']),
 (0.0, [u'tech', u'Yahoo celebrates a decade online']),
 (0.0, [u'tech', u"Xbox power cable 'fire fear'"]),
 (0.0, [u'tech', u'Xbox 2 may be unveiled in summer']),
 (0.0, [u'tech', u'Wi-fi web reaches farmers in Peru'])]

In [41]:
result = getTop10MatchDocuments('GOOGLE TECH')
result

[(0.10679364007277317, [u'tech', u'Google launches TV search service']),
 (0.09663454844206254, [u'tech', u'Security scares spark browser fix']),
 (0.09104057398052501, [u'business', u'Ask Jeeves tips online ad revival']),
 (0.09089566403080483, [u'tech', u"Google's toolbar sparks concern"]),
 (0.07708767737670094, [u'tech', u"'No re-draft' for EU patent law"]),
 (0.06630451409394633, [u'tech', u'Microsoft launches its own search']),
 (0.06339658238293512, [u'tech', u'Web helps collect aid donations']),
 (0.06306294406686896, [u'tech', u'Rich pickings for hi-tech thieves']),
 (0.06229664510781492, [u'tech', u'EU software patent law faces axe']),
 (0.062070249378083975, [u'business', u'Ad sales boost Time Warner profit'])]

In [42]:
#Kmeans 

In [43]:
categories = files_with_vectors_rdd.map(lambda row: row[0]).distinct().collect()

In [44]:
def getRandomCenters():
    centers = {}
    group = 1
    for category in categories:
        row = files_with_norm_vectors_rdd.filter(lambda r: r[0] == category).first()
        centers[group] = row[2]
        group = group +1
    return centers
       

In [45]:
def calculateDistance(v1,v2):
    sub_V = [0]*len(v1)
    for i in range(0,len(v1)):
        sub_V[i] = (v1[i]-v2[i]) * (v1[i]-v2[i])
    return math.sqrt(sum(sub_V))   

In [46]:
def checkIfAlgorithmFinish(current_centers,new_centers):
    groups = current_centers.keys() 
    for group in groups:
        distance = calculateDistance(current_centers[group],new_centers[group])
        if(distance > 0.0000000001):
            return False
    return True

In [47]:
def findNearestCenters(row,current_centers):
    groups = current_centers.keys()
    minDistance = -1
    minGroup = 0
    for group in groups:
        distance = calculateDistance(row[2],current_centers[group])
        if(distance < minDistance or minDistance == -1):
            minDistance = distance
            minGroup = group
    return(minGroup, row) 

In [48]:
def calculateCenter(rows):
    center = [0]*vector_size
    if(len(rows) > 0):
        for r in rows:
            v=r[2]
            for i in range(0,vector_size):
                center[i] = center[i] + v[i]
        for i in range(0,vector_size):
            center[i] = center[i] / len(rows)
    return center

In [49]:
def calculateNewCenters(cluster_vectors):
    centers = cluster_vectors.map(lambda (k,v): (k,calculateCenter(list(v))))
    return dict(centers.collect())


In [50]:
def kmeans():
    current_centers = getRandomCenters()
    finish = False
    i = 0
    while finish== False:
        cluster_vectors = files_with_norm_vectors_rdd.map(lambda row:findNearestCenters(row,current_centers)).groupByKey()
        new_centers = calculateNewCenters(cluster_vectors)
        finish = checkIfAlgorithmFinish(current_centers,new_centers)
        current_centers = new_centers
        i = i+1
    return [cluster_vectors,current_centers]

In [51]:
cluster_vectors_and_centers = kmeans()
cluster_vectors = cluster_vectors_and_centers[0]
centers = cluster_vectors_and_centers[1]

In [52]:
#here we can see the result of the kmeans algorithm
#we can see that every cluster have a strong connection to one category label 
#although we have some noise in the data

In [53]:
cluster_vectors.map(lambda(k,v): (k,map(lambda x: x[:-1],list(v)))).take(5)

[(2,
  [[u'business', u'Yukos unit buyer faces loan claim'],
   [u'business', u'Weak dollar hits Reuters'],
   [u'business', u'BMW drives record sales in Asia'],
   [u'business', u'Metlife buys up Citigroup insurer'],
   [u'business', u'Parmalat boasts doubled profits'],
   [u'business', u'Japanese banking battle at an end'],
   [u'business', u'Ask Jeeves tips online ad revival'],
   [u'business', u'EU aiming to fuel development aid'],
   [u'business', u'Japanese mogul arrested for fraud'],
   [u'business', u'Burren awarded Egyptian contracts'],
   [u'business', u'Telegraph newspapers axe 90 jobs'],
   [u'business', u'Lufthansa may sue over Bush visit'],
   [u'business', u'Yukos accused of lying to court'],
   [u'business', u'Peugeot deal boosts Mitsubishi'],
   [u'business', u"Chinese wine tempts Italy's Illva"],
   [u'business', u'Hyundai to build new India plant'],
   [u'business', u'Electrolux to export Europe jobs'],
   [u'business', u'Worldcom ex-boss launches defence'],
   [u'bu

In [54]:
#here we try to learn from each center of cluster which words are have strong connection to the cluster

In [55]:
def top20WordsFromVector(vector):
    words = []
    keys = broadcastDFt.value.keys()
    for i in range(0, vector_size):
        if vector[i] > 0.0:
            words.append((vector[i],keys[i]))
        if len(words)>20:
            minW = float("inf")
            minElement = None
            for element in words:
                if(element[0] < minW):
                    minW = element[0]
                    minElement = element
            words.remove(minElement)
    return map(lambda x: x[1],words)

In [56]:
def getTopWordsForEveryCluster(cluster):
    words_cluster={}
    keys = cluster.keys()
    for key in keys:
        words_cluster[key] = top20WordsFromVector(cluster[key])
    return words_cluster

In [57]:
words_cluster = getTopWordsForEveryCluster(centers)
words_cluster

{1: [u'internet',
  u'information',
  u'virus',
  u'firm',
  u'software',
  u'web',
  u'websites',
  u'online',
  u'spyware',
  u'using',
  u'computer',
  u'net',
  u'programs',
  u'mail',
  u'microsoft',
  u'security',
  u'program',
  u'windows',
  u'users',
  u'use'],
 2: [u'financial',
  u'debt',
  u'sale',
  u'firm',
  u'bn',
  u'owned',
  u'production',
  u'shares',
  u'market',
  u'sales',
  u'investment',
  u'deal',
  u'profits',
  u'group',
  u'oil',
  u'chief',
  u'executive',
  u'company',
  u'profit',
  u'growth'],
 3: [u'say',
  u'many',
  u'ministers',
  u'government',
  u'secretary',
  u'labour',
  u'blair',
  u'economic',
  u'need',
  u'party',
  u'mr',
  u'public',
  u'would',
  u'report',
  u'election',
  u'says',
  u'plans',
  u'minister',
  u'economy',
  u'bbc'],
 4: [u'birmingham',
  u'training',
  u'world',
  u'iaaf',
  u'gold',
  u'race',
  u'holmes',
  u'thanou',
  u'medal',
  u'championships',
  u'sport',
  u'win',
  u'indoor',
  u'kenteris',
  u'olympics',
  u'

In [58]:
#Strong word for every cluster
# 1) (tech) => internet, virus, software, web, websites, online, spyware, computer, 
#              programs, mail, microsoft',u'security',u'program',u'windows',u'users',
#
# 2) (business) => financial, debt, sale, owned, production, market, sales,
#                  investment, deal, profits, company, profit, growth
#
# 3) (politics) => ministers, government, labour, party, election, minister
#
# 4) (sport) => training, world, race, medal, championships ,sport, win
#               olympics, olympic, season, athletics, athens, champion
#
# 5) (entertainment) =>  prize, starring, films, director, movie, comedy, award
#                        oscars, actor, star, oscar, hollywood, film, awards, ceremony