In [36]:
# Generate dataset from file
df = pd.read_csv ('dataset/corpus.csv', usecols= ['title','description'])

                                                   title                                        description
0           court agrees to expedite n.f.l.'s appeal\r\n  the decision means a ruling could be made near...
1      investing: can you profit in agricultural comm...  bad weather is one factor behind soaring food ...
2      no tsunami but fifa's corruption storm rages o...  though jack warner's threatened soccer tsunami...
3      critic's corner weekend: 'fringe' wraps third ...  joshua jackson's show goes out with a bang. pl...
4      f.b.i. seeks help cracking code in victim's no...  the f.b.i. is asking for the public's help in ...
...                                                  ...                                                ...
32599     new app answers practical, weird questions\r\n  need to know how to fend off a mountain lion o...
32600  alex burrows readies for final with plenty on ...  alex burrows took a day off from hockey, but n...
32601   chip sector bellweth

In [37]:
# Fuse the two columns title and description in one new colun called corpus
df['corpus'] = df['title'] + df['description']
document = df['corpus']
print(document)

0        court agrees to expedite n.f.l.'s appeal\r\nth...
1        investing: can you profit in agricultural comm...
2        no tsunami but fifa's corruption storm rages o...
3        critic's corner weekend: 'fringe' wraps third ...
4        f.b.i. seeks help cracking code in victim's no...
                               ...                        
32599    new app answers practical, weird questions\r\n...
32600    alex burrows readies for final with plenty on ...
32601    chip sector bellwether asml to see strong q1\r...
32602    amazon betting on cloud computing, sacrificing...
32603    twitter says it will stick with san francisco\...
Name: corpus, Length: 32604, dtype: object


In [30]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

In [66]:
# Term Frequency - Inverse Document Frequency Matrix 
# on the document to get most frequents term in the dataset
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(document)

# We will now implement our k-means clustering algorithm in our vectorized document
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=150, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=150,
       n_clusters=10, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [67]:
# Get centroids and Features
order_centroids = model.cluster_centers_.argsort()[:, ::-1] # divided per cluster, all the indexes ordered by term frequency
terms = vectorizer.get_feature_names() # all terms from vectorized space

# Now we can print the centroids into which clusters they belongs
for i in range(true_k):
 print("Cluster %d:" % i),
 for ind in order_centroids[i, :10]:
  print(" %s" % terms[ind])

Cluster 0:
 said
 year
 state
 wednesday
 says
 tuesday
 thursday
 years
 world
 day
Cluster 1:
 game
 nfl
 win
 league
 players
 season
 heat
 mets
 red
 yankees
Cluster 2:
 china
 chinese
 said
 artist
 beijing
 asia
 ai
 inflation
 rights
 government
Cluster 3:
 week
 election
 ivory
 coast
 presidential
 gbagbo
 president
 said
 peru
 ouattara
Cluster 4:
 oil
 prices
 sales
 profit
 percent
 stocks
 billion
 year
 quarter
 fed
Cluster 5:
 japan
 nuclear
 plant
 earthquake
 tsunami
 crisis
 power
 radiation
 japanese
 quake
Cluster 6:
 new
 york
 study
 jersey
 city
 state
 said
 according
 theater
 women
Cluster 7:
 laden
 bin
 forces
 killed
 libyan
 pakistan
 nato
 al
 libya
 said
Cluster 8:
 south
 tornado
 north
 river
 tornadoes
 sudan
 mississippi
 storms
 korea
 joplin
Cluster 9:
 open
 nadal
 french
 djokovic
 canucks
 federer
 final
 masters
 round
 vancouver


In [68]:
# Predict on a custom phrase
print("\n")
TEST = pd.read_csv('dataset/test.csv', usecols= ['user','description'])['description']
X = vectorizer.transform(TEST)
predicteds = model.predict(X)

print("==========================================================")
print(predicteds)
print("==========================================================")

# Print results of prediction: topics in the test sentence
print("Top 3 Predicted topics for:")
print("==========================================================")

for i, predicted in enumerate(predicteds):
  print(TEST[i])
  print("============================ : ===========================")
  for j in order_centroids[predicted, :10]:
    print("%s" % terms[j])
  print("==========================================================")



[5 0 0 0 0 0 0]
Top 3 Predicted topics for:
A nuclear explosion is an explosion that occurs as a result of the rapid release of energy from a high-speed nuclear reaction. ... It is possible to have an air-burst nuclear explosion without those clouds. Nuclear explosions produce radiation and radioactive debris.
japan
nuclear
plant
earthquake
tsunami
crisis
power
radiation
japanese
quake
The absolute number of war deaths has been declining since 1946. In some years in the early post-war era, around half a million people died through direct violence in wars; in contrast, in 2016 the number of all battle-related deaths in conflicts involving at least one state was 87,432.
said
year
state
wednesday
says
tuesday
thursday
years
world
day
Fantasy is a genre of speculative fiction set in a fictional universe, often inspired by real world myth and folklore. Its roots are in oral traditions, which then became fantasy literature and drama. From the twentieth century it has expanded further into 