# Installing

In [1]:
!pip install pymongo

Collecting pymongo
[?25l  Downloading https://files.pythonhosted.org/packages/30/f9/78dd244df932309299288a452d1c3524f6f7746f1813b8a8417952b1d9ce/pymongo-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (378kB)
[K    100% |████████████████████████████████| 389kB 9.1MB/s 
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-3.6.1


# Importing

In [2]:
# Data preprocessing
import pandas as pd
import numpy as np
import datetime

# Stopwords, stemming, and tokenizing
import sys
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
stemmerFR = SnowballStemmer("french")
stemmerEN = SnowballStemmer("english")
import re
from scipy import optimize

# MongoDB imports 
import pymongo
from pymongo import MongoClient

# Tf-idf and document similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering
from sklearn.cluster import KMeans



[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Data Retrieval

**Connection to MongoDB**

In [0]:
client = MongoClient('mongodb://root:WebM2018Mse@dds-4xo14776d7c78aa41621-pub.mongodb.germany.rds.aliyuncs.com:3717/admin')
db = client.news

** Le temps dataset**

In [0]:
COLLECTION_NAME = "allArticles"

articles_temps = list(db[COLLECTION_NAME].find({}))
dataframe = pd.DataFrame(articles_temps)

** Convert ArticleDate to Datetime format and sort dataframe by date **

In [0]:
dataframe['articleDateFormatted'] = pd.to_datetime(dataframe['articleDate'], unit='ms')
dataframe = dataframe.sort_values(by=['articleDateFormatted'])

** Remove entries where clearedHTML is empty **

In [7]:
dataframe = dataframe[dataframe['clearedHTML'] != ""].reset_index()
dataframe.head()

Unnamed: 0,level_0,index,_id,articleDate,biMonth,cleanContent,clearedHTML,crawlDate,html,id,source,tags,test,title,url,articleDateFormatted
0,0,115858,5ada135224ac6a250e026bdf,-3600,1970001,publi octobr 2017la polic berlinois annonc qu'...,Publié le 03 octobre 2017La police berlinoise ...,1524241234,"<article>\n\t\t\t<div class=""rts-module-infosp...",cd233eda7c506f5b810fa10fc0839a92,rts,[Monde],simo,Evacuation massive à Berlin à cause d'une bomb...,https://www.rts.ch/info/monde/8968798-evacuati...,1969-12-31 23:59:56.400
1,1,113174,5ad9f48a24ac6a117a12608e,939074400,1999019,telecom c'est samedi gen telecom sera aussi da...,"Telecom 99, c'est samedi à Genève. Telecom 200...",1524233354,"<div itemprop=""articleBody"">\n <p class=""le...",e56272d2482f7cafda8e7c791b838bab,swissinfo,[],,Telecom en 2003 comme en 1999 : c’est Genève,https://www.swissinfo.ch/fre/telecom-en-2003-c...,1970-01-11 20:51:14.400
2,2,130914,5ad9f54f24ac6a117a128812,939333600,1999019,dimanch jusqu janvier mus beyel riehen dan can...,"Dès ce dimanche et jusqu’au 9 janvier, le musé...",1524233551,"<div itemprop=""articleBody"">\n <p class=""le...",47faaca8b5e189b507ba93301c7e74f2,swissinfo,[],,Le Musée Beyeler instaure un dialogue entre Cé...,https://www.swissinfo.ch/fre/le-mus%C3%A9e-bey...,1970-01-11 20:55:33.600
3,3,113175,5ad9f48a24ac6a117a12608f,939420000,1999019,c'est week end que telecom ouvr ses port des c...,C'est ce week-end que Telecom 99 ouvre ses por...,1524233354,"<div itemprop=""articleBody"">\n <p class=""le...",c112934bb26109ff5cc43ca3a0e473d0,swissinfo,[],,Genève ouvre Telecom 99 : mobilisation générale,https://www.swissinfo.ch/fre/gen%C3%A8ve-ouvre...,1970-01-11 20:57:00.000
4,4,113176,5ad9f48a24ac6a117a126090,939592800,1999019,gen est pour une semain mecqu l'industri des c...,"Genève est, pour une semaine, la Mecque de l'i...",1524233354,"<div itemprop=""articleBody"">\n <p class=""le...",4943a52f96385a12d62d65fc26bc54cd,swissinfo,[],,Telecom 99 : la grand-messe d’une industrie en...,https://www.swissinfo.ch/fre/telecom-99---la-g...,1970-01-11 20:59:52.800


** Retrieval of the corpus **

In [0]:
db_corpus = client.corpus
pairs_got = db_corpus.allArticles.find_one({"id": "pairs2"})["pairs"]

# Pre-processing (Stopwords, stemming, and tokenizing)

** New tokenize stem and cleaning method **

In [0]:
alphaOnly = lambda w: not w.replace('"', '').replace("'", '')\
          .replace(".", '').replace(",", '').isnumeric()

gt2 = lambda w: len(w) > 2

ppFilters = lambda w: alphaOnly(w) and gt2(w)

def clean_str(string):
  """Original taken
  from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
  """
  string = re.sub(r"[^A-Za-z0-9()èéàÉÀêÊçÇ]", " ", string)
  string = re.sub(r"\'s", " \'s", string)
  string = re.sub(r"\'ve", " \'ve", string)
  string = re.sub(r"n\'t", " n\'t", string)
  string = re.sub(r"\'re", " \'re", string)
  string = re.sub(r"\'d", " \'d", string)
  string = re.sub(r"\'ll", " \'ll", string)
  string = re.sub(r",", " , ", string)
  string = re.sub(r"!", " ! ", string)
  string = re.sub(r"\(", " \( ", string)
  string = re.sub(r"\)", " \) ", string)
  string = re.sub(r"\?", " \? ", string)
  string = re.sub(r"\s{2,}", " ", string)
  return string.strip().lower()


def ppMaps(w):
  #w = clean_str(w)
  w = stemmerFR.stem(w)
  #w = stemmerEN.stem(w)
  return w

def preprocessing(tokens):
  tokens = list(filter(ppFilters, map(ppMaps, tokens)))
  return tokens

# Call `pro£cess_line` to use it.
def process_line(line):
  return preprocessing(word_tokenize(clean_str(line)))

** Create the dataframe from two lists **

In [0]:
def vocabulary_frame_mongodb(totalvocab_tokenized, totalvocab_stemmed):     
  return pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

# TFidf-vectorization

In [0]:
def tfidfVectorization(corpus, tokenize_func, stopwords_list, ngram_max=1, max_df=0.8, min_df=0.1):
  vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, use_idf=True,
                               stop_words=stopwords_list, tokenizer=tokenize_func, 
                               ngram_range=(1,ngram_max))
   
  vectors_words_corpus = vectorizer.fit_transform(corpus) 
  terms = vectorizer.get_feature_names()
  return (vectors_words_corpus, terms)

# Clustering

In [0]:
def clustering(value_matrix, nb_clusters=5):
  # prevent kmeans from crashing if the number of clusters is too high
  nb_elements_in_matrix = len(value_matrix.toarray())
  if nb_elements_in_matrix < nb_clusters:
    nb_clusters = nb_elements_in_matrix
   
  # Makes the clustering andthe fitting
  kmeans_fitted = KMeans(n_clusters=nb_clusters)
  kmeans_fitted.fit(value_matrix)
  return kmeans_fitted

# Analyzing of the clustering

In [0]:
def findImportantWords(kmeans, vocab_frame, vocabulary_dataset, nb_words_for_clusters=6):
  order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1] 
  nb_clusters = len(order_centroids)
  words_all_cluster = []
  for i in range(nb_clusters):
    words_cluster = []
    for ind in order_centroids[i, :nb_words_for_clusters]: 
      word = vocab_frame.loc[vocabulary_dataset[ind].split(' ')].values.tolist()[0][0]      
      words_cluster.append(word)
    words_all_cluster.append(words_cluster)
  return words_all_cluster

# Final pipeline on a single subset

In [0]:
N_CLUSTERS = 5
FIELDNAME = 'clearedHTML'
TITLENAME = 'title'

In [0]:
unique_biMonth = dataframe['biMonth'].unique()
tuples_biMonth = [(unique_biMonth[i], unique_biMonth[i+1]) for i in range(len(unique_biMonth)-1)]

** Work on a subset of the entire dataframe (one month) **

In [0]:
dataframe_subset = dataframe[dataframe['biMonth'].isin(tuples_biMonth[-1])]

** Transform dataframe into two lists (title and content) **

In [0]:
corpus = np.asarray(dataframe_subset[FIELDNAME])

** Stopwords, stemming, and tokenizing **

In [0]:
totalvocab_tokenized = list(pairs_got.keys())
totalvocab_stemmed = list(pairs_got.values())
vocab_frame = vocabulary_frame_mongodb(totalvocab_tokenized, totalvocab_stemmed)
stopwords_french = nltk.corpus.stopwords.words('french')

** Tf-idf and document similarity **

In [0]:
(vectors_words_corpus, vocabulary_dataset) = tfidfVectorization(corpus, process_line, stopwords_french, 3)

** K-means clustering **

In [0]:
num_clusters = N_CLUSTERS
kmeans_processing = clustering(vectors_words_corpus, 5)

** Display of the results **

In [197]:
NB_WORDS_FOR_CLUSTER = 6
#sort cluster centers by proximity to centroid
order_centroids = kmeans_processing.cluster_centers_.argsort()[:, ::-1] 
nb_clusters = len(order_centroids)
for i in range(nb_clusters):
    print("Cluster %d words:" % i)
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s,' % vocab_frame.loc[vocabulary_dataset[ind].split(' ')].values.tolist()[0][0], end='')
    print("")

Cluster 0 words:
 francea, partiez, deux, milliona, premiers, selona,
Cluster 1 words:
 suissea, conseils, contrera, premiers, paye, troisa,
Cluster 2 words:
 femmea, hommasse, tout, mêmes, égales, êtres,
Cluster 3 words:
 américainea, etat, unis, etat, présidente, européenn,
Cluster 4 words:
 commissions, tout, mêmes, faits, grandirent, deux,


# Final pipeline generalized

In [0]:
N_CLUSTERS = 5
FIELDNAME = 'clearedHTML'
NB_WORDS_FOR_CLUSTER = 6
NGRAM_MAX = 1

** Initialisation, make it when you change the dataframe **

In [0]:
unique_biMonth = dataframe['biMonth'].unique()
unique_biMonth.sort()
tuples_biMonth = [(unique_biMonth[i], unique_biMonth[i+1]) for i in range(len(unique_biMonth)-1)]

In [0]:
totalvocab_tokenized = list(pairs_got.keys())
totalvocab_stemmed = list(pairs_got.values())
vocab_frame = vocabulary_frame_mongodb(totalvocab_tokenized, totalvocab_stemmed)

In [0]:
stopwords_french = nltk.corpus.stopwords.words('french')

** Make the tfidf-vectorization, clustering, finding of words on one tuple biMonth **

In [0]:
def topicAnalysisProcessingForMonth(dataframe, tuple_biMonth, vocab_frame, result_dict, nb_clusters=5, nb_words_by_cluster=5, ngram_max=2):
  dataframe_subset = dataframe[dataframe['biMonth'].isin(tuple_biMonth)]
  corpus = np.asarray(dataframe_subset[FIELDNAME])
  print("Tuple %s: Begin " % (tuple_biMonth,), end='')
  
  # TFidf-vectorization
  try:
    (vectors_words_corpus, vocabulary_dataset) = tfidfVectorization(corpus, process_line, stopwords_french, ngram_max, min_df=0.1) 
    print(", TFidf-vectorization done", end='')
  except ValueError:
    print(", After pruning, no terms remain. Try a lower min_df or a higher max_df !!!!!!")
    return
  
  # clustering 
  kmeans_processing = clustering(vectors_words_corpus, nb_clusters)
  print(", Clustering done", end='')
    
  # find most Important Words
  words_all_cluster = findImportantWords(kmeans_processing, vocab_frame, vocabulary_dataset, nb_words_by_cluster)
  print(", findImportantWords done")
  
  key = "%d-%d" % (tuple_biMonth)
  result_dict[key] = words_all_cluster  

In [0]:
def printResults(result_dict):
  for key, words in result_dict.items():
    print("\n" + key + ":")
    for idx, val in enumerate(words):
      print("Cluster %d words:" % idx)
      print(val)

** Processing on the entire dataset **

In [0]:
def topicAnalysisProcessing(limit=None):
  reversed_tuples_biMonth = list(reversed(tuples_biMonth))
  result_dict = {}
  for idx, tuple_biMonth in enumerate(reversed_tuples_biMonth):
    """if limit != None and idx >= limit:
      return result_dict"""
    topicAnalysisProcessingForMonth(dataframe, tuple_biMonth, vocab_frame, result_dict, N_CLUSTERS, NB_WORDS_FOR_CLUSTER, NGRAM_MAX)
  return result_dict

In [22]:
%time result_dict = topicAnalysisProcessing()

Tuple (2018009, 2018010): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018008, 2018009): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018007, 2018008): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018006, 2018007): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018005, 2018006): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018004, 2018005): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018003, 2018004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018002, 2018003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2018001, 2018002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017024, 2018001): Begin , TFidf-vectorization done, Clustering done, findImportantW

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017007, 2017008): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017006, 2017007): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017005, 2017006): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017004, 2017005): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017003, 2017004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017002, 2017003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2017001, 2017002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016024, 2017001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016023, 2016024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016022, 201602

, Clustering done, findImportantWords done
Tuple (2016006, 2016007): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016005, 2016006): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016004, 2016005): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016003, 2016004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016002, 2016003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2016001, 2016002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015024, 2016001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015023, 2015024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015022, 2015023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015021, 2015022): Begin , TFidf-vectoriz

, findImportantWords done
Tuple (2015005, 2015006): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015004, 2015005): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015003, 2015004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015002, 2015003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2015001, 2015002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014024, 2015001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014023, 2014024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014022, 2014023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014021, 2014022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014020, 2014021): Begin , TFidf-vectorization done, Clust

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014003, 2014004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014002, 2014003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2014001, 2014002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013024, 2014001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013023, 2013024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013022, 2013023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013021, 2013022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013020, 2013021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013019, 2013020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013018, 201301

, Clustering done, findImportantWords done
Tuple (2013002, 2013003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2013001, 2013002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012024, 2013001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012023, 2012024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012022, 2012023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012021, 2012022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012020, 2012021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012019, 2012020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012018, 2012019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2012017, 2012018): Begin , TFidf-vectoriz

, findImportantWords done
Tuple (2012001, 2012002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011024, 2012001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011023, 2011024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011022, 2011023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011021, 2011022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011020, 2011021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011019, 2011020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011018, 2011019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011017, 2011018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2011016, 2011017): Begin , TFidf-vectorization done, Clust

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010023, 2010024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010022, 2010023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010021, 2010022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010020, 2010021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010019, 2010020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010018, 2010019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010017, 2010018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010016, 2010017): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010015, 2010016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2010014, 201001

, Clustering done, findImportantWords done
Tuple (2009022, 2009023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009021, 2009022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009020, 2009021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009019, 2009020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009018, 2009019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009017, 2009018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009016, 2009017): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009015, 2009016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009014, 2009015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2009013, 2009014): Begin , TFidf-vectoriz

, findImportantWords done
Tuple (2008021, 2008022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008020, 2008021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008019, 2008020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008018, 2008019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008017, 2008018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008016, 2008017): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008015, 2008016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008014, 2008015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008013, 2008014): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2008012, 2008013): Begin , TFidf-vectorization done, Clust

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007019, 2007020): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007018, 2007019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007017, 2007018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007016, 2007017): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007015, 2007016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007014, 2007015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007013, 2007014): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007012, 2007013): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007011, 2007012): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2007010, 200701

, Clustering done, findImportantWords done
Tuple (2006018, 2006019): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006017, 2006018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006016, 2006017): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006015, 2006016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006014, 2006015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006013, 2006014): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006012, 2006013): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006011, 2006012): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006010, 2006011): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2006009, 2006010): Begin , TFidf-vectoriz

, findImportantWords done
Tuple (2005017, 2005018): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005016, 2005017): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005015, 2005016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005014, 2005015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005013, 2005014): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005012, 2005013): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005011, 2005012): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005010, 2005011): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005009, 2005010): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2005008, 2005009): Begin , TFidf-vectorization done, Clust

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004015, 2004016): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004014, 2004015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004013, 2004014): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004012, 2004013): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004011, 2004012): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004010, 2004011): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004009, 2004010): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004008, 2004009): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004007, 2004008): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2004006, 200400

, findImportantWords done
Tuple (2003014, 2003015): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003013, 2003014): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003012, 2003013): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003011, 2003012): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003010, 2003011): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003009, 2003010): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003008, 2003009): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003007, 2003008): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003006, 2003007): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2003005, 2003006): Begin , TFidf-vectorization done, Clust

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002011, 2002012): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002010, 2002011): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002009, 2002010): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002008, 2002009): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002007, 2002008): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002006, 2002007): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002005, 2002006): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002004, 2002005): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002003, 2002004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2002002, 200200

, TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2001004, 2001005): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2001003, 2001004): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2001002, 2001003): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2001001, 2001002): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2000024, 2001001): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2000023, 2000024): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2000022, 2000023): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2000021, 2000022): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2000020, 2000021): Begin , TFidf-vectorization done, Clustering done, findImportantWords done
Tuple (2000019, 200002

** Results of entire dataset **

In [23]:
printResults(result_dict)


2018009-2018010:
Cluster 0 words:
['suissea', 'francea', 'selona', 'deux', 'paye', 'êtres']
Cluster 1 words:
['femmea', 'hommasse', 'filmait', 'égales', 'pétasse', 'mêmes']
Cluster 2 words:
['conseils', 'etat', 'pierre', 'genevoises', 'fédérales', 'genèvea']
Cluster 3 words:
['dollari', 'américainea', 'trumpés', 'etat', 'accordait', 'unis']
Cluster 4 words:
['commissions', 'tout', 'grandirent', 'deux', 'filmait', 'anné']

2018008-2018009:
Cluster 0 words:
['policées', 'deux', 'nxp', 'final', 'minute', 'après']
Cluster 1 words:
['femmea', 'hommasse', 'plusa', 'auss', 'cétait', 'égales']
Cluster 2 words:
['plusa', 'tout', 'cétait', 'commissions', 'auss', 'grandirent']
Cluster 3 words:
['suissea', 'plusa', 'francea', 'milliona', 'cétait', 'entreprises']
Cluster 4 words:
['présidente', 'etat', 'nordé', 'américainea', 'accordait', 'ministre']

2018007-2018008:
Cluster 0 words:
['nxp', 'deux', 'plusa', 'cétait', 'minute', 'faits']
Cluster 1 words:
['présidente', 'américainea', 'nordé', 'eta

['mortes', 'paye', 'contrera', 'afp', 'força', 'régimea']

2011018-2011019:
Cluster 0 words:
['banque', 'suissea', 'milliard', 'financier', 'francea', 'not']
Cluster 1 words:
['u0092estiment', 'plusa', 'femmea', 'faits', 'tout', 'cétait']
Cluster 2 words:
['européenn', 'euroa', 'zoner', 'banque', 'crisea', 'septembres']
Cluster 3 words:
['afp', 'selona', 'mortes', 'après', 'présidente', 'déclaratives']
Cluster 4 words:
['suissea', 'conseils', 'francea', 'fédérales', 'milliona', 'septembres']

2011017-2011018:
Cluster 0 words:
['u0092estiment', 'u0092unit', 'plusa', 'tout', 'cétait', 'dernier']
Cluster 1 words:
['suissea', 'banque', 'francea', 'milliona', 'plusa', 'conseils']
Cluster 2 words:
['présidente', 'afp', 'selona', 'cétait', 'contrera', 'déclaratives']
Cluster 3 words:
['septembres', 'modifiable', 'conseils', 'etat', 'francea', 'économiquement']
Cluster 4 words:
['euroa', 'européenn', 'pointe', 'zoner', 'banque', 'marchea']

2011016-2011017:
Cluster 0 words:
['the', 'meilleura'


Cluster 2 words:
['modifiable', 'juin', 'février', 'marai', 'genes', 'mieuxa']
Cluster 3 words:
['milliard', 'suissea', 'plusa', 'pointe', 'francea', 'modifiable']
Cluster 4 words:
['suissea', 'droitea', 'paye', 'conseils', 'plusa', 'fédérales']

2008003-2008004:
Cluster 0 words:
['suissea', 'plusa', 'équipera', 'deux', 'milliona', 'villas']
Cluster 1 words:
['suissea', 'plusa', 'paye', 'droitea', 'person', 'cétait']
Cluster 2 words:
['modifiable', 'juin', 'février', 'plusa', 'suissea', 'haussait']
Cluster 3 words:
['milliard', 'banque', 'francea', 'suissea', 'nette', 'résultats']
Cluster 4 words:
['sarkozy', 'nicols', 'présidente', 'sitis', 'chef', 'républiques']

2008002-2008003:
Cluster 0 words:
['modifiable', 'juin', 'janvier', 'février', 'boursea', 'policées']
Cluster 1 words:
['suissea', 'gouvernait', 'accordait', 'présidente', 'cétait', 'européenn']
Cluster 2 words:
['milliard', 'banque', 'francea', 'pertes', 'suissea', 'credi']
Cluster 3 words:
['sarkozy', 'nicols', 'président


['postait', 'centrait', 'genèvea', 'services', 'commune', 'plusa']

2002018-2002019:
Cluster 0 words:
['swiss', 'milliona', 'grouper', 'francea', 'credi', 'direction']
Cluster 1 words:
['cantonais', 'droitea', 'conseils', 'contrera', 'socialité', 'nationales']
Cluster 2 words:
['européenn', 'paye', 'unionistes', 'accordait', 'entres', 'contrera']
Cluster 3 words:
['premiers', 'couperai', 'deux', 'peut', 'francea', 'tout']
Cluster 4 words:
['économiquement', 'marchea', 'cétait', 'secteur', 'entreprises', 'banque']

2002017-2002018:
Cluster 0 words:
['francea', 'grouper', 'milliona', 'marchea', 'milliard', 'banque']
Cluster 1 words:
['onu', 'nations', 'new', 'droitea', 'politiste', 'unie']
Cluster 2 words:
['prixa', 'tout', 'paye', 'grandirent', 'peut', 'auss']
Cluster 3 words:
['américainea', 'européenn', 'paye', 'etat', 'terroriste', 'contrera']
Cluster 4 words:
['loi', 'cantonais', 'fédérales', 'droitea', 'conseils', 'projeta']

2002016-2002017:
Cluster 0 words:
['eaua', 'sommets', '

** Upload the results  as a dictionnary to MongoDB **

In [24]:
db_corpus.topicAnalysis.insert_one(
    {
        "id": "resultDict2",
        "topics": result_dict,
        "description": "List of list of words for each tuple biMonth"
    }
)

<pymongo.results.InsertOneResult at 0x7fb71d2c6ec8>