In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
import spacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")
# warnings imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [30]:
from sklearn.datasets import fetch_20newsgroups
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]

In [31]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans

In [32]:
#import dataset
dataset = fetch_20newsgroups(subset='all', categories=categories,
shuffle=True, random_state=42)
#save labels
labels = dataset.target
print(labels)

[0 1 1 ... 2 1 1]


In [50]:
#get the unique labels
true_k = np.unique(labels).shape[0]

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
data = dataset.data
dataset.target_names

3387 documents
4 categories


['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [34]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
use_idf=True)
X = vectorizer.fit_transform(data)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 3387, n_features: 24545


In [35]:
#Dimensionality Reduction
# Vectorizer results are normalized, which makes KMeans behave better
    # Since LSA/SVD results are not normalized, we have to redo the normalization.

    #If we do not normalize the data, variables with different scaling 
    # will be weighted differently in the distance formula 
    # that is being optimized during training.
    
n_components = 5
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

#The final X is the input which we will be using. 
# It has been cleaned, TF-IDF transformed, and its dimensions reduced.
X = lsa.fit_transform(X)

In [36]:
#scikit-learn offers two implementations of kmeans:
# either in mini-batches or without
minibatch = True
if minibatch:
   km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
   init_size=1000, batch_size=1000)
else:
   km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X)
# top words per cluster
print("Clustering sparse data with %s" % km)

original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
   print("Cluster %d:" % i)
   for ind in order_centroids[i, :10]:
      print(' %s' % terms[ind])
print("First method:")
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

#Note: You might see different results, as machine learning 
# algorithms do not produce the exact same results each time.
#km.predict(X_test) to test our model

#imports the KMeans algorithm from the scikit-learn library and 
# creates an instance of it with three clusters, a random state of 0, 
# and automatic initialization
#KMeans algorithm is a clustering algorithm that groups 
# similar data points together based on their distance from each other
kmeans = KMeans(n_clusters = 3, random_state = 0, n_init='auto')
#The fit method is then called on the normalized training data 
# to train the KMeans model on the data.
kmeans.fit(X)

Clustering sparse data with MiniBatchKMeans(batch_size=1000, init_size=1000, n_clusters=4, n_init=1)
Cluster 0:
 henry
 space
 toronto
 access
 nasa
 digex
 com
 pat
 zoo
 alaska
Cluster 1:
 graphics
 space
 image
 com
 nasa
 university
 posting
 program
 host
 images
Cluster 2:
 god
 people
 com
 jesus
 don
 say
 believe
 think
 bible
 just
Cluster 3:
 sgi
 livesey
 keith
 solntze
 wpd
 jon
 com
 caltech
 morality
 moral
First method:
Homogeneity: 0.558
Completeness: 0.606
V-measure: 0.581
Adjusted Rand-Index: 0.555
Silhouette Coefficient: 0.426 


In [37]:
print("Second method:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
   print("Cluster %d:" % i)
   for ind in order_centroids[i, :10]:
      print(' %s' % terms[ind])
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Second method:
Cluster 0:
 henry
 space
 toronto
 access
 nasa
 digex
 com
 pat
 zoo
 alaska
Cluster 1:
 graphics
 space
 image
 com
 nasa
 university
 posting
 program
 host
 images
Cluster 2:
 god
 people
 com
 jesus
 don
 say
 believe
 think
 bible
 just
Cluster 3:
 sgi
 livesey
 keith
 solntze
 wpd
 jon
 com
 caltech
 morality
 moral
Homogeneity: 0.558
Completeness: 0.606
V-measure: 0.581
Adjusted Rand-Index: 0.555
Silhouette Coefficient: 0.421 


In [38]:
def load_dataset (file):
    dataset = pd.read_csv(file)
    return dataset


In [39]:
def visualize_data (dataset) :
    print(dataset.head())
    print ("Nombre de lignes du dataset :", dataset.shape[0])
    print("Nombre de colonnes du dataset : ", dataset.shape[1])
    print("Noms des colonnes : ", dataset.columns.tolist())


In [40]:
# dataset1 = load_dataset("Apple-Twitter-Sentiment-DFE.csv")
dataset2 = load_dataset("training.1600000.processed.noemoticon.csv")

In [41]:
visualize_data(dataset2)

   0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY _TheSpecialOne_  \
0  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   scotthamilton   
1  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY        mattycus   
2  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY         ElleCTF   
3  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          Karoli   
4  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY        joy_wolf   

  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
0  is upset that he can't update his Facebook by ...                                                                   
1  @Kenichan I dived many times for the ball. Man...                                                                   
2    my whole body feels itchy and like its on fire                                                                    
3  @nationwideclass no, it's not behaving at all....           

In [43]:
dataset2 = dataset2.iloc[:, -1:]
dataset2

Unnamed: 0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,is upset that he can't update his Facebook by ...
1,@Kenichan I dived many times for the ball. Man...
2,my whole body feels itchy and like its on fire
3,"@nationwideclass no, it's not behaving at all...."
4,@Kwesidei not the whole crew
...,...
1599994,Just woke up. Having no school is the best fee...
1599995,TheWDB.com - Very cool to hear old Walt interv...
1599996,Are you ready for your MoJo Makeover? Ask me f...
1599997,Happy 38th Birthday to my boo of alll time!!! ...


In [48]:
# Personalized labels for columns
personalized_labels = ['Tweets']

# Assigning personalized labels to columns
dataset2.columns = personalized_labels

# Dropping the first row (as it's now the column names)
dataset2 = dataset2.drop(0)

# Resetting the index after dropping the first row
dataset2 = dataset2.reset_index(drop=True)

# Displaying the corrected DataFrame with personalized labels
print(dataset2)


                                                    Tweets
0                                              Need a hug 
1        @LOLTrish hey  long time no see! Yes.. Rains a...
2                     @Tatiana_K nope they didn't have it 
3                                @twittera que me muera ? 
4              spring break in plain city... it's snowing 
...                                                    ...
1599989  Just woke up. Having no school is the best fee...
1599990  TheWDB.com - Very cool to hear old Walt interv...
1599991  Are you ready for your MoJo Makeover? Ask me f...
1599992  Happy 38th Birthday to my boo of alll time!!! ...
1599993  happy #charitytuesday @theNSPCC @SparksCharity...

[1599994 rows x 1 columns]
