In [1]:
import spacy
import pandas as pd
import en_core_web_sm
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups()
nlp = en_core_web_sm.load()

In [2]:
pprint(news.DESCR)

('.. _20newsgroups_dataset:\n'
 '\n'
 'The 20 newsgroups text dataset\n'
 '------------------------------\n'
 '\n'
 'The 20 newsgroups dataset comprises around 18000 newsgroups posts on\n'
 '20 topics split in two subsets: one for training (or development)\n'
 'and the other one for testing (or for performance evaluation). The split\n'
 'between the train and test set is based upon a messages posted before\n'
 'and after a specific date.\n'
 '\n'
 'This module contains two loaders. The first one,\n'
 ':func:`sklearn.datasets.fetch_20newsgroups`,\n'
 'returns a list of the raw texts that can be fed to text feature\n'
 'extractors such as '
 ':class:`~sklearn.feature_extraction.text.CountVectorizer`\n'
 'with custom parameters so as to extract feature vectors.\n'
 'The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,\n'
 'returns ready-to-use features, i.e., it is not necessary to use a feature\n'
 'extractor.\n'
 '\n'
 '**Data Set Characteristics:**\n'
 '\n'
 '    Cla

In [3]:
df = pd.DataFrame(news.data, columns=["texte"])

In [4]:
df["cible"] = news.target

In [5]:
df_sample = df.sample(n=5000)

In [6]:
df_sample

Unnamed: 0,texte,cible
8518,From: ifaz706@utxvms.cc.utexas.edu (Noam Tract...,17
7887,From: mcovingt@aisun3.ai.uga.edu (Michael Covi...,15
3744,From: estasic@ic.sunysb.edu (Edward Stasic)\nS...,3
2913,From: hasan@McRCIM.McGill.EDU \nSubject: Re: N...,17
458,From: jodfishe@silver.ucs.indiana.edu (joseph ...,15
...,...,...
8736,Subject: Re: Shaft-drives and Wheelies\nFrom: ...,8
6173,From: mcelwre@cnsvax.uwec.edu\nSubject: THE DI...,19
9076,From: healta@saturn.wwc.edu (Tammy R Healy)\nS...,0
4114,From: Rick Miller - former spook <rick@ee.uwm....,11


In [7]:
df_sample["text_clean"] = df_sample["texte"].str.replace(r"[^A-Za-z0-9 ]+", " ")
df_sample["text_clean"] = df_sample["text_clean"].str.split("Subject").str[1]
df_sample["text_clean"] = df_sample["text_clean"].str.lower()


  df_sample["text_clean"] = df_sample["texte"].str.replace(r"[^A-Za-z0-9 ]+", " ")


In [8]:
def process_text(x):
    spacied = nlp(x)
    new_sentence = []
    for token in spacied:
        if token.lemma_.lower() not in STOP_WORDS:
            new_sentence.append(token.lemma_.lower())
    return  " ".join(new_sentence)        

df_sample["processed_documents"] = df_sample['text_clean'].apply(process_text)

In [9]:
df_sample

Unnamed: 0,texte,cible,text_clean,processed_documents
8518,From: ifaz706@utxvms.cc.utexas.edu (Noam Tract...,17,go hizbollah ii lines 28 nntp posting host ...,hizbollah ii line 28 nntp post host pur...
7887,From: mcovingt@aisun3.ai.uga.edu (Michael Covi...,15,re did he really rise organization ai prog...,rise organization ai programs univers...
3744,From: estasic@ic.sunysb.edu (Edward Stasic)\nS...,3,re ide vs scsi here we go again organizati...,ide vs scsi organization state univer...
2913,From: hasan@McRCIM.McGill.EDU \nSubject: Re: N...,17,re no land for peace no negotiatians orig...,land peace negotiatians originator h...
458,From: jodfishe@silver.ucs.indiana.edu (joseph ...,15,re eternity of hell was re hell organizat...,eternity hell hell organization ind...
...,...,...,...,...
8736,Subject: Re: Shaft-drives and Wheelies\nFrom: ...,8,re shaft drives and wheelies from stafford...,shaft drive wheelie stafford vax2 winon...
6173,From: mcelwre@cnsvax.uwec.edu\nSubject: THE DI...,19,the divine masters organization university ...,divine masters organization university wi...
9076,From: healta@saturn.wwc.edu (Tammy R Healy)\nS...,0,getting to the point lines 12 organization ...,point line 12 organization walla walla ...
4114,From: Rick Miller - former spook <rick@ee.uwm....,11,alternate legal wiretaps organization jus...,alternate legal wiretaps organization ...


In [10]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_sample['processed_documents'])


In [11]:
vectorizer.vocabulary_

{'hizbollah': 31972,
 'ii': 33205,
 'line': 38412,
 '28': 5582,
 'nntp': 44408,
 'post': 48303,
 'host': 32325,
 'purple': 49319,
 'cc': 19019,
 'utexas': 61036,
 'edu': 25361,
 'organization': 45815,
 'university': 60491,
 'texas': 58119,
 'austin': 14885,
 'israel': 34677,
 'thursday': 58461,
 'april': 14109,
 '22': 4896,
 '1993': 3659,
 'today': 58739,
 'ha': 30936,
 'aretz': 14244,
 'report': 51239,
 'woman': 63541,
 'injure': 33983,
 'katyusha': 36240,
 'rocket': 52092,
 'fall': 27270,
 'center': 19172,
 'community': 20636,
 'dozen': 24518,
 'fire': 27897,
 'galilee': 29199,
 'northern': 44571,
 'yesterday': 64943,
 'terrorist': 58064,
 'hizbullah': 31974,
 'article': 14405,
 '1993apr14': 3677,
 '125813': 2119,
 '21737': 4861,
 'ncsu': 43724,
 'hernlem': 31686,
 'chess': 19503,
 'brad': 17163,
 'write': 63729,
 'congratulation': 20955,
 'brave': 17215,
 'man': 39999,
 'lebanese': 37906,
 'resistance': 51355,
 'israeli': 34678,
 'son': 55239,
 'place': 47812,
 'grave': 30342,
 'und

In [12]:
svd_model = TruncatedSVD(n_components=20, n_iter=100, random_state=0)
lsa = svd_model.fit_transform(X)
sujet = [f"sujet_{i}" for i in range(1,21)]
topic_encoded_df = pd.DataFrame(lsa, columns = sujet, index=df_sample.index)

In [13]:
topic_encoded_df["class_pred"] = topic_encoded_df.idxmax(axis=1).str.split("_").str[1]

In [14]:
topic_encoded_df

Unnamed: 0,sujet_1,sujet_2,sujet_3,sujet_4,sujet_5,sujet_6,sujet_7,sujet_8,sujet_9,sujet_10,...,sujet_12,sujet_13,sujet_14,sujet_15,sujet_16,sujet_17,sujet_18,sujet_19,sujet_20,class_pred
8518,0.128766,0.073880,0.009735,0.043079,0.030638,-0.169242,0.044743,0.248812,0.013740,-0.043961,...,-0.075524,-0.052757,-0.080970,-0.087474,-0.053667,-0.020125,-0.063334,-0.020742,0.025689,8
7887,0.098157,0.047029,-0.014820,-0.044539,-0.058988,0.045472,-0.037858,0.038334,0.003096,-0.022491,...,0.015809,0.043630,0.034996,0.029846,-0.040566,-0.004224,-0.036927,0.008289,0.008985,1
3744,0.131643,-0.044241,-0.057205,0.003757,0.038206,-0.015205,-0.101250,0.058934,-0.060840,0.049803,...,0.027117,-0.009984,0.011796,0.017112,0.001728,0.036828,-0.033799,0.081724,0.033413,1
2913,0.115906,0.062678,0.005949,0.024895,0.033937,-0.125965,0.044883,0.167045,0.018293,-0.024345,...,-0.073781,-0.041579,-0.073949,-0.089031,-0.042504,-0.016928,-0.031258,-0.012558,0.002463,8
458,0.193075,0.173279,-0.000964,-0.109375,-0.004836,0.090771,0.002260,0.016263,0.003226,-0.024986,...,0.026876,0.039940,0.019305,0.000969,-0.023906,0.022916,-0.009993,0.000710,-0.000052,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8736,0.105896,-0.022567,-0.056799,0.015259,0.044098,-0.033906,-0.093723,0.019633,-0.029902,-0.012292,...,-0.020426,-0.039083,0.069004,0.011209,-0.028063,-0.010355,-0.012871,-0.036933,-0.033061,1
6173,0.161869,0.145887,0.015744,-0.122638,-0.014707,0.084662,0.013394,0.005724,0.034910,-0.006917,...,0.011182,0.037740,-0.013778,0.020581,-0.034053,-0.012331,0.006941,0.009386,-0.007938,1
9076,0.129130,0.028402,-0.037553,-0.003896,-0.017405,0.042212,-0.005292,0.001278,0.012745,-0.009851,...,0.004982,0.022745,-0.003815,0.002972,-0.038096,-0.034750,0.010185,0.017045,-0.013644,1
4114,0.134408,-0.001061,0.119474,0.037885,-0.002225,0.009657,-0.010681,-0.002563,-0.023169,-0.003040,...,0.014136,0.011444,0.020500,0.000202,0.001523,-0.000856,0.004937,0.009409,-0.003660,1


In [15]:
topic_encoded_df["class_pred"].value_counts()

1     3926
3      145
9      100
2       99
4       90
8       85
5       83
6       79
15      58
16      53
13      52
18      43
7       42
20      37
12      28
11      26
17      25
10      13
19      10
14       6
Name: class_pred, dtype: int64

In [16]:
topic_encoded_df["cible"] = df_sample["cible"]

In [17]:
topic_encoded_df

Unnamed: 0,sujet_1,sujet_2,sujet_3,sujet_4,sujet_5,sujet_6,sujet_7,sujet_8,sujet_9,sujet_10,...,sujet_13,sujet_14,sujet_15,sujet_16,sujet_17,sujet_18,sujet_19,sujet_20,class_pred,cible
8518,0.128766,0.073880,0.009735,0.043079,0.030638,-0.169242,0.044743,0.248812,0.013740,-0.043961,...,-0.052757,-0.080970,-0.087474,-0.053667,-0.020125,-0.063334,-0.020742,0.025689,8,17
7887,0.098157,0.047029,-0.014820,-0.044539,-0.058988,0.045472,-0.037858,0.038334,0.003096,-0.022491,...,0.043630,0.034996,0.029846,-0.040566,-0.004224,-0.036927,0.008289,0.008985,1,15
3744,0.131643,-0.044241,-0.057205,0.003757,0.038206,-0.015205,-0.101250,0.058934,-0.060840,0.049803,...,-0.009984,0.011796,0.017112,0.001728,0.036828,-0.033799,0.081724,0.033413,1,3
2913,0.115906,0.062678,0.005949,0.024895,0.033937,-0.125965,0.044883,0.167045,0.018293,-0.024345,...,-0.041579,-0.073949,-0.089031,-0.042504,-0.016928,-0.031258,-0.012558,0.002463,8,17
458,0.193075,0.173279,-0.000964,-0.109375,-0.004836,0.090771,0.002260,0.016263,0.003226,-0.024986,...,0.039940,0.019305,0.000969,-0.023906,0.022916,-0.009993,0.000710,-0.000052,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8736,0.105896,-0.022567,-0.056799,0.015259,0.044098,-0.033906,-0.093723,0.019633,-0.029902,-0.012292,...,-0.039083,0.069004,0.011209,-0.028063,-0.010355,-0.012871,-0.036933,-0.033061,1,8
6173,0.161869,0.145887,0.015744,-0.122638,-0.014707,0.084662,0.013394,0.005724,0.034910,-0.006917,...,0.037740,-0.013778,0.020581,-0.034053,-0.012331,0.006941,0.009386,-0.007938,1,19
9076,0.129130,0.028402,-0.037553,-0.003896,-0.017405,0.042212,-0.005292,0.001278,0.012745,-0.009851,...,0.022745,-0.003815,0.002972,-0.038096,-0.034750,0.010185,0.017045,-0.013644,1,0
4114,0.134408,-0.001061,0.119474,0.037885,-0.002225,0.009657,-0.010681,-0.002563,-0.023169,-0.003040,...,0.011444,0.020500,0.000202,0.001523,-0.000856,0.004937,0.009409,-0.003660,1,11


In [18]:
topic_encoded_df["cible"].value_counts()

14    274
11    272
7     271
8     268
9     266
2     266
10    265
17    262
13    262
3     260
4     259
6     259
15    254
5     252
12    251
16    243
1     242
0     220
18    201
19    153
Name: cible, dtype: int64