In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
df = pd.read_csv("spam.csv")
print(df.head(3)) ; print("-"*30)
print(df.shape) ; print("-"*30)
print(df.Category.value_counts()) ; print("-"*30)

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
------------------------------
(5572, 2)
------------------------------
ham     4825
spam     747
Name: Category, dtype: int64
------------------------------


In [24]:
y = df.Category
X = df.Message

In [25]:
vetor = TfidfVectorizer()
X = vetor.fit_transform(X)

In [34]:
print(X.shape) ; print("-"*30)
print(type(X)) ; print("-"*30)
print(vetor.get_feature_names_out()[0:100])

(5572, 8709)
------------------------------
<class 'scipy.sparse._csr.csr_matrix'>
------------------------------
['00' '000' '000pes' '008704050406' '0089' '0121' '01223585236'
 '01223585334' '0125698789' '02' '0207' '02072069400' '02073162414'
 '02085076972' '021' '03' '04' '0430' '05' '050703' '0578' '06' '07'
 '07008009200' '07046744435' '07090201529' '07090298926' '07099833605'
 '07123456789' '0721072' '07732584351' '07734396839' '07742676969'
 '07753741225' '0776xxxxxxx' '07781482378' '07786200117' '077xxx' '078'
 '07801543489' '07808' '07808247860' '07808726822' '07815296484'
 '07821230901' '078498' '07880867867' '0789xxxxxxx' '07946746291'
 '0796xxxxxx' '07973788240' '07xxxxxxxxx' '08' '0800' '08000407165'
 '08000776320' '08000839402' '08000930705' '08000938767' '08001950382'
 '08002888812' '08002986030' '08002986906' '08002988890' '08006344447'
 '0808' '08081263000' '08081560665' '0825' '083' '0844' '08448350055'
 '08448714184' '0845' '08450542832' '08452810071' '08452810073'


In [36]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [38]:
model = RandomForestClassifier(n_estimators=150)

model.fit(x_train,y_train)

pred = model.predict(x_test)

print(confusion_matrix(y_test,pred)) ; print("-"*30)
print(accuracy_score(y_test,pred))

[[1455    2]
 [  34  181]]
------------------------------
0.9784688995215312


In [39]:
print(metrics.classification_report(y_test,pred))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1457
        spam       0.99      0.84      0.91       215

    accuracy                           0.98      1672
   macro avg       0.98      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [41]:
# Obtendo os nomes das features (termos)
feature_names = vetor.get_feature_names_out()

# Convertendo a matriz esparsa TF-IDF para um DataFrame do Pandas
df_tfidf = pd.DataFrame(X.toarray(), columns=feature_names)

# Visualizando os primeiros pesos atribuídos aos termos
print(df_tfidf.head(25))

     00       000  000pes  008704050406  0089  0121  01223585236  01223585334  \
0   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
1   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
2   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
3   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
4   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
5   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
6   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
7   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
8   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
9   0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
10  0.0  0.000000     0.0           0.0   0.0   0.0          0.0          0.0   
11  0.0  0.196171     0.0   