# Tokenización

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
df = pd.read_csv('datos_prueba/df_total.csv', encoding='utf-8')
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['news'])
Y = df['Type']

In [3]:
X_train , X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [4]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.7786885245901639


# Stemming

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\felip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\felip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def tokenize_and_stem(text):
    stemmer = SnowballStemmer('spanish')
    tokens = word_tokenize(text.lower())
    stems = [stemmer.stem(token) for token in tokens if token.isalpha()]
    return ' '.join(stems)


In [7]:
df['news_stemmer'] = df['news'].apply(tokenize_and_stem)
df['news_stemmer']

0       durant el for la banc articul empresarial par ...
1       el regul de valor de chin dij el doming que bu...
2       en una industri histor masculin com lo es la a...
3       con el dat de marz el ipc interanual encaden s...
4       ayer en cartagen se dio inici a la version num...
                              ...                        
1212    en la vid de tod empres emergent lleg un momen...
1213    la espiral alcist de los preci continu y gener...
1214    las grand derrot nacional son experient trauma...
1215    bbva ha alcanz un acuerd de colabor con barcel...
1216    casi entrand a la part final de noviembr la ep...
Name: news_stemmer, Length: 1217, dtype: object

In [8]:
X = vectorizer.fit_transform(df['news_stemmer'])
Y = df['Type']
X_train , X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.7704918032786885


# Lemmatizacion

In [9]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.




In [10]:
import spacy
from spacy.cli.download import download

download('es_core_news_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
nlp = spacy.load('es_core_news_sm')

In [13]:
def lemmatize_text(text):
    doc = nlp(text.lower())
    lemmas = [token.lemma_ for token in doc if token.is_alpha]
    return ' '.join(lemmas)

In [14]:
df['news_lemma'] = df['news'].apply(lemmatize_text)
df['news_lemma']

0       durante el foro el banca articulador empresari...
1       el regulador de valor de china decir el doming...
2       en uno industria históricamente masculino como...
3       con el dato de marzo el ipc interanual encaden...
4       ayer en cartagena él dar inicio a el versión n...
                              ...                        
1212    en el vida de todo empresa emergente llegar un...
1213    el espiral alcista de el precio continuar y ge...
1214    el grande derrota nacional ser experiencia tra...
1215    bbva haber alcanzar uno acuerdo de colaboració...
1216    casi entrar a el parte final de noviembre el é...
Name: news_lemma, Length: 1217, dtype: object

In [15]:
X = vectorizer.fit_transform(df['news_lemma'])
Y = df['Type']
X_train , X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.7868852459016393
