<a href="https://colab.research.google.com/github/paughis/MachineLearning/blob/main/template_text_mining_class_labelencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Template NLP Classificazione con LabelEncoder

In [1]:
import numpy as np
import pandas as pd
import re

import warnings
warnings.filterwarnings('ignore')

# !pip install spacy ## Colab har already the installer
# !python -m spacy download en
#!python -m spacy download it_core_news_sm
#!python -m spacy download it_core_news_md
#!python -m spacy download it_core_news_lg

In [2]:
path = 'https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/NLPes1.csv'
df_original = pd.read_csv(path)
df = df_original.copy()
df

Unnamed: 0,text,class
0,b'insane ( but inspired ) musical about alferd...,1
1,b'written by david j . schow and john shirley ...,0
2,"b'miramax "" disinvited "" on-line media from pr...",0
3,"b'in "" the astronaut\'s wife , "" charlize ther...",0
4,b'one of the 90s\' most unwelcome thriller tre...,0
...,...,...
408,"b'hav plenty , as we are told in the beginning...",0
409,b'brian de palma\'s snake eyes stars nicolas c...,0
410,b'contact ( pg ) there\'s a moment late in rob...,1
411,"b""this is a film that i was inclined to like a...",0


## 0. CLEANING

In [3]:
# STEP0 drop colonas innecesarias, duplicates, y gestionar null
# df = df.drop(columns = ['...', '...'])
df.info()
# df.value_counts()
# df.drop_duplicates()
# df.isnull().sum()
# df = df.dropna()
# df.info()
# df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    413 non-null    object
 1   class   413 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.6+ KB


In [4]:
# tenemos floats en la colonna text? --> pasamos todo a stringa!!
df['text'] = df['text'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    413 non-null    object
 1   class   413 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 6.6+ KB


In [5]:
# STEP1 DEFINIRE ED APPLICARE LA REGEX
df = pd.read_csv(path)

words_to_remove = ['gargamella', 'doggy', 'word3']
words_pattern = r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + r')\b'
                # \b            -> word boundary: garantiza que se eliminen palabras completas y no partes de palabras mas grandes
                # (?: '...')    -> "grupo de no captura": agrupa elementos sin "recordar" el texto que coincide con ellas. Eficiencia.
                # + '|'         -> concatenar varios "or", o sea: words_to_remove[0]
                # map()         -> funcion de python que aplica una funcion (re.escape) a cada elemento de in iterable (words_to_remove).
                # + r'          -> concatena el cierre del patron con la parte central, asegurando que \b no sea interpretado como regex
                # re.escape     -> hace que los caract especiales sean interpretados literalmente, no como regex

patterns = {
            r"[a-zA-Z]+n\'t": 'not',
            r'\s+': ' ',             # Substitute multiple spaces with a single space
            r'\d+': '',              # Remove digits
            r'[^\w\s]': '',          # Remove punctuation and symbols
            r'\b\w{1,2}\b': '',      # Remove all tokens less than 2 characters
            r'(http|www)[^\s]+': '', # Remove websites
            words_pattern: ''        # Remove specific words
            }

def clean_column(df, column, patterns):
    for pattern, replacement in patterns.items():
        df[column] = df[column].str.replace(pattern, replacement, regex=True)
    df[column] = df[column].str.lower() # Apply lowercase transformation
    return df

#solo in preview
clean_column(df, 'text', patterns)

Unnamed: 0,text,class
0,binsane but inspired musical about alferd pa...,1
1,bwritten david schow and john shirley base...,0
2,bmiramax disinvited online media from press ...,0
3,bin the astronauts wife charlize theron pla...,0
4,bone the most unwelcome thriller trends retu...,0
...,...,...
408,bhav plenty are told the beginning and rem...,0
409,bbrian palmas snake eyes stars nicolas cages ...,0
410,bcontact theres moment late robert zemeck...,1
411,bthis film that was inclined like the out...,0


In [6]:
# applichiamo la regex
df = clean_column(df, 'text', patterns)
df

Unnamed: 0,text,class
0,binsane but inspired musical about alferd pack...,1
1,bwritten david schow and john shirley based th...,0
2,bmiramax disinvited online media from press sc...,0
3,bin the astronauts wife charlize theron plays ...,0
4,bone the most unwelcome thriller trends return...,0
...,...,...
408,bhav plenty are told the beginning and reminde...,0
409,bbrian palmas snake eyes stars nicolas cages e...,0
410,bcontact theres moment late robert zemeckiss c...,1
411,bthis film that was inclined like the outset t...,0


In [7]:
df['text'][0]
# type(df['text'][0])

'binsane but inspired musical about alferd packer the first man ever tried for cannibalism america nas the story unfolds learn what events led packer trey parker supposedly killing and eating the five men who got lost while accompanying him through the mountains colorado search gold nthroughout packer must contend with evil trappers bent stealing his prized horse tribe ute indians that doesnt seem quite the mark and terrible winter storm that leads the eventual cannibalism nparker obviously knows his musicals very well because the viewer also treated some the most maddeningly catchy songs appear film would accomplish this again his postcannibal nmusical motion picture south park bigger longer and uncut napart from the inherent interest seeing pre south park parker and stone their first effort cannibal nalso offers some pretty decent laughs nthe aforementioned ute indian tribe that packers group encounters definite highlight are two rousing musical numbers about snowmen one which ends c

In [8]:
text = list(map(str, df['text']))
text = ''.join(text)
# type(text)

## 1. SENTIMENT ANALYSIS

In [10]:
!pip install spacytextblob -q
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_sm',disable=["tagger", "parser", "attribute_ruler", "lemmatizer","ner"])
nlp.add_pipe('spacytextblob')

nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f2d6470dcd0>),
 ('spacytextblob',
  <spacytextblob.spacytextblob.SpacyTextBlob at 0x7f2d650e9b50>)]

In [None]:
# texto de prueba (se puede bipasear!!):
text ='I love this so very much, it is amazing and beautiful'
doc = nlp(text)

def sentiment(testo):
    doc = nlp(testo)
    return doc._.blob.polarity

sentiment(text)

In [None]:
df['sentiment_value'] = df['text'].apply(sentiment)
df.head(5)

In [None]:
def sentiment_class(x):
    if x > 0.3:
        return 'positivo'
    elif x < -0.3:
        return 'negativo'
    else:
        return 'neutro'

In [None]:
df['sentiment'] = df['sentiment_value'].apply(sentiment_class)
df

## 2. EDA

In [None]:
# Lista de positivos y negativos:
positive_text = ''.join(df[df['sentiment'] == 'positivo']['text'].to_list())
negative_text = ''.join(df[df['sentiment'] == 'negativo']['text'].to_list())

print(f'Texto positivo: {positive_text}')
print(f'Texto negativo: {negative_text}')

In [None]:
# WORDCLOUD:

# !pip install wordcloud -q
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
stopwords = set(STOPWORDS)

# agrego stopwords:
# stopwords.update(['palabra1','palabra2', ...])

In [None]:
# funzione di plotting

def wordcloud_funct(text,color, stopwords,title):
    wordcloud1 = WordCloud(width = 3000, height = 2000,background_color=color,stopwords=stopwords)
    wordcloud1.generate(text)

    plt.figure(figsize=(8, 6))
    plt.title(title)
    plt.imshow(wordcloud1, interpolation='bilinear')
    plt.axis("off")

In [None]:
"""
otros parametros de wordcloud:
wordcloud = WordCloud(background_color = 'black', width = 800, height = 400,
                      max_words = 180, contour_width = 3,
                      max_font_size = 80, contour_color = 'steelblue',
                      stopwords = STOPWORDS, random_state = 667)
"""

In [None]:
# Wordcloud positivo:
wordcloud_funct(positive_text,'white',stopwords,'positivo')

In [None]:
# Wordcloud negativo:
wordcloud_funct(negative_text,'black',stopwords,'negativo')

## 3. PREPROCESSING

In [None]:
# Target: LabelEncoder
# Hay que hacer esto???? Luego ponerlo en la pipe?????????????????????????????????????????????????????????????
# PROBAR CON OTRO DATASET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['sentiment_label'] = le.fit_transform(df['sentiment'])
df

# asigna los numeros segun el orden alfabetico de las labels. Si quiero visualizarlo:
# print(le.classes_)

In [None]:
##STEP 3: Definire X e y (features testuali e Target)

X = df['text']
y = df['sentiment_label']

In [None]:
##STEP 4: Vettorizzazione - Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=15000,    # toma solo las 15,000 palabras más frecuentes
                             min_df=5,              # elimina palabras que aparecen en menos de 5 documentos.
                             max_df=0.7,            # elimina palabras que aparecen en más del 70% de los documentos.
                             stop_words="english")

X_vect = vectorizer.fit_transform(X).toarray()
X_vect

In [None]:
# SI QUIERO AGREGAR MIS PROPIAS STOP_WORDS:

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Lista de stop words existentes ("english") + tus propias stop words
stopwords_nuevas = ["palabra1", "palabra2", "palabra3"]
stopwords_pers = list(ENGLISH_STOP_WORDS) + stopwords_nuevas

# Crear el vectorizador con tus stop words
vectorizer = CountVectorizer(max_features = 15000, min_df = 5, max_df = 0.7, stop_words = stopwords_pers)

# Aplicar el vectorizador
X_vect = vectorizer.fit_transform(X).toarray()
X_vect

In [None]:
# X_vect.shape

In [None]:
##STEP 5: TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()
X_tfidf = tfidfconverter.fit_transform(X_vect).toarray()
X_tfidf

In [None]:
# X_tfidf.shape

In [None]:
##STEP 6: Split Training and Test (lo hacemos con X_tfidf!!!)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y,
                                                    test_size=0.3,
                                                    random_state=667,
                                                    stratify=y
                                                    )

## 7. MODELLAZIONE

En este template:
- RandomForestClassifier (ej. del proff)
- MultinomialNB (ej. del proff)
- GradientBoostingClassifier    (tiene mejor rendimiento que Random Forest, pero más lento. Puede funcionar bien con buen preprocesado)
- LogisticRegression (uno de los mejores clasificadores para texto: rápido, efectivo con datos dispersos como TF-IDF)
- SGDClassifier (ej. del proff)

### Clasificadores por separado:

In [None]:
# RandomForestClassifier:

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=667,max_depth=5)
classifier.fit(X_train, y_train)

# accuracy score on the test data
y_pred = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_pred, y_test)
test_data_accuracy

In [None]:
# MultinomialNB:

from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, y_train)

# accuracy score on the test data
y_pred = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_pred, y_test)
test_data_accuracy

In [None]:
# GradientBoostingClassifier:

from sklearn.ensemble import GradientBoostingClassifier

classifier = GradientBoostingClassifier(n_estimators=50)
classifier.fit(X_train, y_train)

# accuracy score on the test data
y_pred = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_pred, y_test)
test_data_accuracy

In [None]:
# LogisticRegression:

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# accuracy score on the test data
y_pred = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_pred, y_test)
test_data_accuracy

In [None]:
# SGDClassifier:

from sklearn.linear_model import SGDClassifier

classifier =SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=667, max_iter=5, tol=None)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_test,y_pred)
test_data_accuracy

In [None]:
# Classification report (si hago el entrenamiento por separado, lo hace del ultimo!)

# labels = ['Class0', 'Class1']
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred),
      # target_names=labels)
      )

# si quiero saber cuales son mis etiquetas: df['label'].unique(). En este caso estan comentadas, porque en la columna se llaman 0 y 1.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize = (7,6))
sns.heatmap(cm,cmap= "Blues",
            linecolor = 'black',
            linewidth = 1,
            annot = True,
            fmt='',
            # formato del texto que se muestra cuando annot = True. fmt='' -> valor sin formato, fmt='' -> entero, fmt='.2f' -> decimal con dos cifras
            xticklabels = classifier.classes_,
            yticklabels = classifier.classes_)
            # classifier.classes_ es un atributo de muchos clasificadores de scikit-learn (como LogisticRegression, SGDClassifier, RandomForestClassifier)
            # que contiene una lista de las clases conocidas por el modelo, en el orden en que fueron vistas durante el entrenamiento.
            # Asegura que los ticks en el eje coincidan con el orden real de las clases.

plt.xlabel("Predicted")
plt.ylabel("Actual")

### 'All in one' classifiers

In [None]:
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

classifiers = [RandomForestClassifier(n_estimators=200,random_state=667, max_depth=5),
               MultinomialNB(alpha=0.1),
               GradientBoostingClassifier(n_estimators=50),
               LogisticRegression(max_iter=1000),
               SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=667, max_iter=5, tol=None)
               ]

for cls in classifiers:
    cls.fit(X_train, y_train)
    cls.predict(X_test)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from sklearn.metrics import ConfusionMatrixDisplay

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10,8))
#labels = ['target0','target1']

for cls, ax in zip(classifiers, axes.flatten()):
    ConfusionMatrixDisplay.from_estimator(cls,
                                          X_test,
                                          y_test,
                                          ax=ax,
                                          cmap='Blues',
                                          #display_labels=labels
                                          )

    ax.title.set_text(type(cls).__name__)

plt.tight_layout()
#plt.show()

In [None]:
from sklearn.metrics import classification_report

for cls in classifiers:
    cls.fit(X_train, y_train)
    cls.predict(X_test)
    print(cls)
    print(classification_report(y_test,cls.predict(X_test))) #, target_names=labels))

## 8. CREO CLASE TextCleaner (para poner en la pipeline):

TextCleaner es un componente personalizado disenado para usarlo en el pipeline de sklearn. Permite limpiar el texto siguiendo un conjunto de patrones definidos antes de la vectorizacion:

`from sklearn.base import BaseEstimator, TransformerMixin`
- BaseEstimator: proporciona los métodos get_params() y set_params() para gestionar parámetros.
- TransformerMixin: proporciona el método fit_transform()

- Esto hace que la clase sea compatible con los pipelines de scikit-learn.

`class TextCleaner(BaseEstimator, TransformerMixin):`

    `def __init__(self, patterns):  # Constructor, acepta el parametro "patterns", que es donde pusimos los regex`
        `self.patterns = patterns`

    `def fit(self, X, y=None):      # Metodo obligatorio para cualquier transformados en scikit-learn`
        `return self                # No realiza ninguna operacion de aprendizaje, solo devuelve self`

    `def transform(self, X):                            # Realiza la limpieza del texto`
        `df = pd.DataFrame(X, columns=['text'])         # Convierte X en un DFr con una columna llamada 'text`
        `df = clean_column(df, 'text', self.patterns)   # Llama la funcion clean_column() que aplica los patrones de limpieza`
        `return df['text'].values                       # Devuelve el texto limpio como un array`

In [None]:
# TextCleaner:

from sklearn.base import BaseEstimator, TransformerMixin

class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, patterns):
        self.patterns = patterns

    def fit(self, X, y=None):       # probar sacando "y" y averiguar cuando seria necesario ponerlo!
        return self

    def transform(self, X):
        df = pd.DataFrame(X, columns=['text'])
        df = clean_column(df, 'text', self.patterns)
        return df['text'].values

## 9. PIPELINE

In [None]:
# Step 8: Pipeline Finale (nuovo train test split su dati raw testuali)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=667,
                                                    stratify=y
                                                    )

from sklearn.pipeline import Pipeline

text_cleaner = TextCleaner(patterns)
bow = CountVectorizer(max_features=1500, min_df=5, max_df=0.7,stop_words='english')
tfidf = TfidfTransformer()
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=667, max_iter=5, tol=None)
    # MultinomialNB(alpha=0.1)
    # RandomForestClassifier(n_estimators=200,random_state=667, max_depth=5)
    # GradientBoostingClassifier(n_estimators=50)
    # LogisticRegression(max_iter=1000)

pipe = Pipeline([
                ('text cleaner', text_cleaner),
                ('bow',bow),
                ('tfidf',tfidf),
                ('clf',clf),
                ])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

test_data_accuracy = accuracy_score(y_test,y_pred)
print('Accuracy score of the test data : ', test_data_accuracy)

In [None]:
#labels = ['Class0', 'Class1']
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))#, target_names=labels))

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize = (7,6))
sns.heatmap(cm,cmap= "Blues",
            linecolor = 'black',
            linewidth = 1,
            annot = True,
            fmt='',
            xticklabels = classifier.classes_,
            yticklabels = classifier.classes_)

plt.xlabel("Predicted")
plt.ylabel("Actual")

In [None]:
# Step 10: Cross Validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipe, X, y, scoring = 'f1_micro', cv = 8)

print(f'scores={scores}')
print(f'mean={np.mean(scores)}')
print(f'std={np.std(scores)}')

## 10. DUMP PIPE

In [None]:
import joblib
joblib.dump(pipe,'NLPEs1.pkl')

## 11. LOAD AND TEST PIPE

In [None]:
import joblib

uploaded_model = joblib.load('NLPEs1.pkl')
pred = uploaded_model.predict(["test_text_positive"]) # 1
pred[0]

In [None]:
pred = uploaded_model.predict(["test_text_negative"]) # 0
pred[0]