In [4]:
#  https://www.cienciadedatos.net/documentos/py24-svm-python.html#:~:text=M%C3%A1quinas%20de%20Vector%20Soporte%20(Vector,campo%20de%20la%20ciencia%20computacional.
# https://pharos.sh/implementacion-de-svm-y-kernel-svm-con-scikit-learn-de-python/
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
from mlxtend.plotting import plot_decision_regions

# Preprocesado y modelado
# ==============================================================================
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Configuración matplotlib
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
style.use('ggplot') or plt.style.use('ggplot')

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [5]:
#DATOS
d = pd.read_csv("CSV_Informacion_Categorizada/cont_sent_CR_PE_MX_UY.csv", sep="|")
df= d.rename(columns={'Unnamed: 0':'index'})
df.head(3)

Unnamed: 0,index,Content,Sentiment
0,0,@NoilyMV yo soy totalmente puntual,NONE
1,1,@SandraCauffman Hola Sandrita. No le habia des...,P
2,2,Si andan haciendo eso mejor se quedaran callad...,N


In [6]:
df.shape

(3675, 3)

In [7]:
df['Sentiment'] = df.Sentiment.map({'N':0,'P':1})
df1= df.dropna()
df1

Unnamed: 0,index,Content,Sentiment
1,1,@SandraCauffman Hola Sandrita. No le habia des...,1.0
2,2,Si andan haciendo eso mejor se quedaran callad...,0.0
3,3,Que pereza quiero choco banano,0.0
4,4,"@robertobrenes Bueno, no es tanto lo mayor com...",0.0
6,6,@doriamdiaz El de Halfon de Germinal se ve mor...,1.0
...,...,...,...
3666,934,@CuadradoAndres @grazianopascale @adeladubra j...,1.0
3668,936,@Niaso01 @LuisSuarez9 @neymarjr El fútbol es h...,1.0
3669,937,#FelizDOMINGO que la paz de dios llene tu vida...,1.0
3672,940,A mi desayuno le hizo falta un alfajor podrida...,0.0


In [8]:
df1.shape

(2449, 3)

In [9]:
X = df1.drop('Sentiment', axis=1)
y = df1['Sentiment']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split('Sentiment', test_size = 0.20)

ValueError: not enough values to unpack (expected 4, got 2)

In [19]:
from sklearn.svm import SVC
svclassifier = SVC(kernel="linear")
svclassifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'Cuando te enamoras de cada flaca que pasa. Chimbote era chevere pero no me quirro ir de trujillo.'

In [20]:
y_pred = svclassifier.predict(X_test)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [21]:
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(df.index, df.Content, c=df.Sentiment);
ax.set_title("Datos contenido y sentimiento");

ValueError: 
@veronica_alonso @RamelaCarlos dice q no existe clase media de donde cree q sale $$ para las subvenciones, de Disney?, ah no, tampoco.
                                                                                 ^
Expected end of text, found '$'  (at char 81), (line:1, col:82)

<Figure size 432x288 with 1 Axes>

In [22]:
# División de los datos en train y test
# ==============================================================================
from sklearn.model_selection import train_test_split

X = df.drop(columns = 'Sentiment')
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(
                                        X,
                                        y.values.reshape(-1,1),
                                        train_size   = 0.8,
                                        random_state = 1234,
                                        shuffle      = True
                                    )

In [23]:
# Creación del modelo SVM lineal
# ==============================================================================
modelo = SVC(C = 100, kernel = 'linear', random_state=123)
modelo.fit(X_train, y_train)

ValueError: could not convert string to float: '@JArmandoCS solo me quedan 3 sábados más así jiji. ¿Qué días estás yendo a la U que no te he visto?'

In [None]:
# Representación gráfica de los límites de clasificación
# ==============================================================================
# Grid de valores
x = np.linspace(np.min(X_train.index), np.max(X_train.index), 50)
y = np.linspace(np.min(X_train.Content), np.max(X_train.Content), 50)
Y, X = np.meshgrid(y, x)
grid = np.vstack([X.ravel(), Y.ravel()]).T

# Predicción valores grid
pred_grid = modelo.predict(grid)

fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(grid[:,0], grid[:,1], c=pred_grid, alpha = 0.2)
ax.scatter(X_train.index, X_train.Content, c=y_train, alpha = 1)

# Vectores soporte
ax.scatter(
    modelo.support_vectors_[:, 0],
    modelo.support_vectors_[:, 1],
    s=200, linewidth=1,
    facecolors='none', edgecolors='black'
)

# Hiperplano de separación
ax.contour(
    X,
    Y,
    modelo.decision_function(grid).reshape(X.shape),
    colors = 'k',
    levels = [-1, 0, 1],
    alpha  = 0.5,
    linestyles = ['--', '-', '--']
)

ax.set_title("Resultados clasificación SVM lineal");

In [None]:
predicciones = modelo.predict(X_test)
predicciones

In [None]:
# Accuracy de test del modelo 
# ==============================================================================
accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = predicciones,
            normalize = True
           )
print("")
print(f"El accuracy de test es: {100*accuracy}%")

In [None]:
# Grid de hiperparámetros
# ==============================================================================
param_grid = {'C': np.logspace(-5, 7, 20)}

# Búsqueda por validación cruzada
# ==============================================================================
grid = GridSearchCV(
        estimator  = SVC(kernel= "rbf", gamma='scale'),
        param_grid = param_grid,
        scoring    = 'accuracy',
        n_jobs     = -1,
        cv         = 3, 
        verbose    = 0,
        return_train_score = True
      )

# Se asigna el resultado a _ para que no se imprima por pantalla
_ = grid.fit(X = X_train, y = y_train)

# Resultados del grid
# ==============================================================================
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param.*|mean_t|std_t)')\
    .drop(columns = 'params')\
    .sort_values('mean_test_score', ascending = False) \
    .head(5)

In [None]:
# Mejores hiperparámetros por validación cruzada
# ==============================================================================
print("----------------------------------------")
print("Mejores hiperparámetros encontrados (cv)")
print("----------------------------------------")
print(grid.best_params_, ":", grid.best_score_, grid.scoring)

modelo = grid.best_estimator_

In [None]:
# Representación gráfica de los límites de clasificación
# ==============================================================================
# Grid de valores
x = np.linspace(np.min(X_train.X1), np.max(X_train.X1), 50)
y = np.linspace(np.min(X_train.X2), np.max(X_train.X2), 50)
Y, X = np.meshgrid(y, x)
grid = np.vstack([X.ravel(), Y.ravel()]).T

# Predicción valores grid
pred_grid = modelo.predict(grid)

fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(grid[:,0], grid[:,1], c=pred_grid, alpha = 0.2)
ax.scatter(X_train.index, X_train.Content, c=y_train, alpha = 1)

# Vectores soporte
ax.scatter(
    modelo.support_vectors_[:, 0],
    modelo.support_vectors_[:, 1],
    s=200, linewidth=1,
    facecolors='none', edgecolors='black'
)

# Hiperplano de separación
ax.contour(
    X,
    Y,
    modelo.decision_function(grid).reshape(X.shape),
    colors='k',
    levels=[0],
    alpha=0.5,
    linestyles='-'
)

ax.set_title("Resultados clasificación SVM radial");

In [None]:
# Representación gráfica utilizando plot_decision_regions() de mlxtend
# ==============================================================================
fig, ax = plt.subplots(figsize=(6,4))
plot_decision_regions(
    X = X_train.to_numpy(),
    y = y_train.flatten(),
    clf = modelo,
    ax = ax
)
ax.set_title("Resultados clasificación SVM radial");

In [None]:
# Predicciones test
# ==============================================================================
predicciones = modelo.predict(X_test)

In [None]:
# Accuracy de test del modelo 
# ==============================================================================
accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = predicciones,
            normalize = True
           )
print("")
print(f"El accuracy de test es: {100*accuracy}%")

In [None]:
# Matriz de confusión de las predicciones de test
# ==============================================================================
confusion_matrix = pd.crosstab(
    y_test.ravel(),
    predicciones,
    rownames=['Real'],
    colnames=['Predicción']
)
confusion_matrix

In [None]:
from sinfo import sinfo
sinfo()