# Visualización PCA

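Esta notebook está un poco desordenada, pero les dejo los bloques de código por si quieren probarlos con el output de su preprocesado. Antes de ejecutar las celdas, carguen su DataFrame preprocesado en `df`; debe tener las columnas `lemmatized_content`, `cleaned_content` y `target`.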

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import plotly.express as px

In [4]:
# Side-by-side preview of both text variants next to their label.
# NOTE(review): `df` is not created in any visible cell — presumably it was
# loaded in a cell that has since been removed; confirm before Run All.
preview_columns = ["lemmatized_content", "cleaned_content", "target"]
df_classif = df[preview_columns]
df_classif.head(5)

Unnamed: 0,lemmatized_content,cleaned_content,target
0,holar comunidad internetpaso dejar consejo ser...,hola comunidad internetpaso dejar consejo serv...,1
1,encontrar recurso navegar web parecer útil que...,encontré recurso navegando web pareció útil qu...,1
2,semana condición logra dormir noche empezar en...,semanas condición logras dormir noches empezó ...,1
3,semana crisis hipertensivo ver mejora distr...,semanas crisis hipertensiva veía mejora dist...,1
4,problema ansiedad tiempo cambio siento miedo s...,problemas ansiedad tiempo cambio siento miedo ...,1


## Vectorización + PCA

### Bag of Words

In [5]:
# Bag-of-Words (BoW) vectorization of the lemmatized text, projected to 3D with PCA.
# NOTE(review): `df` is not defined in any visible cell — it must already hold the
# preprocessed data with "lemmatized_content" and "target" columns.
vectorizer = CountVectorizer(max_features=1000)  # Adjust max_features as needed
X_bow = vectorizer.fit_transform(df['lemmatized_content'])

# Densify once and reuse the array for both PCA and the hover-text loop below.
X_bow_dense = X_bow.toarray()

# PCA: reduce to 3 principal components for 3D visualization.
# random_state is fixed so the projection is reproducible if a randomized solver is selected.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_bow_dense)

# Prepare DataFrame for Plotly
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
# Assign labels by position: pca_df has a fresh 0..n-1 index, so assigning the
# Series directly would align on index labels and produce NaN / wrong targets
# whenever df's index is not the default range (e.g. after filtering).
pca_df['target'] = df['target'].to_numpy()
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is the supported replacement and matches the TF-IDF cells.
feature_names = vectorizer.get_feature_names_out()

# Add feature names for hover information: up to 5 most frequent terms per
# document, most frequent first. Terms with a zero count are skipped so short
# documents do not show vocabulary words they never contained.
top_feature_names = []
for row in X_bow_dense:
    top_idx = row.argsort()[-5:][::-1]
    top_feature_names.append([feature_names[idx] for idx in top_idx if row[idx] > 0])

pca_df['top_features'] = top_feature_names

# 3D Scatter Plot
fig = px.scatter_3d(
    pca_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='target',
    hover_data={'top_features': True, 'target': True},
    labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2', 'PC3': 'Principal Component 3'},
    title='PCA - BoW'
)

# Opens the figure in a new browser tab instead of rendering inline.
fig.show(renderer = 'browser')



### TF-IDF

Con data sin lematizar

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import plotly.express as px

# TF-IDF on the non-lemmatized text, projected to 3D with PCA.
# Assuming df is your DataFrame with "cleaned_content" and "target" columns
# NOTE(review): `df` is not defined in any visible cell — load it before running.

# Step 1: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_tfidf = vectorizer.fit_transform(df['cleaned_content'])

# Densify once and reuse the array for both PCA and the hover-text loop below.
X_tfidf_dense = X_tfidf.toarray()

# Step 2: PCA — reduce to 3 principal components for 3D visualization.
# random_state is fixed so the projection is reproducible if a randomized solver is selected.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_tfidf_dense)

# Step 3: Prepare DataFrame for Plotly
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
# Assign labels by position: pca_df has a fresh 0..n-1 index, so assigning the
# Series directly would align on index labels and produce NaN / wrong targets
# whenever df's index is not the default range (e.g. after filtering).
pca_df['target'] = df['target'].to_numpy()
feature_names = vectorizer.get_feature_names_out()

# Add feature names for hover information: up to 5 highest-weighted terms per
# document, highest first. Zero-weight terms are skipped so short documents do
# not show vocabulary words they never contained.
top_feature_names = []
for row in X_tfidf_dense:
    top_idx = row.argsort()[-5:][::-1]
    top_feature_names.append([feature_names[idx] for idx in top_idx if row[idx] > 0])

pca_df['top_features'] = top_feature_names

# Step 4: Create Plotly 3D Scatter Plot
fig = px.scatter_3d(
    pca_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='target',
    hover_data={'top_features': True, 'target': True},
    labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2', 'PC3': 'Principal Component 3'},
    title='PCA - TF-IDF - cleaned_content'
)

# Opens the figure in a new browser tab instead of rendering inline.
fig.show(renderer='browser')

Con data lematizada

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import plotly.express as px

# TF-IDF on the lemmatized text, projected to 3D with PCA.
# Assuming df is your DataFrame with "lemmatized_content" and "target" columns
# NOTE(review): `df` is not defined in any visible cell — load it before running.

# Step 1: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_tfidf = vectorizer.fit_transform(df['lemmatized_content'])

# Densify once and reuse the array for both PCA and the hover-text loop below.
X_tfidf_dense = X_tfidf.toarray()

# Step 2: PCA — reduce to 3 principal components for 3D visualization.
# random_state is fixed so the projection is reproducible if a randomized solver is selected.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_tfidf_dense)

# Step 3: Prepare DataFrame for Plotly
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
# Assign labels by position: pca_df has a fresh 0..n-1 index, so assigning the
# Series directly would align on index labels and produce NaN / wrong targets
# whenever df's index is not the default range (e.g. after filtering).
pca_df['target'] = df['target'].to_numpy()
feature_names = vectorizer.get_feature_names_out()

# Add feature names for hover information: up to 5 highest-weighted terms per
# document, highest first. Zero-weight terms are skipped so short documents do
# not show vocabulary words they never contained.
top_feature_names = []
for row in X_tfidf_dense:
    top_idx = row.argsort()[-5:][::-1]
    top_feature_names.append([feature_names[idx] for idx in top_idx if row[idx] > 0])

pca_df['top_features'] = top_feature_names

# Step 4: Create Plotly 3D Scatter Plot
fig = px.scatter_3d(
    pca_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='target',
    hover_data={'top_features': True, 'target': True},
    labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2', 'PC3': 'Principal Component 3'},
    title='PCA - TF-IDF - lemmatized_content'
)

# Opens the figure in a new browser tab instead of rendering inline.
fig.show(renderer='browser')