In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
import string
import plotly.express as px

# Fetch the NLTK stop-word corpus; preprocess_text reads its English list.
nltk.download('stopwords')

# Build a larger, more varied sample dataset: one short English sentence per
# row, stored under the single 'tweet' column.
data = {
    'tweet': [
        'I love machine learning and data science!',
        'Python is an awesome programming language',
        'Data visualization is crucial for data analysis',
        'Natural language processing with Python is great',
        'PCA is useful for dimensionality reduction',
        'I enjoy learning new things in AI',
        'Big data and AI are transforming industries',
        'Statistics and probability are important for data science',
        'Deep learning is a subset of machine learning',
        'AI is the future of technology',
        'Blockchain technology is fascinating',
        'Cryptocurrencies are transforming finance',
        'Quantum computing is the next big thing',
        'Self-driving cars are the future of transportation',
        'Climate change is a significant global issue',
        'Renewable energy sources are crucial for sustainability',
        'Healthcare technology is advancing rapidly',
        'Education technology is enhancing learning experiences',
        'Space exploration is expanding our horizons',
        'Cybersecurity is essential in the digital age'
    ]
}

# One row per tweet; the default integer index is what later cells align on.
df = pd.DataFrame(data)

# Text preprocessing
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORD_CACHE = {}

def _english_stop_words():
    """Return NLTK's English stop words, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']

def preprocess_text(text, stop_words=None):
    """Lowercase ``text``, drop stop words and remove punctuation.

    Args:
        text: The raw string to clean.
        stop_words: Optional collection of lowercase words to discard.
            Defaults to NLTK's English stop-word list, which is loaded on
            first use and then reused across calls.

    Returns:
        The surviving tokens, stripped of punctuation and joined by single
        spaces. An empty string when nothing survives.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    kept = []
    for token in text.lower().split():
        bare = token.translate(_PUNCTUATION_TABLE)
        if not bare:
            # The token was punctuation only (e.g. a lone dash).
            continue
        # Stop-word lists contain contractions such as "don't". Stripping
        # punctuation first would turn them into "dont" and let them slip
        # through, so the token is also checked with its inner apostrophe
        # intact (only surrounding punctuation removed).
        if bare in stop_words or token.strip(string.punctuation) in stop_words:
            continue
        kept.append(bare)
    return ' '.join(kept)

# Clean every tweet before it is vectorized
df['cleaned_tweet'] = df['tweet'].map(preprocess_text)

# Encode the cleaned corpus as a dense TF-IDF matrix; the fitted vectorizer is
# kept so the most representative word of each tweet can be looked up later
vectorizer = TfidfVectorizer()
tfidf_sparse = vectorizer.fit_transform(df['cleaned_tweet'])
X = tfidf_sparse.toarray()

def get_top_word(text, vectorizer):
    """Return the vocabulary term with the highest TF-IDF weight in ``text``.

    Args:
        text: A single document, preprocessed the same way as the corpus the
            vectorizer was fitted on.
        vectorizer: A fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        The highest-weighted term, or ``''`` when every weight is zero
        (empty text, or no token of ``text`` is in the vocabulary).
    """
    weights = vectorizer.transform([text]).toarray()
    # With an all-zero row the sort order is meaningless and would yield an
    # arbitrary vocabulary word, so report "no top word" instead.
    if not weights.any():
        return ''
    feature_array = np.array(vectorizer.get_feature_names_out())
    # Last index of the ascending order == first index of the descending one.
    best_index = np.argsort(weights).flatten()[-1]
    return feature_array[best_index]

# Label each tweet with its highest-weighted TF-IDF term
df['top_word'] = df['cleaned_tweet'].apply(get_top_word, args=(vectorizer,))

# Standardize every TF-IDF feature before the projection
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardized features onto three principal components
pca = PCA(n_components=3)
principal_components = pca.fit_transform(X_scaled)

# Keep the projection and the per-tweet label side by side for plotting
component_names = ['PC1', 'PC2', 'PC3']
df_pca = pd.DataFrame(principal_components, columns=component_names)
df_pca['top_word'] = df['top_word']

# Cluster the projected tweets so that nearby points share a color.
# n_init is pinned explicitly: its default changed from 10 to 'auto' across
# scikit-learn releases (with a FutureWarning in between), so leaving it
# implicit makes the number of restarts depend on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
clusters = kmeans.fit_predict(principal_components)
df_pca['cluster'] = clusters

# Interactive 3D visualization with plotly
axis_labels = {
    'PC1': 'Principal Component 1',
    'PC2': 'Principal Component 2',
    'PC3': 'Principal Component 3',
}

# Cluster ids are labels, not magnitudes: plot them as strings so plotly
# assigns one discrete color per cluster (with a legend) instead of a
# continuous color bar. A copy is used so df_pca keeps its numeric column.
plot_df = df_pca.assign(cluster=df_pca['cluster'].astype(str))

fig = px.scatter_3d(plot_df, x='PC1', y='PC2', z='PC3', color='cluster', text='top_word',
                    title='PCA of Tweet Data in 3D', labels=axis_labels,
                    category_orders={'cluster': sorted(plot_df['cluster'].unique(), key=int)})

# Passing text= makes the traces 'markers+text', so a selector on
# mode='markers' would match nothing; apply the marker size to every trace.
fig.update_traces(marker=dict(size=5))
fig.update_layout(scene=dict(
                    xaxis_title=axis_labels['PC1'],
                    yaxis_title=axis_labels['PC2'],
                    zaxis_title=axis_labels['PC3']),
                  margin=dict(l=0, r=0, b=0, t=40))

# Save the interactive chart as a standalone HTML file
fig.write_html("pca_tweet_data.html")

fig.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
