In [10]:
import os
import pandas as pd
import kagglehub

# 1) Download the "Twitter US Airline Sentiment" dataset through kagglehub.
#    The returned value is used below as the directory that contains the
#    dataset's CSV file.
path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")

# 2) Load the tweets table from the downloaded CSV file.
df = pd.read_csv(os.path.join(path, "Tweets.csv"))

# 3) Preview the first rows.
#    The label and the frame are printed with separate calls on purpose:
#    print("label:\n", frame) inserts print's default separator (one space)
#    right after the newline, which pushes the frame's header line one column
#    to the right of its data rows.
print("Primeras 5 filas del DataFrame:")
print(df.head())

Primeras 5 filas del DataFrame:
              tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   

In [11]:
import pandas as pd
# Normalise the column labels in place: trim surrounding whitespace and
# lower-case every name, then show the resulting labels.
df.columns = list(map(lambda label: label.strip().lower(), df.columns))
print(list(df.columns))

# 4) Report the size (rows, columns) of the full table before narrowing it.
print("Cantidad de filas y columnas:", df.shape)

# Keep only the tweet text and its sentiment label; every later step in this
# cell works with these two columns.
df = df[['text', 'airline_sentiment']]
print(list(df.columns))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# 5. Train/test split: 20% of the tweets are held out for testing. The split
#    is stratified on the sentiment label and uses a fixed seed so that it is
#    reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['airline_sentiment'],
    stratify=df['airline_sentiment'],
    test_size=0.2,
    random_state=42,
)

# 6. Vectorisation: the vocabulary is fitted on the training texts only and
#    then reused, unchanged, to encode the test texts.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB

# 7. Training: fit a multinomial Naive Bayes classifier on the vectorised
#    training tweets. fit() returns the estimator itself, so construction and
#    training are chained into a single assignment.
clf = MultinomialNB().fit(X_train_vectorized, y_train)

# 8. Prediction: label the held-out tweets and show the first ten predictions.
y_pred = clf.predict(X_test_vectorized)
print("Predicciones:", y_pred[:10])

from sklearn.metrics import accuracy_score, classification_report

# 9. Evaluation on the held-out tweets.
#    accuracy_score is the share of test tweets whose predicted label matches
#    the true one. It is labelled as accuracy ("exactitud") rather than
#    "precisión" so it cannot be confused with the per-class `precision`
#    column of the report printed right below.
accuracy = accuracy_score(y_test, y_pred)
print("Exactitud del modelo (accuracy):", accuracy)

# The heading and the report are printed with separate calls: passing both to
# a single print() would insert the default one-space separator after the
# newline and shift the report's header line out of alignment with its rows.
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))

import joblib

# 10. Persist the fitted classifier and the fitted vectorizer to the current
#     working directory.
joblib.dump(clf, 'modelo_naive_bayes.pkl')
joblib.dump(vectorizer, 'vectorizador.pkl')

# 11. Read both artefacts back from disk.
#     NOTE: joblib.load unpickles the file, so it should only ever be pointed
#     at files from a trusted source (here: the two files written just above).
vectorizer_loaded = joblib.load('vectorizador.pkl')
clf_loaded = joblib.load('modelo_naive_bayes.pkl')

# 12. Smoke-test the reloaded objects on two hand-written sentences.
sample_text = [
    "I love flying with this airline!",
    "The service was terrible.",
]
sample_vectorized = vectorizer_loaded.transform(sample_text)
sample_predictions = clf_loaded.predict(sample_vectorized)
print("Predicciones de muestra:", sample_predictions)

# 13. Export the processed frame (the columns currently held by `df`),
#     without writing the index.
df.to_csv('tweets_procesados.csv', index=False)

# 14. Export the training and test partitions, each as a text/label table.
#     Each frame is built and written before moving on to the next one.
train_df = pd.DataFrame({'text': X_train, 'airline_sentiment': y_train})
train_df.to_csv('train_data.csv', index=False)

test_df = pd.DataFrame({'text': X_test, 'airline_sentiment': y_test})
test_df.to_csv('test_data.csv', index=False)


['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']
Cantidad de filas y columnas: (14640, 15)
['text', 'airline_sentiment']
Predicciones: ['negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'neutral']
Precisión del modelo: 0.7657103825136612
Reporte de clasificación:
               precision    recall  f1-score   support

    negative       0.77      0.96      0.86      1835
     neutral       0.70      0.40      0.51       620
    positive       0.81      0.49      0.61       473

    accuracy                           0.77      2928
   macro avg       0.76      0.62      0.66      2928
weighted avg       0.76      0.77      0.74      2928

Predicciones de muestra: ['positive' 'negative']
