In [1]:
import pandas as pd
df = pd.read_csv("../data/sentimentdataset_cleaned.csv")
df.head()

Unnamed: 0,Id,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,...,Day,Hour,text_len,word_count,hashtag_count,emoji_count,SentimentSimple,Sentiment_norm,TextLength,LengthGroup
0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,Usa,...,15,12,52,7,2,1,Positive,Positive,52,30-59
1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,...,15,8,52,5,2,1,Negative,Negative,52,30-59
2,2,Just finished an amazing workout! ðŸ’ª ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,Usa,...,15,15,51,6,2,2,Positive,Positive,51,30-59
3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,United Kingdom,...,15,18,52,6,2,1,Positive,Positive,52,30-59
4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,...,15,19,52,8,2,1,Neutral,Neutral,52,30-59


In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = df[[
    "Text",
    "Hashtags",
    "LengthGroup",
    "SentimentSimple",
    "emoji_count",
    "Retweets",
    "Likes",
]]
y = df["Platform"]

preprocess = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(max_features=3000, ngram_range=(1,2)), "Text"),
       
        ("length", OneHotEncoder(handle_unknown="ignore"), ["LengthGroup"]),
        ("sentiment", OneHotEncoder(handle_unknown="ignore"), ["SentimentSimple"]),
        ("retweets", OneHotEncoder(handle_unknown="ignore"), ["Retweets"]),
        
    ]
)

model = Pipeline([
    ("prep", preprocess),
    ("clf", MultinomialNB())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model.fit(X_train, y_train)

preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

    Facebook       0.43      0.28      0.34        46
   Instagram       0.41      0.54      0.46        52
     Twitter       0.44      0.43      0.43        49

    accuracy                           0.42       147
   macro avg       0.43      0.42      0.41       147
weighted avg       0.42      0.42      0.42       147



In [4]:
import numpy as np

clf = model.named_steps["clf"]
tfidf_text = model.named_steps["prep"].named_transformers_["text"]

text_feature_names = tfidf_text.get_feature_names_out()
n_text = len(text_feature_names)

class_labels = clf.classes_
log_probs = clf.feature_log_prob_

for i, platform in enumerate(class_labels):
    print("\nTop words for:", platform)
    
    # solo la parte de log_probs que corresponde a las palabras de Text
    class_log_probs_text = log_probs[i][:n_text]
    
    top_idx = class_log_probs_text.argsort()[-15:]
    top_words = [text_feature_names[j] for j in top_idx]
    print(top_words)


Top words for: Facebook
['of life', 'new', 'after', 'by the', 'an', 'by', 'on', 'and', 'with', 'for', 'to', 'in the', 'in', 'of', 'the']

Top words for: Instagram
['as', 'my', 'on', 'by', 'through', 'at', 'for', 'and', 'with', 'new', 'to', 'in the', 'in', 'of', 'the']

Top words for: Twitter
['an', 'by', 'by the', 'of the', 'for the', 'on the', 'on', 'and', 'to', 'with', 'in the', 'for', 'in', 'of', 'the']
