In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [2]:
path_in = "twitterClimateData.csv"
df = pd.read_csv(path_in, delimiter=";")
df = df.sample(1000)
df = df[pd.notnull(df["text"])]

In [3]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,geo,urls,search_hashtags,location
64732,64732,1.18312e+18,191250884.0,Thrilled to be a part of @morristownbooks and ...,1,https://twitter.com/DianaOlick/status/11831206...,2019-10-12 20:41:51+00:00,Sat Oct 12 20:41:51 +0000 2019,5,@MorristownBooks,#realestate #climatechange #gilbertgaul #money...,,https://www.instagram.com/p/B3iC0nwhFPr/?igshi...,#climatechange,"Washington, USA"


In [4]:
count_vec = TfidfVectorizer()
bow = count_vec.fit_transform(df['text'])
bow = np.array(bow.todense())

In [5]:
X = bow
y = df.search_hashtags

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    stratify=y)

In [6]:
model = MultinomialNB().fit(X_train, y_train)

In [7]:
y_pred = model.predict(X_test)

In [8]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

Accuracy: 0.33
F1 score: 0.09115426860680993


In [9]:
print(classification_report(y_test, y_pred))

                   precision    recall  f1-score   support

    #actonclimate       0.00      0.00      0.00         8
       #bushfires       0.00      0.00      0.00         3
   #climateaction       0.00      0.00      0.00        25
   #climatechange       0.73      0.26      0.38        62
   #climatecrisis       1.00      0.04      0.08        24
   #climatestrike       0.29      1.00      0.45        80
     #environment       1.00      0.05      0.09        22
#fridaysforfuture       0.00      0.00      0.00        13
   #globalwarming       0.00      0.00      0.00        18
    #greennewdeal       0.00      0.00      0.00        18
   #savetheplanet       0.00      0.00      0.00         7
  #sustainability       1.00      0.05      0.10        20

         accuracy                           0.33       300
        macro avg       0.33      0.12      0.09       300
     weighted avg       0.45      0.33      0.22       300



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Referencias

* How to create a Naive Bayes text classification model using scikit-learn: https://practicaldatascience.co.uk/machine-learning/how-to-create-a-naive-bayes-text-classification-model-using-scikit-learn