In [None]:
!conda install -c intel scikit-learn

# Training a sentiment analysis classifier based on supervised machine learning algorithms

In [None]:
import string

import pymongo

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

import pandas as pd

In [None]:
# Spanish stop words from NLTK, kept as a set.
# stopwords.words() raises LookupError when the 'stopwords' corpus has not been
# downloaded yet (e.g. on a fresh environment), so fetch it on demand and retry.
try:
    stop_words = set(stopwords.words('spanish'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('spanish'))

In [None]:
# MongoDB connection data
HOST = 'localhost'
# Kept as a string: the connection cell builds the address as HOST + ':' + PORT
PORT = '27017'
# Name of the database opened by the connection cell
DB_NAME = 'trending'

### Connecting to database

In [None]:
# Open the MongoDB connection and make sure the server is really reachable.
# MongoClient connects lazily: its constructor returns immediately even when no
# server is listening, so a cheap 'ping' command is issued to force a round trip.
# Without it the success message was printed unconditionally.
db = None
try:
    db_client = pymongo.MongoClient(HOST + ':' + PORT)
    db_client.admin.command('ping')
    db = db_client[DB_NAME]
    print('Conecction established successfully!')
except pymongo.errors.ConnectionFailure:
    print('Connection cannot be established')
    # Stop the run here: with db still None, every later cell would fail
    # with an unrelated and less helpful error.
    raise

In [None]:
# Handle to the 'history' collection, which the tweet texts are read from
HISTORY_COL = db['history']

### Getting all texts and removing duplicates

In [None]:
# Fetch every document, projecting only 'full_text' ('_id' is excluded).
# The cursor is materialised into a list immediately: a pymongo cursor can be
# consumed only once, so re-running the cell that builds the DataFrame would
# otherwise silently produce an empty frame.
tweets = list(HISTORY_COL.find({}, {'_id': 0, 'full_text': 1}))

In [None]:
# One row per document returned by the query
tweets_df =  pd.DataFrame(list(tweets))

In [None]:
# Size before cleaning; compare with the shape shown after de-duplication
tweets_df.shape

In [None]:
# Remove the retweet marker 'RT '.
# The pattern starts at a word boundary so only the standalone token is removed;
# a plain substring replace also mangled words that merely end in 'RT'
# (e.g. 'SMART city' -> 'SMAcity'). regex is set explicitly because the default
# of Series.str.replace differs between pandas versions.
tweets_df['full_text'] = tweets_df['full_text'].str.replace(r'\bRT ', '', regex = True)

In [None]:
# Keep a single copy of every repeated row (original index labels are preserved)
tweets_df = tweets_df.drop_duplicates()

In [None]:
# Size after removing duplicated rows
tweets_df.shape

In [None]:
# Empty label column; presumably filled in by hand in the exported CSV (see the Labeling section)
tweets_df['sentiment'] = None

In [None]:
# Preview of the cleaned, still unlabeled data
tweets_df.head()

### Labeling

In [None]:
# Export the tweets as a '|'-separated CSV without the index column.
# NOTE(review): the ./data directory must already exist, to_csv does not create it.
tweets_df.to_csv('./data/medellin_tweets.csv', index = False, sep = '|')

<br />
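<span style="color: red;">Labeling... (manual step: fill in the <code>sentiment</code> column of the exported file and save the result as <code>./data/medellin_tweets_labeled.csv</code>, comma-separated, before running the next cell)</span>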
<br /><br />

In [None]:
# Reload the data from the labeled file; this replaces the frame built from MongoDB.
# NOTE(review): this file is read with ',' as separator while the export cell writes
# with '|' — confirm the labeled file really is comma-separated.
tweets_df = pd.read_csv('./data/medellin_tweets_labeled.csv', sep = ',')

In [None]:
# Preview of the labeled data
tweets_df.head()

In [None]:
# Relative frequency of each sentiment value; missing labels (NaN) are counted too
tweets_df['sentiment'].value_counts(dropna = False, normalize = True)

### Leaving out unlabeled texts: they cannot be used to train or validate a supervised model

In [None]:
# Rows whose 'sentiment' value is present
has_label = tweets_df['sentiment'].notna()
tweets_labeled_df = tweets_df.loc[has_label]

In [None]:
# Number of labeled rows available for training and testing
tweets_labeled_df.shape

In [None]:
# Rows whose 'sentiment' value is missing
lacks_label = tweets_df['sentiment'].isna()
tweets_nolabeled_df = tweets_df.loc[lacks_label]

In [None]:
# Number of rows left without a label
tweets_nolabeled_df.shape

### Vectorizing text using BOW

<img src="./imgs/bow.png" style="height: 250px;">

In [None]:
# One TweetTokenizer shared by every call: the vectorizer invokes tokenizer() once
# per document, so building a new instance each time is wasted work.
_TWEET_TOKENIZER = TweetTokenizer()


def tokenizer(text):
    """Split a tweet into tokens with NLTK's TweetTokenizer.

    Parameters
    ----------
    text : str
        Text of a single tweet.

    Returns
    -------
    list of str
        The tokens, in order of appearance.
    """
    return _TWEET_TOKENIZER.tokenize(text)

In [None]:
# Bag-of-words vectorizer built on the tweet-aware tokenizer.
# - stop_words is converted to a (sorted) list: scikit-learn documents 'english',
#   a list or None for this parameter, and releases with parameter validation
#   reject a set.
# - token_pattern = None because the default pattern is ignored whenever a custom
#   tokenizer is supplied; leaving it set only triggers a warning.
vectorizer = CountVectorizer(tokenizer = tokenizer, stop_words = sorted(stop_words), token_pattern = None)

In [None]:
# Learn the vocabulary from the labeled tweets and build the document-term count matrix.
# NOTE(review): the vocabulary is fitted on all labeled tweets, i.e. before the
# train/test split, so test-set tokens shape the feature space — consider splitting first.
X = vectorizer.fit_transform(tweets_labeled_df['full_text'])

In [None]:
# First ten vocabulary entries.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (it returns a NumPy array).
vectorizer.get_feature_names_out()[:10]

### Splitting train and test datasets

In [None]:
# 70/30 split, stratified on the label so both partitions keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    tweets_labeled_df['sentiment'],
    test_size = 0.3,
    stratify = tweets_labeled_df['sentiment'],
    random_state = 1,
)

In [None]:
# Class proportions in the training partition
pd.Series(y_train).value_counts(normalize = True)

In [None]:
# Class proportions in the test partition; should match the training ones because the split is stratified
pd.Series(y_test).value_counts(normalize = True)

### Training a first model

In [None]:
# Logistic regression with default hyper-parameters; random_state is fixed to a constant
logistic_model = LogisticRegression(random_state = 2)

In [None]:
# Fit the classifier on the training partition only
logistic_model.fit(X_train, y_train)

In [None]:
# Predict on both partitions so training and test behaviour can be compared
y_train_predict = logistic_model.predict(X_train)
y_test_predict = logistic_model.predict(X_test)

### Validating how this first model behaves

<img src="./imgs/confusion-matrix.png" style="height: 300px;">

In [None]:
# Confusion matrix on the training partition (true labels vs. predictions)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_predict)

In [None]:
# Confusion matrix on the held-out test partition
ConfusionMatrixDisplay.from_predictions(y_test, y_test_predict)

<img src="./imgs/precision-recall.png" style="height: 700px;">

In [None]:
# Precision, recall and F1 on the test partition, printed in that order.
# NOTE(review): the metrics are called with scikit-learn's default averaging, which
# presumably requires 'sentiment' to have exactly two classes with the positive one
# encoded as 1 — confirm against the labeled data.
for metric in (precision_score, recall_score, f1_score):
    print(metric(y_test, y_test_predict))

### Next steps

- Vectorize words using TF-IDF:

<img src="./imgs/tf-idf.png" style="height: 350px;">

- Test different algorithms: decision trees (random forest), support vector machines, and others.
- Which model is better, and why?