In [None]:
import string

#Database extraction
import json
import psycopg2
from config import config
from sqlalchemy import create_engine
from config import config

#Data Analysis
import pandas as pd
pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows', 5000)
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [None]:
# Load the whole `confinamientodomiciliario` table into a DataFrame.
# Connection parameters come from the project-local `config()` helper.
db_params = config()
conn = psycopg2.connect(**db_params)
try:
    # read_sql_query manages its own cursor, so no explicit cursor is opened here.
    raw_data = pd.read_sql_query('SELECT * FROM confinamientodomiciliario', conn)
finally:
    # Always release the connection, even when the query raises.
    conn.close()

In [None]:
# Preview the first rows of the freshly loaded table (head() defaults to 5 rows).
raw_data.head()

In [None]:
# Work on a copy so the frame loaded from the database stays untouched.
df_raw = raw_data.copy()
# Per-column count of missing values.
null_counts = df_raw.isna().sum()
print('cantidad de valores nulos: \n', null_counts)

There are no null (NaN) values in the data at this point.

In [None]:
# Load the working dataset from the CSV file in the current directory.
# NOTE(review): `df_raw` built above is not used by the cells shown here;
# this CSV is presumably an export of the same table - confirm.
df_processed = pd.read_csv(filepath_or_buffer='confinamientodomiciliarioProcessed.csv')


PUNCT_TO_REMOVE = string.punctuation
# string.punctuation is ASCII-only. Spanish text also uses inverted question and
# exclamation marks, guillemets, curly quotes and the ellipsis character; left in
# place they stay glued to words and defeat later token-level filters.
_EXTRA_PUNCT_TO_REMOVE = "\u00bf\u00a1\u00ab\u00bb\u201c\u201d\u2018\u2019\u2026"
# Built once at definition time instead of on every call.
_PUNCT_TRANSLATION = str.maketrans('', '', PUNCT_TO_REMOVE + _EXTRA_PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """Return `text` with punctuation characters deleted.

    Removes every character in PUNCT_TO_REMOVE (ASCII punctuation) plus the
    common Spanish/typographic marks listed in _EXTRA_PUNCT_TO_REMOVE.
    Characters are deleted, not replaced by spaces, so "a,b" becomes "ab".

    Parameters
    ----------
    text : str
        Input string; must support ``str.translate``.

    Returns
    -------
    str
        The input without punctuation.
    """
    return text.translate(_PUNCT_TRANSLATION)

In [None]:
STOPWORDS = set(stopwords.words('spanish'))
def remove_stopwords(text):
    """Return `text` without Spanish stopwords.

    The input is coerced with ``str()``, split on whitespace, and the
    remaining tokens are re-joined with single spaces. Matching is
    case-insensitive: each token is lower-cased before the lookup because the
    stopword list is lower-case, so capitalised stopwords ("De", "La") are
    removed too. Kept tokens retain their original casing.

    Parameters
    ----------
    text : object
        Value to clean; converted to ``str`` before processing.

    Returns
    -------
    str
        Space-joined tokens that are not stopwords.
    """
    return " ".join(word for word in str(text).split() if word.lower() not in STOPWORDS)

In [None]:
from collections import Counter

# Tally every whitespace-separated token across the `full_text` column.
cnt = Counter(
    token
    for tweet in df_processed["full_text"].values
    for token in tweet.split()
)

# Show the ten most frequent tokens.
cnt.most_common(10)

In [None]:
# The ten most frequent tokens according to `cnt`.
FREQWORDS = {token for token, _count in cnt.most_common(10)}
def remove_freqwords(text):
    """Drop every token found in FREQWORDS and re-join the rest with single spaces.

    The input is coerced with ``str()`` and split on whitespace; the
    comparison against FREQWORDS is exact (case-sensitive).
    """
    kept = []
    for token in str(text).split():
        if token not in FREQWORDS:
            kept.append(token)
    return " ".join(kept)

In [None]:
# Compiled once at definition time. The ranges are limited to emoji/symbol
# blocks: the previous catch-all range U+24C2..U+1F251 also deleted CJK, kana
# and Hangul text, while the U+1F900..U+1FAFF emoji blocks were not matched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F2FF"  # game tiles/cards, enclosed alphanumerics, flags, enclosed ideographs
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)

def remove_emoji(string):
    """Return `string` with emoji and pictographic symbols deleted.

    Matched runs are removed outright (not replaced by a space), so
    surrounding whitespace is left as-is. Letters of any script are kept.

    Parameters
    ----------
    string : str
        Text to clean. The parameter name shadows the ``string`` module inside
        this function only; it is kept for backward compatibility.

    Returns
    -------
    str
        The input without emoji characters.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [None]:
def remove_urls(text):
    """Delete http(s):// links and www.-prefixed links from `text`.

    A link extends up to the next whitespace character; it is removed
    without inserting a replacement, so surrounding spaces remain.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# Text-cleaning pipeline: each stage reads the previous stage's column and
# writes a new one, so intermediate results stay inspectable.
# Every stage is a whole-column operation and therefore runs exactly once;
# wrapping these in a per-row loop would recompute all columns for every row.
df_processed['text_no_url'] = df_processed["full_text"].apply(remove_urls)
df_processed['text_no_punct'] = df_processed["text_no_url"].apply(remove_punctuation)
df_processed['text_no_stop'] = df_processed["text_no_punct"].apply(remove_stopwords)
df_processed["text_no_stopfreq"] = df_processed["text_no_stop"].apply(remove_freqwords)
df_processed['text_no_emojis'] = df_processed["text_no_stopfreq"].apply(remove_emoji)
#df_processed.to_csv('confinamientodomiciliarioProcessed.csv')
df_processed.head(5)

In [None]:
# Score every cleaned text with TextBlob and store the result per row.
# Assigning a scalar to a column inside a row loop overwrites the whole column
# each time, leaving every row with the last row's score; the scores are
# therefore collected first and assigned as full columns.
# NOTE(review): TextBlob's default analyzer is built on an English lexicon and
# these texts are filtered with Spanish stopwords - confirm the scores are
# meaningful for this data.
df_sentiment = df_processed.copy()

sentiment_scores = [TextBlob(text).sentiment for text in df_sentiment['text_no_emojis']]
df_sentiment['sentiment'] = [score.polarity for score in sentiment_scores]
df_sentiment['subjectivity'] = [score.subjectivity for score in sentiment_scores]
df_sentiment.head(5)