### 0. Importación de librerías 

En las siguientes líneas de código se importan las librerías y herramientas necesarias para desarrollar el caso de uso.

In [None]:
!pip install nltk

In [None]:
# Librerías para manejo de datos
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar
import numpy as np
np.random.seed(3301)
import re

import seaborn as sns   
from matplotlib import pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
from sklearn.feature_extraction import text
import warnings
warnings.filterwarnings("ignore")

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

# Carga de los datos
A través de la librería **pandas** podemos realizar la carga de datos desde diferentes fuentes de información. En este caso se realizará la carga de archivos planos CSV; el separador de los archivos es la coma (`,`) y se reemplazaron los valores "-" por valores nulos.

## Dataset tweets

In [None]:
# Load the tweets dataset from a comma-separated, UTF-8 encoded CSV file.
# header=0 means the first row of the file provides the column names.
df_tweets = pd.read_csv(
    './archive/mbsa.csv',
    sep=',',
    encoding='utf-8',
    header=0,
)

In [None]:
# Cantidad de datos y número de variables
df_tweets.shape

In [None]:
# Mostrar los datos
df_tweets.head()

In [None]:
# Podemos ver los tipos de todas la variables.
df_tweets.dtypes

In [None]:
df_tweets["Sentiment"].value_counts()

## Dataset Precios Bitcoin
Dado que hay muy pocas muestras neutrales, decidimos usar el dataset de precios de Bitcoin para poder definir cuándo hay un cambio positivo, cuándo uno negativo y cuándo uno neutral.

In [None]:
# Load the Bitcoin price dataset from a comma-separated, UTF-8 encoded CSV file.
# header=0 means the first row of the file provides the column names.
df_precios = pd.read_csv(
    './archive2/coin_Bitcoin.csv',
    sep=',',
    encoding='utf-8',
    header=0,
)

In [None]:
# Cantidad de datos y número de variables
df_precios.shape

In [None]:
# Mostrar los datos
df_precios.head()

In [None]:
# Podemos ver los tipos de todas la variables.
df_precios.dtypes

# Limpieza y preparación de los datos

Primero vamos a ejecutar los pasos de limpieza de los datos relacionados con el tratamiento de ausencias y registros duplicados.

## Tweets

In [None]:
# Do all cleaning and preparation on an independent copy. A plain assignment
# would only create a second name for the same DataFrame, so any in-place
# change made during cleaning would also alter the raw data loaded from disk.
df_tweets_l = df_tweets.copy()

In [None]:
# Se observa que hay ausencias, sin embargo no son una cantidad significativa:
df_tweets_l.isnull().sum()

In [None]:
# Discard every row that has at least one missing value, then discard
# rows that are exact duplicates of an earlier row.
df_tweets_l = df_tweets_l.dropna().drop_duplicates()

In [None]:
# Cantidad de datos y número de variables
df_tweets_l.shape

In [None]:
df_tweets_l["Sentiment"].value_counts()

In [None]:
# Convert the "Date" column to datetimes using the year-month-day format.
# With errors='coerce', values that cannot be parsed become NaT instead of
# raising an exception.
df_tweets_l["Date"]=pd.to_datetime(df_tweets_l["Date"], format='%Y-%m-%d', errors='coerce')

In [None]:
df_tweets_l["Date"].describe()

## Precios

In [None]:
# Do all cleaning and preparation on an independent copy. A plain assignment
# would only create a second name for the same DataFrame, so the column that
# is added and the columns that are dropped in place further down would also
# alter the raw price data loaded from disk.
df_precios_l = df_precios.copy()

In [None]:
# Se observa que hay ausencias, sin embargo no son una cantidad significativa:
df_precios_l.isnull().sum()

In [None]:
# Parse "Date" once and truncate it to midnight so it has day resolution and
# can be joined against day-level dates.
# The previous version forced format='%Y-%m-%d' together with errors='coerce';
# on pandas 2.x a string that carries a time part does not match that format
# exactly and is silently turned into NaT, which would leave the later merge
# without any matching price row. Letting pandas infer the format and then
# normalizing gives the same datetime64 result without that risk.
# NOTE(review): assumes the raw values include a time component (the earlier
# code stripped it through .dt.date) - confirm against the CSV.
df_precios_l["Date"] = pd.to_datetime(df_precios_l["Date"], errors='coerce').dt.normalize()

In [None]:
df_precios_l["Date"].describe()

In [None]:
# Variation of each row: closing price minus opening price.
df_precios_l["Variacion"] = df_precios_l["Close"].sub(df_precios_l["Open"])

In [None]:
# Remove the listed columns from the price DataFrame in place.
df_precios_l.drop(
    columns=["SNo", "Name", "Symbol", "High", "Low", "Open", "Close", "Volume", "Marketcap"],
    inplace=True,
)

In [None]:
df_precios_l.dtypes

## Union de datasets

In [None]:
# Left join: every tweet row is kept and receives the price columns of the
# row with the same "Date"; tweets without a matching date get missing values.
df_datos = pd.merge(
    df_tweets_l,
    df_precios_l,
    how="left",
    left_on='Date',
    right_on='Date',
)

In [None]:
df_datos.head()

In [None]:
df_datos.describe()

In [None]:
# Thresholds used to label the variation: the 33rd percentile is the upper
# bound of the "negative" class and the 66th percentile is the lower bound of
# the "positive" class.
percentile_negative, percentile_positive = (
    df_datos["Variacion"].quantile(q) for q in (0.33, 0.66)
)

In [None]:
# Label each row from its variation:
#    1 -> above the positive threshold
#   -1 -> below the negative threshold
#    0 -> between both thresholds (inclusive)
# Rows whose variation is missing match none of the conditions and stay NaN.
df_datos["Influencia"] = np.select(
    [
        df_datos["Variacion"] > percentile_positive,
        df_datos["Variacion"] < percentile_negative,
        df_datos["Variacion"].between(percentile_negative, percentile_positive),
    ],
    [1, -1, 0],
    default=np.nan,
)

In [None]:
df_datos["Influencia"].value_counts()

In [None]:
# Remove the columns that were only needed to build the "Influencia" label.
df_datos.drop(columns=["Date", "Sentiment", "Variacion"], inplace=True)

In [None]:
df_datos.head()

In [None]:
# Reduce the tweet text to plain ASCII: NFKD decomposition separates accents
# from their base letters, encoding to ASCII with errors='ignore' discards
# every character that has no ASCII form, and decoding returns str values.
df_datos["text"] = (
    df_datos["text"]
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

## Limpieza

In [None]:
# Cap the dataset at the first 1,000,000 rows.
# The label-based slice .loc[1:1000000] started at index label 1 and therefore
# silently discarded the first row; a positional slice keeps it. The explicit
# copy makes the result an independent DataFrame, so the column assignments
# performed in the following cells do not operate on a slice of another frame.
df_datos = df_datos.iloc[:1000000].copy()

In [None]:
df_datos.shape

In [None]:
# Lookup table that maps a lowercase English contraction to its expanded form.
# Keys are matched against whole whitespace-separated words by clean_text.
# Source: http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [None]:
# Patterns and helpers are built once here instead of on every call, because
# clean_text is applied to every row of the dataset.
_URL_PATTERN = re.compile(r'https?://\S+')
_HTML_RESIDUE_PATTERN = re.compile(r'<a href|<br\s*/?>')
_UNWANTED_CHARS_PATTERN = re.compile(r'[_"\-;%()|+&=*.,!?:#$@\[\]/\']')
_WORD_TOKENIZER = nltk.WordPunctTokenizer()
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def clean_text(text, remove_stopwords = True):
    """Normalize a raw text and return it as a list of tokens.

    Steps, in order: lowercase; expand contractions found in the
    ``contractions`` table; remove URLs; remove ``<a href`` / ``<br />``
    residue and ``&amp;``; replace punctuation and apostrophes with spaces;
    optionally remove English stopwords; tokenize.

    Args:
        text: The string to clean.
        remove_stopwords: When true, NLTK English stopwords are removed.

    Returns:
        A list of string tokens produced by ``nltk.WordPunctTokenizer``.
    """
    # Lowercase first so the lookups below (contractions, stopwords) match.
    words = text.lower().split()

    # Expand contractions word by word; unknown words are kept unchanged.
    text = " ".join(contractions.get(word, word) for word in words)

    # Remove only the URL itself. After the join above the text is a single
    # line, so a greedy ".*" would delete everything that follows a link.
    text = _URL_PATTERN.sub('', text)

    # HTML residue has to be handled before the punctuation step: that step
    # replaces "/" with a space, after which "<br />" can no longer match.
    text = _HTML_RESIDUE_PATTERN.sub(' ', text)

    # Replace the entity with a space so "a&amp;b" does not become "ab".
    text = text.replace('&amp;', ' ')

    # Punctuation and apostrophes become word separators.
    text = _UNWANTED_CHARS_PATTERN.sub(' ', text)

    if remove_stopwords:
        stops = _english_stopwords()
        text = " ".join(word for word in text.split() if word not in stops)

    return _WORD_TOKENIZER.tokenize(text)

In [None]:
# Replace every raw text with its cleaned, tokenized form (a list of tokens).
df_datos['text'] = [clean_text(raw_text) for raw_text in df_datos.text]

In [None]:

def lemmatized_words(text):
    """Lemmatize every token of every tokenized document in *text*.

    The previous version ignored its argument, read the global ``df_datos``
    and wrote the result into it without returning anything. The function now
    works only on what it receives and returns the result; the caller decides
    where to store it.

    Args:
        text: An iterable of documents, each document being an iterable of
            string tokens.

    Returns:
        A list with one list of lemmatized tokens per input document.
    """
    lemm = nltk.stem.WordNetLemmatizer()
    return [[lemm.lemmatize(token) for token in tokens] for tokens in text]


# Store the lemmas next to the tokens they were derived from.
df_datos['lemmatized'] = lemmatized_words(df_datos.text)

In [None]:
df_datos.head(3)


In [None]:
pd.set_option('max_colwidth', 500)
df_datos[['Influencia', 'text', 'lemmatized']].sample(3)

In [None]:
# The documents are already lists of tokens, so the tokenizer is the identity
# function and lowercasing is disabled (it expects strings, not lists).
bow_converter = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
x = bow_converter.fit_transform(df_datos['text'])

# get_feature_names() was removed in scikit-learn 1.2; its replacement is
# get_feature_names_out(). It returns an array, converted to a list here so
# that `words` keeps the type it had before.
words = list(bow_converter.get_feature_names_out())
len(words)

In [None]:
# Bigram vocabulary over the already-tokenized documents.
# ngram_range is documented as a tuple (min_n, max_n); recent scikit-learn
# versions validate the parameter type and reject a list.
bigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(2, 2), lowercase=False)
x2 = bigram_converter.fit_transform(df_datos['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# keep `bigrams` as a list, as before.
bigrams = list(bigram_converter.get_feature_names_out())
len(bigrams)

In [None]:
bigrams[-10:]

In [None]:
# Trigram vocabulary over the already-tokenized documents.
# ngram_range is documented as a tuple (min_n, max_n); recent scikit-learn
# versions validate the parameter type and reject a list.
trigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(3, 3), lowercase=False)
x3 = trigram_converter.fit_transform(df_datos['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# keep `trigrams` as a list, as before.
trigrams = list(trigram_converter.get_feature_names_out())
len(trigrams)

In [None]:
trigrams[:10]


In [None]:
print(len(words), len(bigrams), len(trigrams))


In [None]:
# Plot the vocabulary size obtained with unigrams, bigrams and trigrams.
sns.set_style("white")
counts = [len(words), len(bigrams), len(trigrams)]
# The same three values are drawn twice: once as a line, once as blue dots.
plt.plot(counts, color='blue')
plt.plot(counts, 'bo')
#plt.margins(0.1)
# Show the axis values as plain numbers instead of scientific notation.
plt.ticklabel_format(style = 'plain')
# Positions 0, 1 and 2 on the x axis are labelled with the n-gram type.
plt.xticks(range(3), ['unigram', 'bigram', 'trigram'])
plt.tick_params(labelsize=14)
plt.title('Number of ngrams ', {'fontsize':16})
plt.show()

# Modelo

In [None]:
# Split the rows into 70% for training and the remainder for testing.
# A fixed random_state makes the split reproducible between runs.
training_data, test_data = sklearn.model_selection.train_test_split(df_datos, train_size = 0.7, random_state=42)

In [None]:
print(training_data.shape)
print(test_data.shape)

In [None]:
# Vectorizer used by the model: counts of trigrams over the already-tokenized
# documents (identity tokenizer, no lowercasing).
# ngram_range is documented as a tuple (min_n, max_n); recent scikit-learn
# versions validate the parameter type and reject a list.
bow_transform = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=(3, 3), lowercase=False)

In [None]:
X_tr_bow = bow_transform.fit_transform(training_data['text'])

In [None]:
len(bow_transform.vocabulary_)

In [None]:
X_tr_bow.shape

In [None]:
X_te_bow = bow_transform.transform(test_data['text'])

In [None]:
y_tr = training_data['Influencia']
y_te = test_data['Influencia']

In [None]:
tfidf_transform = text.TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_transform.fit_transform(X_tr_bow)

In [None]:
X_te_tfidf = tfidf_transform.transform(X_te_bow)


In [None]:
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """Train a logistic regression, report its test score and return it.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        description: Name of the feature set, used only in the printed message.
        _C: Value passed to ``LogisticRegression`` as its ``C`` parameter.

    Returns:
        The fitted ``LogisticRegression`` model.
    """
    classifier = LogisticRegression(C=_C)
    classifier.fit(X_tr, y_tr)
    test_score = classifier.score(X_test, y_test)
    print('Test Score with', description, 'features', test_score)
    return classifier

In [None]:
model_bow = simple_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
model_tfidf = simple_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tf-idf')

In [None]:
# Candidate values for the C parameter of the logistic regression.
param_grid_ = {'C': [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2]}

# Two independent, identically configured 5-fold grid searches: one to be
# fitted on the bag-of-words features and one on the tf-idf features.
bow_search, tfidf_search = (
    sklearn.model_selection.GridSearchCV(LogisticRegression(), param_grid=param_grid_, cv=5)
    for _ in range(2)
)

In [None]:
bow_search.fit(X_tr_bow, y_tr)

In [None]:
bow_search.best_score_

In [None]:
tfidf_search.fit(X_tr_tfidf, y_tr)

In [None]:
tfidf_search.best_score_


In [None]:
bow_search.best_params_


In [None]:
tfidf_search.best_params_


In [None]:
bow_search.cv_results_