In [1]:
# 

applying deep learning on twitter’s sentiment analysis

*   Train Model - use keras to build and train a deep neural network model

*   Evaluate Model - measure the accuracy of the predictive model, and suggest further improvements


IMPORTING DATASET


In [1]:
from time import time
import pandas as pd
import numpy as np
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
#being able to read csv stored in google drive 
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Reading the dataset with no columns titles and with latin encoding 
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP/tweetsClean.csv')
df.sample(3)



Unnamed: 0.1,Unnamed: 0,date,year,clean,url,tags,promote
544671,544671,2018-07-05,2018,kobo btc usd ngn zar kes kobocoin,,Kobocoin,
562714,562714,2018-07-31,2018,every buy buy every sell buy remeber folks btc...,,,
2795051,2795051,2019-07-01,2019,onecoin ceo denied bail read crypto jail nyc b...,https://t.co/Egi2CkzYp5,bitcoin jail nyc crypto,


In [4]:
# Checking if there is any missing value and datatype 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7080772 entries, 0 to 7080771
Data columns (total 7 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Unnamed: 0  int64 
 1   date        object
 2   year        int64 
 3   clean       object
 4   url         object
 5   tags        object
 6   promote     object
dtypes: int64(2), object(5)
memory usage: 378.2+ MB


In [5]:

# checking for null values, if any
df.isnull().sum()

Unnamed: 0        0
date              0
year              0
clean         15390
url               0
tags           9289
promote           0
dtype: int64

In [6]:
#ditching all row when text is null, as need text for analysis
df.dropna(how='any', inplace=True)

In [5]:
df.sample(3)

Unnamed: 0.1,Unnamed: 0,date,year,clean,url,tags,promote
797778,797778,2019-01-19,2019,exchange make fiat pairing available least for...,,,
3485925,3485925,2019-07-22,2019,use referral link sign get usd crypto btc mco cro,https://t.co/S3qN709MjY https://t.co/qlc2UYkIzd,btc cro mco crypto,
3006068,3006068,2019-07-08,2019,binance btc market elf unusual selling activit...,,ELF,


stop

testing some embedding for deep learning

embed

In [7]:
df['tags'].sample(10000)

dtype('O')

In [None]:
# Defining the window for context
window = 2

# Creating a placeholder for the scanning of the word list
word_lists = []
all_text = []

for text in df['tags'].sample(10000):

    # Appending to the all text list
    all_text += text 

    # Creating a context dictionary
    for i, word in enumerate(text):
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text): 
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words    
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])

print(len(word_lists)), print(len(all_text))


In [13]:
def create_unique_word_dict(text:list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and key values are indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    return unique_word_dict 

In [15]:
unique_word_dict = create_unique_word_dict(all_text)
unique_word_dict

{'a': 0, 'c': 1, 'e': 2, 'l': 3, 'n': 4}

In [None]:
# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words 
words = list(unique_word_dict.keys())

In [None]:
# Creating the X and Y matrices using one hot encoding
X = []
Y = []

In [None]:
for i, word_list in tqdm(enumerate(word_lists)):
    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])

    # Creating the placeholders   
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words 
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)

In [None]:
# Converting the matrices into a sparse format because the vast majority of the data are 0s
X = sparse.csr_matrix(X)
Y = sparse.csr_matrix(Y)

We now have X and Y matrices built from the focus word and context word pairs. The next step is to choose the embedding dimension. I will choose the dimension to be equal to 2 in order to later plot the words and see whether similar words form clusters.

In [None]:
# Defining the size of the embedding
embed_size = 2


The output layers activation function is softmax. The activation function of the hidden layer is linear. The input dimension is equal to the total number of unique words (remember, our X matrix is of the dimension n x 21). Each input node will have two weights connecting it to the hidden layer. These weights are the word embeddings! After the training of the network, we extract these weights and remove all the rest. We do not necessarily care about the output.

In [None]:
# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [None]:
# Optimizing the network weights
model.fit(
    x=X, 
    y=Y, 
    batch_size=256,
    epochs=1000
    )

In [None]:
# Obtaining the weights from the neural network. 
# These are the so called word embeddings

# The input layer 
weights = model.get_weights()[0]


In [None]:
# Creating a dictionary to store the embeddings in. The key is a unique word and 
# the value is the numeric vector
embedding_dict = {}
for word in words: 
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
        })

In [None]:
# Ploting the embeddings
plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))       


EXTRACTING FEATURES FROM CLEANED TWEETS 10 min

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
#bag of words = OPTION A
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(df['clean'])
bow.shape

(7056094, 1000)

In [None]:
#TfIdf = OPTION B 

tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df['clean'])
tfidf.shape

(7056094, 1000)

Word 2 Vec : KeyError: "word 'eth vs btc relative vol spread interesting junction esp given btc dominance v alt season sentiment participants cryptooptions releativevalue' not in vocabulary"
or  "word 'bizpaye trading platform system unique never done history modern day trade exchanges bizpaye marketplace hodl bartercredit crypto cryptotrading btc onlineshopping merchants ecommerce bb bc retail' not in vocabulary"

PREPARE FOR MODELING






---



DEFINING X and Y

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
#1- vectoring data
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector


In [None]:
#2- CREATING a FAKE Y
#ate 11 dec 2017
#ate 10 dec 2018
#ate end

def senti(x):
  if x < 2018:
    return 'BULL'
  elif x > 2018:
    return 'BULL2'
  else:
    return 'BEAR'

df['sent'] = df['year'].apply(lambda x: senti(x) )
df.tail(3)

Unnamed: 0.1,Unnamed: 0,date,text,year,month,day,text1,url,tags,promote,clean,sent
7080769,21513683,2019-11-23,@ABC Setup your FREE account Now : https://t.c...,2019,11,23,@ABC Setup your FREE account Now : https://t.c...,https://t.co/J2f8AlXFqZ https://t.co/J2f8AlXFqZ,Crypto Bitcoin btc Cryptocurrency BTC,,setup free account automatic bitcome get paid ...,BULL2
7080770,21513685,2019-11-23,"@OJRenick So you don't need bitcoin, aye? http...",2019,11,23,"@OJRenick So you don't need bitcoin, aye? http...",https://t.co/F8QCKgKM8Y,,,need bitcoin aye,BULL2
7080771,21540059,2019-11-23,$BTC - an update on the longer term view for B...,2019,11,23,$BTC - an update on the longer term view for B...,https://t.co/yBEMdy9pwp,,,btc update longer term view btc price action s...,BULL2


In [None]:
df['sent'].value_counts()

BULL2    6310080
BEAR      507078
BULL      238936
Name: sent, dtype: int64

In [None]:
#splitting - takes 5 minutes
tf_vector = get_feature_vector(np.array(df['clean']).ravel())
X = tf_vector.transform(np.array(df['clean']).ravel())

In [None]:
X[0]

<1x950508 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [None]:
y = np.array(df['sent']).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

LAUNCHING MODEL BASES after 1 hour of running the preproc



In [None]:
# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

0.9115918196509969


In [None]:
# Training Logistics Regression model - reducing to solver lbfgs for 5 min cause libelinear or newton_cg are to expansive and take 12 good minutes
LR_model = LogisticRegression(solver='lbfgs', max_iter=100)
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9313789635346077


In [None]:
#SVM - takes 15 min
from sklearn import svm
svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train)
y_predict_svm =  svc.predict_proba(X_test)


In [None]:
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_predict_svm))
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_predict_svm))
# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_predict_svm))

In [None]:
#Training Random Forest still nothing after 37 mn

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=0)
rf.fit(X_train, y_train) 
y_predict_rf = rf.predict_proba(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(accuracy_score(y_test, y_predict_rf))
print(confusion_matrix(y_test,y_predict_rf))
print(classification_report(y_test,y_predict_rf))
print(accuracy_score(y_test, y_predict_rf))

In [None]:
# TRaining XGB
from xgboost import XGBClassifier
xgb = XGBClassifier(max_depth=6, n_estimators=1000).fit(X_train, y_train)
y_predict_xgb = xgb.predict(X_test)
print(accuracy_score(yvalid, y_predict_xgb))

GO FURTHER
score pour chaque column / mot - lesquels ont ete le plus utilise pour predire
carac du model ou PCA (mix de col qui marchent le mieux, qu est ce qui max la variance et apporte le plus d info) => qu est ce qui a ete utilise le plus par le modele

ajouter d autres colonnes avec  ou essayer d autres modeles comme RF ou classifier plus finement les Y ou faire un clustering non supervise, si pas de Y, patterns par time, plusieurs clusters, can it work

In [None]:
mport numpy as np
from keras.preprocessing.sequence import pad_sequences
     

class TextToTensor():

    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def string_to_tensor(self, string_list: list) -> list:
        """
        A method to convert a string list to a tensor for a deep learning model
        """    
        string_list = self.tokenizer.texts_to_sequences(string_list)
        string_list = pad_sequences(string_list, maxlen=self.max_len)
        
        return string_list

In [None]:
# Tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# Getting the longest sentence
max_len = np.max([len(text.split()) for text in X_train])
# Converting to tensor
TextToTensor_instance = TextToTensor(
tokenizer=tokenizer,
max_len=max_len
)
X_train_NN = TextToTensor_instance.string_to_tensor(X_train)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding

model = Sequential()
model.add(Embedding(
  input_dim=44, 
  output_dim=3, 
  input_length=max_len))

model.compile('rmsprop', 'mse')
output_array = model.predict(X_train_NN)[0]

In [None]:
embed_path = 'embeddings\\glove.840B.300d.txt'
embed_dim = 300
# Tokenizing the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# Creating the embedding matrix
embedding = Embeddings(embed_path, embed_dim)
embedding_matrix = embedding.create_embedding_matrix(tokenizer, len(tokenizer.word_counts))

FINE TUNING







Feature importance :

*   from model coefficients.
*   from decision trees
*   from permutation testing.
