In [None]:
# 

Applying deep learning to Twitter sentiment analysis

*   Train Model - use keras to build and train a deep neural network model

*   Evaluate Model - measure the accuracy of the predictive model, and suggest further improvements


IMPORTING DATASET


In [None]:
from time import time
import pandas as pd
import numpy as np
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Mount Google Drive so the CSV stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Load the cleaned tweets CSV from the mounted Drive folder.
# NOTE(review): only the path is passed, so pandas applies its defaults for the
# header row and the encoding; no header/encoding override is set here.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP/tweetsClean.csv')
# Show 3 random rows as a quick sanity check.
df.sample(3)



Unnamed: 0.1,Unnamed: 0,date,year,clean,url,tags,promote
1095120,1095120,2019-05-13,2019,lack price uniformity market via icos investor...,,crypto blockchain toqqn ooobtc ethereum obx bi...,
4585477,4585477,2019-08-26,2019,new episode cnbc ’ crypto trader head global c...,https://t.co/kEILCVV97t,Money global Crypto analytics Bitcoin’s custom...,
1280441,1280441,2019-05-17,2019,bitcoin btc price analysis danger ahead ’ look...,https://t.co/hNNOAC5yl3 https://t.co/lHMOLyy38O,cryptocurrencynews eth ethereum bitcoin liteco...,


In [None]:
# Inspect the column dtypes and the memory footprint of the frame.
# NOTE(review): the recorded output lists dtypes only (no non-null counts), so
# missing values are checked separately with isnull().sum().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7080772 entries, 0 to 7080771
Data columns (total 7 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Unnamed: 0  int64 
 1   date        object
 2   year        int64 
 3   clean       object
 4   url         object
 5   tags        object
 6   promote     object
dtypes: int64(2), object(5)
memory usage: 378.2+ MB


In [None]:

# Count the missing (null) values in each column.
df.isnull().sum()

Unnamed: 0        0
date              0
year              0
clean         15390
url               0
tags           9289
promote           0
dtype: int64

In [None]:
# Drop only the rows whose cleaned text is missing, because the text is what gets
# vectorised. Rows that merely lack another field (e.g. tags) still carry usable
# text, so they are kept rather than being discarded by how='any'.
df = df.dropna(subset=['clean'])

In [None]:
# Preview 3 random rows of the frame after the null-dropping step.
df.sample(3)

Unnamed: 0.1,Unnamed: 0,date,year,clean,url,tags,promote
1615388,1615388,2019-05-27,2019,nice sunday funday skeet skeet finally burp re...,https://t.co/2Rp2iSq8Tk,,
3547929,3547929,2019-07-23,2019,picking baskets going balls deep next years le...,,,
219531,219531,2017-12-09,2017,buy gold silver bitcoin panama gold silver pan...,https://panamagoldbullion.com/,Gold Silver Bitcoin Panama,


EXTRACTING FEATURES FROM CLEANED TWEETS (takes about 10 min)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Bag of words = OPTION A: raw term counts over a capped vocabulary.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
bow = bow_vectorizer.fit_transform(df['clean'])
bow.shape

(7056094, 1000)

In [None]:
# TF-IDF = OPTION B: same vocabulary limits as the bag of words, TF-IDF weighted.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
tfidf = tfidf_vectorizer.fit_transform(df['clean'])
tfidf.shape

(7056094, 1000)

Word2Vec was also tried, but it failed with KeyError: "word 'eth vs btc relative vol spread interesting junction esp given btc dominance v alt season sentiment participants cryptooptions releativevalue' not in vocabulary"
or "word 'bizpaye trading platform system unique never done history modern day trade exchanges bizpaye marketplace hodl bartercredit crypto cryptotrading btc onlineshopping merchants ecommerce bb bc retail' not in vocabulary". In both cases the "word" in the message is an entire tweet, which suggests each tweet was looked up as a single token instead of being split into words first.

PREPARE FOR MODELING






---



DEFINING X and Y

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
#1- vectorising data
def get_feature_vector(train_fit):
    """Fit a TF-IDF vectorizer on the given texts and return it.

    Args:
        train_fit: iterable of raw text documents the vectorizer is fitted on.

    Returns:
        The fitted ``TfidfVectorizer`` (created with ``sublinear_tf=True``).
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True)
    # fit() returns the vectorizer itself, so it can be returned directly.
    return vectorizer.fit(train_fit)


In [None]:
#2- CREATING a FAKE Y (a proxy label derived from the tweet's year)
# NOTE(review): the original notes read "ate 11 dec 2017 / ate 10 dec 2018 / ate end",
# presumably meaning "until" those dates; the function below only looks at the
# year, so confirm whether exact date boundaries were intended.

def senti(x):
  """Map a year to a market-phase label.

  Returns 'BULL' for values below 2018, 'BULL2' for values above 2018,
  and 'BEAR' otherwise (i.e. for 2018 itself).
  """
  if x > 2018:
    return 'BULL2'
  if x < 2018:
    return 'BULL'
  return 'BEAR'

# Label every tweet from its year, then peek at the last rows to check the result.
df['sent'] = df['year'].apply(senti)
df.tail(3)

Unnamed: 0.1,Unnamed: 0,date,text,year,month,day,text1,url,tags,promote,clean,sent
7080769,21513683,2019-11-23,@ABC Setup your FREE account Now : https://t.c...,2019,11,23,@ABC Setup your FREE account Now : https://t.c...,https://t.co/J2f8AlXFqZ https://t.co/J2f8AlXFqZ,Crypto Bitcoin btc Cryptocurrency BTC,,setup free account automatic bitcome get paid ...,BULL2
7080770,21513685,2019-11-23,"@OJRenick So you don't need bitcoin, aye? http...",2019,11,23,"@OJRenick So you don't need bitcoin, aye? http...",https://t.co/F8QCKgKM8Y,,,need bitcoin aye,BULL2
7080771,21540059,2019-11-23,$BTC - an update on the longer term view for B...,2019,11,23,$BTC - an update on the longer term view for B...,https://t.co/yBEMdy9pwp,,,btc update longer term view btc price action s...,BULL2


In [None]:
# Class balance of the proxy labels (number of tweets per label).
df['sent'].value_counts()

BULL2    6310080
BEAR      507078
BULL      238936
Name: sent, dtype: int64

In [None]:
# Vectorising (the train/test split happens later) - takes 5 minutes.
# Fit TF-IDF on every cleaned tweet, then encode those same tweets as the
# feature matrix X.
clean_texts = np.array(df['clean']).ravel()
tf_vector = get_feature_vector(clean_texts)
X = tf_vector.transform(clean_texts)

In [None]:
# Inspect the first row of the TF-IDF feature matrix.
X[0]

<1x950508 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [None]:
# Targets are the proxy labels; hold out 30% of the rows for testing, with a
# fixed seed so the split is repeatable.
y = np.array(df['sent']).ravel()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

LAUNCHING BASELINE MODELS (after about 1 hour of running the preprocessing)



In [None]:
# Naive Bayes baseline: fit on the training split, then report accuracy on the
# held-out split.
NB_model = MultinomialNB().fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_predict_nb))

0.9115918196509969


In [None]:
# Training Logistic Regression model - using the lbfgs solver (about 5 min at
# 100 iterations) because liblinear and newton-cg are too expensive here and
# take a good 12 minutes.
# With max_iter=100 lbfgs stopped at the iteration limit before converging, so
# the reported accuracy came from an unconverged model. The budget is raised;
# the solver still stops early as soon as it converges.
LR_model = LogisticRegression(solver='lbfgs', max_iter=1000)
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9313789635346077


In [None]:
# SVM with a linear kernel - takes 15 min
from sklearn import svm
svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train)
# Use hard class predictions: the accuracy/precision/recall computed on
# y_predict_svm expect labels, and predict_proba is not available unless the
# SVC is constructed with probability=True.
y_predict_svm = svc.predict(X_test)


In [None]:
from sklearn import metrics

# y_predict_svm must hold predicted class labels (not probabilities).
# Model Accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_predict_svm))
# The target has three classes (BULL / BEAR / BULL2), so precision and recall
# need an explicit averaging strategy: the default average='binary' raises on
# multiclass targets. 'macro' gives every class equal weight, which keeps the
# small classes visible despite the heavy class imbalance.
# Model Precision: of the tweets predicted as a class, what share truly belongs to it?
print("Precision:", metrics.precision_score(y_test, y_predict_svm, average='macro'))
# Model Recall: of the tweets that truly belong to a class, what share is predicted as such?
print("Recall:", metrics.recall_score(y_test, y_predict_svm, average='macro'))

In [None]:
# Training Random Forest - still nothing after 37 min with 1000 trees on the
# full TF-IDF matrix.

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=0)
rf.fit(X_train, y_train)
# Predict class labels: y_predict_rf is passed to accuracy_score,
# confusion_matrix and classification_report, which all expect labels rather
# than the per-class probability matrix returned by predict_proba.
y_predict_rf = rf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Random Forest evaluation: overall accuracy, confusion matrix, then the
# per-class precision/recall/F1 report. y_predict_rf must hold class labels.
# (The accuracy used to be printed a second time at the end; the duplicate is removed.)
print(accuracy_score(y_test, y_predict_rf))
print(confusion_matrix(y_test, y_predict_rf))
print(classification_report(y_test, y_predict_rf))

In [None]:
# Training XGBoost
from xgboost import XGBClassifier
# NOTE(review): y_train holds string labels; some xgboost versions require
# integer-encoded class labels - confirm against the installed version.
xgb = XGBClassifier(max_depth=6, n_estimators=1000).fit(X_train, y_train)
y_predict_xgb = xgb.predict(X_test)
# Score against the held-out labels from the train/test split; the previous
# reference `yvalid` is not defined anywhere in this notebook.
print(accuracy_score(y_test, y_predict_xgb))

GO FURTHER
Score for each column / word: which ones were used the most to make the predictions.
Model characteristics or PCA (the mix of columns that works best, what maximises the variance and brings the most information) => what was used the most by the model.

Add other columns, or try other models such as RF, or classify the Y labels more finely, or run an unsupervised clustering (if there is no Y): patterns over time, several clusters. Can it work?

FINE TUNING







Feature importance :

*   from model coefficients.
*   from decision trees
*   from permutation testing.
