In [None]:
# Show the compute devices TensorFlow reports for this runtime.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
#for installing the packages for the 1st time use !pip install [package name]
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy
import re
from html import unescape

!pip install emoji
from emoji import UNICODE_EMOJI

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Folder the input CSVs are read from, and a second project folder (presumably for saved outputs — confirm).
# NOTE(review): both paths sit under /content/drive, but no cell in this notebook mounts Drive — confirm it is mounted beforehand.
data_dir = '/content/drive/MyDrive/SF data science research program/Research Project'
working_dir = '/content/drive/MyDrive/SF data science research program/florence d&g saved things'

In [None]:
# load in all data sets

# Column names handed to read_csv via names= for both Sentiment140 files.
sentiment140_columns = ["label", "id", "timestamp", "query", "user", "text"]

# sentiment 140 data
sentiment140_train = pd.read_csv(
    os.path.join(data_dir, 'Sentiment140.data', 'training.1600000.processed.noemoticon.csv'),
    encoding="latin-1",
    names=sentiment140_columns,
)
# Work on a 10,000-row subsample while testing. The fixed seed returns the same
# rows on every run, so downstream scores are reproducible.
sentiment140_train = sentiment140_train.sample(10000, random_state=42) # testing

sentiment140_test = pd.read_csv(
    os.path.join(data_dir, 'Sentiment140.data', 'testdata.manual.2009.06.14.csv'),
    encoding="latin-1",
    names=sentiment140_columns,
)


# D&G data
dg_chopsticks = pd.read_csv(os.path.join(data_dir, 'D&G data', "dolcegabbana_chopsticks_mentions_daily_expanded.csv"))
dg_general = pd.read_csv(os.path.join(data_dir, 'D&G data', "dolcegabbana_mentions_daily_all.csv"), lineterminator='\n')

### Data Preprocessing:
##### https://towardsdatascience.com/text-preprocessing-steps-and-universal-pipeline-94233cb6725a

In [None]:
# load NLP model
# spaCy pipeline "en_core_web_sm"; the `tokenizer` helper calls it on every tweet.
nlp = spacy.load("en_core_web_sm")

### Data Cleaning

In [None]:
# helper function for pre-processing/cleaning a tweet
def preprocessor(tweet):
    """Normalize a raw tweet before tokenization.

    Steps, in order: unescape HTML entities, replace @mentions and #hashtags
    with placeholder tokens, drop a leading "RT", and lowercase everything
    (so the placeholders come out as ``_at_user_`` / ``_hashtag_``).

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned, lower-cased tweet text.
    """
    # Unescape first: numeric entities such as "&#39;" contain a "#", which the
    # hashtag pattern below would otherwise rewrite into "&_HASHTAG_;".
    tweet = unescape(tweet)
    tweet = re.sub(r'@[A-Za-z0-9_]+', '_AT_USER_', tweet)  # replace @X with _AT_USER_
    tweet = re.sub(r'#[A-Za-z0-9_]+', '_HASHTAG_', tweet)  # replace #X with _HASHTAG_
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # remove RT (retweet) at the start of the tweet
    return tweet.lower()

# helper function for tokenization of a tweet
def tokenizer(tweet):
    """Split a tweet into lemmatized tokens using the module-level spaCy pipeline.

    A token's lemma is kept when either
      * its text consists only of letters, digits, "_" or "-", it is not a
        stop word, and it is longer than two characters; or
      * its text is a key of ``UNICODE_EMOJI``.

    Returns:
        list of lemma strings, in document order.
    """
    kept = []
    for tok in nlp(tweet):
        text = tok.text
        is_word = (
            re.match("^[a-zA-Z0-9_-]*$", text)
            and not tok.is_stop
            and len(text) > 2
        )
        # NOTE(review): this membership test assumes UNICODE_EMOJI is keyed
        # directly by emoji characters — confirm for the installed emoji version.
        if is_word or text in UNICODE_EMOJI:
            kept.append(tok.lemma_)
    return kept

### Creating a Count Vectorizer  (Assigning Value to text for modelling)


In [None]:
# `df` was not assigned by any earlier cell, so this failed on a fresh kernel.
# NOTE(review): binding it to the sampled Sentiment140 training frame is assumed
# from the 'text'/'label' columns and the pos_label=4 metrics further down — confirm.
df = sentiment140_train
corpus = list(df['text']) #the list of sample tweets

In [None]:
corpus

In [None]:
# Build the document-term matrix: one row per tweet, one column per token,
# each cell holding how often that token occurs in that tweet.
# Every tweet is cleaned by `preprocessor` and split into tokens by `tokenizer`.
# max_features=2098 caps the vocabulary at the 2098 most frequent tokens in the corpus.
model = CountVectorizer(
    preprocessor=preprocessor,
    tokenizer=tokenizer,
    max_features=2098,
)
word_counts = model.fit_transform(corpus)
tokens = model.get_feature_names()

In [None]:
word_counts # this is a sparse matrix

In [None]:
word_counts.toarray() # converts it to dense matrix form (takes up a lot of space)

In [None]:
tokens

### Creating TF-IDF transformer
Read more here:  
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer  
and here:  
https://towardsdatascience.com/tf-idf-explained-and-python-sklearn-implementation-b020c5e83275

In [None]:
# Re-weight the raw token counts with TF-IDF; the fitted transformer is kept in
# `tfidf_transformer` and referenced again in the D&G section.
tfidf_transformer = TfidfTransformer()
#tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
X = tfidf_transformer.fit_transform(word_counts) # we use the TF-IDF counts as the feature matrix into our models
y = list(df['label']) # labels for supervised model


In [None]:
X

In [None]:
y

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    shuffle=True,
)

### Training the Machine Learning Model

In [None]:
# Spot Check Algorithms
# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression()),  # logistic regression
    ('NB', MultinomialNB()),       # naive Bayes model
]

# evaluate each model in turn using cross validation
results = []
names = []
# One seeded splitter shared by all candidates, so each is scored on the same folds.
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
print('model: cross-validation accuracy (cross-validation standard deviation)')
# The loop variable is `clf` rather than `model`, so the fitted CountVectorizer
# bound to `model` is not overwritten by a classifier.
for name, clf in models:
    cv_results = cross_val_score(clf, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
 

In [None]:
# alternatively, you can train on the train set and test on the validation set (see below for testing)
# Still defined because later cells iterate over this list; MultinomialNB itself
# takes no penalty argument.
penalities = ['l1', 'l2']

# Fit once. Looping over `penalities` only refit an identical model, because the
# penalty was never passed to the estimator.
NB = MultinomialNB()
NB.fit(X_train, y_train)

In [None]:
# Logistic regression with scikit-learn's default settings, fitted once on the
# training split. The former loop over `penalities` never passed the penalty to
# the estimator, so it just refit the same default model.
LR = LogisticRegression()
LR.fit(X_train, y_train)

### Model Metrics

In [None]:
y_pred = NB.predict(X_validation)

In [None]:
# evaluate predictions (naive Bayes, validation split; label 4 is treated as the positive class)
accuracy = accuracy_score(y_validation, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# precision tp / (tp + fp)
precision = precision_score(y_validation, y_pred, pos_label=4)
print('Precision: %f' % precision)

# recall: tp / (tp + fn)
recall = recall_score(y_validation, y_pred, pos_label=4)
print('Recall: %f' % recall)

# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_validation, y_pred, pos_label=4)
print('F1 score: %f' % f1)

# ROC AUC
# AUC measures how well the scores rank positives above negatives, so it is
# computed from the predicted probability of label 4, not from the hard predictions.
positive_column = list(NB.classes_).index(4)
auc = roc_auc_score(y_validation, NB.predict_proba(X_validation)[:, positive_column])
print('ROC AUC: %f' % auc)


In [None]:
y__pred = LR.predict(X_validation)

In [None]:
# evaluate predictions (logistic regression, validation split; label 4 is treated as the positive class)
accuracy = accuracy_score(y_validation, y__pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# precision tp / (tp + fp)
precision = precision_score(y_validation, y__pred, pos_label=4)
print('Precision: %f' % precision)

# recall: tp / (tp + fn)
recall = recall_score(y_validation, y__pred, pos_label=4)
print('Recall: %f' % recall)

# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_validation, y__pred, pos_label=4)
print('F1 score: %f' % f1)

# ROC AUC
# AUC measures how well the scores rank positives above negatives, so it is
# computed from the predicted probability of label 4, not from the hard predictions.
positive_column = list(LR.classes_).index(4)
auc = roc_auc_score(y_validation, LR.predict_proba(X_validation)[:, positive_column])
print('ROC AUC: %f' % auc)

In [None]:
# confusion matrix
matrix = confusion_matrix(y_validation, y_pred)
print(matrix)

In [None]:
# confusion matrix nicely formatted
# Draw onto an explicitly created Axes: plot_confusion_matrix opens its own
# figure when no `ax` is given, so a bare plt.figure() beforehand only left an
# extra empty figure in the output.
fig, ax = plt.subplots()
plot_confusion_matrix(NB, X_validation, y_validation, cmap='Blues', ax=ax)
ax.set_title('Naive Bayes confusion matrix (validation set)')
plt.show()

## Dolce & Gabbana Addition


In [None]:
# Import the Dolce & Gabbana data
dg = pd.read_csv(
    '/content/drive/My Drive/chopstickadd.csv',
    encoding="latin-1",
)
dg

# Clustering the D&G Dataset

In [None]:
"""
txt = lambda a: "   ".join(a) 

#aggragte text based on the sentiments 
dt_all = dg.groupby(by=['predictions']).agg({'text': txt}).reset_index()
dt_all

"""

In [None]:
collection = list(dg['text']) #the list of sample tweets
collection

In [None]:
# Build a document-term matrix for the D&G tweets: one row per tweet, one column
# per token, each cell holding how often that token occurs in that tweet.
# Every tweet is cleaned by `preprocessor` and split into tokens by `tokenizer`.
# max_features=2098 caps the vocabulary at the 2098 most frequent tokens in this collection.
model = CountVectorizer(
    preprocessor=preprocessor,
    tokenizer=tokenizer,
    max_features=2098,
)
word_counters = model.fit_transform(collection)
cloud = word_counters
tokenizers = model.get_feature_names()

In [None]:
#taking the tokenizer and formatting it into the array
word_counters.toarray()

In [None]:
tokenizers

## TF-IDF Vectorizer part for D&G

In [None]:
# Featurize the D&G tweets in the SAME feature space the classifiers were trained on.
# Refitting a vectorizer and the TF-IDF weights on the D&G text gives columns that
# mean different tokens than the ones LR/NB learned, so their predictions on it
# would be meaningless (or fail outright on a column-count mismatch).
# `tokens` is the training vocabulary in column order. With a fixed vocabulary
# fit_transform learns nothing and only counts those tokens.
# Cost: this tokenizes the D&G tweets a second time.
dg_vectorizer = CountVectorizer(preprocessor=preprocessor, tokenizer=tokenizer, vocabulary=list(tokens))
dg_counts = dg_vectorizer.fit_transform(collection)
# transform (not fit_transform) keeps the IDF weights learned on the training corpus.
A = tfidf_transformer.transform(dg_counts)


In [None]:
A

In [None]:
# Predict sentiment for the D&G tweets with both fitted classifiers.
# One pass is enough: looping over `penalities` repeated identical predictions.
what = LR.predict(A)
whatever = NB.predict(A)

## Word Cloud


In [None]:
# Look at Jackies code for word cloud
import seaborn as sns; sns.set()
import tweepy
from textblob import TextBlob
from wordcloud import WordCloud
plt.style.use('fivethirtyeight')

In [None]:
# An example word cloud built from the cleaned D&G vocabulary tokens.
# Each token appears once in the joined text, so this draws the vocabulary
# itself, not how often each word occurs in the tweets.
allWords = ' '.join(tokenizers)
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=110).generate(allWords)

plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()  # was `plt.show` without parentheses, which never called the function

In [None]:
# Add More Visualizations